From 08119c361d1181b3e8f1abb429236e488a664753 Mon Sep 17 00:00:00 2001 From: Hans-Christoph Steiner Date: Tue, 13 Aug 2013 15:42:54 -0400 Subject: Imported Upstream version 2.2.1 --- src/os_unix.c | 684 +++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 604 insertions(+), 80 deletions(-) (limited to 'src/os_unix.c') diff --git a/src/os_unix.c b/src/os_unix.c index c0df66e..abc23a4 100644 --- a/src/os_unix.c +++ b/src/os_unix.c @@ -46,6 +46,13 @@ #include "sqliteInt.h" #if SQLITE_OS_UNIX /* This file is used on unix only */ +/* Use posix_fallocate() if it is available +*/ +#if !defined(HAVE_POSIX_FALLOCATE) \ + && (_XOPEN_SOURCE >= 600 || _POSIX_C_SOURCE >= 200112L) +# define HAVE_POSIX_FALLOCATE 1 +#endif + /* ** There are various methods for file locking used for concurrency ** control: @@ -119,7 +126,7 @@ #include #include #include -#ifndef SQLITE_OMIT_WAL +#if !defined(SQLITE_OMIT_WAL) || SQLITE_MAX_MMAP_SIZE>0 #include #endif @@ -218,6 +225,15 @@ struct unixFile { const char *zPath; /* Name of the file */ unixShm *pShm; /* Shared memory segment information */ int szChunk; /* Configured by FCNTL_CHUNK_SIZE */ + int nFetchOut; /* Number of outstanding xFetch refs */ + sqlite3_int64 mmapSize; /* Usable size of mapping at pMapRegion */ + sqlite3_int64 mmapSizeActual; /* Actual size of mapping at pMapRegion */ + sqlite3_int64 mmapSizeMax; /* Configured FCNTL_MMAP_SIZE value */ + void *pMapRegion; /* Memory mapped region */ +#ifdef __QNXNTO__ + int sectorSize; /* Device sector size */ + int deviceCharacteristics; /* Precomputed device characteristics */ +#endif #if SQLITE_ENABLE_LOCKING_STYLE int openFlags; /* The flags specified at open() */ #endif @@ -238,7 +254,9 @@ struct unixFile { unsigned char transCntrChng; /* True if the transaction counter changed */ unsigned char dbUpdate; /* True if any part of database file changed */ unsigned char inNormalWrite; /* True if in a normal write operation */ + #endif + #ifdef SQLITE_TEST /* In test mode, increase the size of this structure a bit so that ** it is larger than the struct CrashFile defined in test6.c. @@ -262,6 +280,7 @@ struct unixFile { #define UNIXFILE_DELETE 0x20 /* Delete on close */ #define UNIXFILE_URI 0x40 /* Filename might have query parameters */ #define UNIXFILE_NOLOCK 0x80 /* Do no file locking */ +#define UNIXFILE_WARNED 0x0100 /* verifyDbFile() warnings have been issued */ /* ** Include code that is common to all os_*.c files @@ -295,6 +314,17 @@ struct unixFile { #define threadid 0 #endif +/* +** HAVE_MREMAP defaults to true on Linux and false everywhere else. +*/ +#if !defined(HAVE_MREMAP) +# if defined(__linux__) && defined(_GNU_SOURCE) +# define HAVE_MREMAP 1 +# else +# define HAVE_MREMAP 0 +# endif +#endif + /* ** Different Unix systems declare open() in different ways. Same use ** open(const char*,int,mode_t). Others use open(const char*,int,...). @@ -326,7 +356,7 @@ static int openDirectory(const char*, int*); ** to all overrideable system calls. */ static struct unix_syscall { - const char *zName; /* Name of the sytem call */ + const char *zName; /* Name of the system call */ sqlite3_syscall_ptr pCurrent; /* Current value of the system call */ sqlite3_syscall_ptr pDefault; /* Default value */ } aSyscall[] = { @@ -401,11 +431,7 @@ static struct unix_syscall { #define osPwrite64 ((ssize_t(*)(int,const void*,size_t,off_t))\ aSyscall[13].pCurrent) -#if SQLITE_ENABLE_LOCKING_STYLE { "fchmod", (sqlite3_syscall_ptr)fchmod, 0 }, -#else - { "fchmod", (sqlite3_syscall_ptr)0, 0 }, -#endif #define osFchmod ((int(*)(int,mode_t))aSyscall[14].pCurrent) #if defined(HAVE_POSIX_FALLOCATE) && HAVE_POSIX_FALLOCATE @@ -430,8 +456,18 @@ static struct unix_syscall { { "fchown", (sqlite3_syscall_ptr)posixFchown, 0 }, #define osFchown ((int(*)(int,uid_t,gid_t))aSyscall[20].pCurrent) - { "umask", (sqlite3_syscall_ptr)umask, 0 }, -#define osUmask ((mode_t(*)(mode_t))aSyscall[21].pCurrent) + { "mmap", (sqlite3_syscall_ptr)mmap, 0 }, +#define osMmap ((void*(*)(void*,size_t,int,int,int,off_t))aSyscall[21].pCurrent) + + { "munmap", (sqlite3_syscall_ptr)munmap, 0 }, +#define osMunmap ((void*(*)(void*,size_t))aSyscall[22].pCurrent) + +#if HAVE_MREMAP + { "mremap", (sqlite3_syscall_ptr)mremap, 0 }, +#else + { "mremap", (sqlite3_syscall_ptr)0, 0 }, +#endif +#define osMremap ((void*(*)(void*,size_t,size_t,int,...))aSyscall[23].pCurrent) }; /* End of the overrideable system calls */ @@ -537,14 +573,7 @@ static const char *unixNextSystemCall(sqlite3_vfs *p, const char *zName){ */ static int robust_open(const char *z, int f, mode_t m){ int fd; - mode_t m2; - mode_t origM = 0; - if( m==0 ){ - m2 = SQLITE_DEFAULT_FILE_PERMISSIONS; - }else{ - m2 = m; - origM = osUmask(0); - } + mode_t m2 = m ? m : SQLITE_DEFAULT_FILE_PERMISSIONS; do{ #if defined(O_CLOEXEC) fd = osOpen(z,f|O_CLOEXEC,m2); @@ -552,12 +581,20 @@ static int robust_open(const char *z, int f, mode_t m){ fd = osOpen(z,f,m2); #endif }while( fd<0 && errno==EINTR ); - if( m ){ - osUmask(origM); - } + if( fd>=0 ){ + if( m!=0 ){ + struct stat statbuf; + if( osFstat(fd, &statbuf)==0 + && statbuf.st_size==0 + && (statbuf.st_mode&0777)!=m + ){ + osFchmod(fd, m); + } + } #if defined(FD_CLOEXEC) && (!defined(O_CLOEXEC) || O_CLOEXEC==0) - if( fd>=0 ) osFcntl(fd, F_SETFD, osFcntl(fd, F_GETFD, 0) | FD_CLOEXEC); + osFcntl(fd, F_SETFD, osFcntl(fd, F_GETFD, 0) | FD_CLOEXEC); #endif + } return fd; } @@ -763,7 +800,6 @@ static int sqliteErrorFromPosixError(int posixError, int sqliteIOErr) { } - /****************************************************************************** ****************** Begin Unique File ID Utility Used By VxWorks *************** ** @@ -1099,7 +1135,6 @@ static int unixLogErrorAtLine( zErr = strerror(iErrno); #endif - assert( errcode!=SQLITE_OK ); if( zPath==0 ) zPath = ""; sqlite3_log(errcode, "os_unix.c:%d: (%d) %s(%s) - %s", @@ -1265,6 +1300,50 @@ static int findInodeInfo( } +/* +** Check a unixFile that is a database. Verify the following: +** +** (1) There is exactly one hard link on the file +** (2) The file is not a symbolic link +** (3) The file has not been renamed or unlinked +** +** Issue sqlite3_log(SQLITE_WARNING,...) messages if anything is not right. +*/ +static void verifyDbFile(unixFile *pFile){ + struct stat buf; + int rc; + if( pFile->ctrlFlags & UNIXFILE_WARNED ){ + /* One or more of the following warnings have already been issued. Do not + ** repeat them so as not to clutter the error log */ + return; + } + rc = osFstat(pFile->h, &buf); + if( rc!=0 ){ + sqlite3_log(SQLITE_WARNING, "cannot fstat db file %s", pFile->zPath); + pFile->ctrlFlags |= UNIXFILE_WARNED; + return; + } + if( buf.st_nlink==0 && (pFile->ctrlFlags & UNIXFILE_DELETE)==0 ){ + sqlite3_log(SQLITE_WARNING, "file unlinked while open: %s", pFile->zPath); + pFile->ctrlFlags |= UNIXFILE_WARNED; + return; + } + if( buf.st_nlink>1 ){ + sqlite3_log(SQLITE_WARNING, "multiple links to file: %s", pFile->zPath); + pFile->ctrlFlags |= UNIXFILE_WARNED; + return; + } + if( pFile->pInode!=0 + && ((rc = osStat(pFile->zPath, &buf))!=0 + || buf.st_ino!=pFile->pInode->fileId.ino) + ){ + sqlite3_log(SQLITE_WARNING, "file renamed while open: %s", pFile->zPath); + pFile->ctrlFlags |= UNIXFILE_WARNED; + return; + } +} + + /* ** This routine checks if there is a RESERVED lock held on the specified ** file by this or any other process. If such a lock is held, set *pResOut @@ -1795,9 +1874,13 @@ end_unlock: ** the requested locking level, this routine is a no-op. */ static int unixUnlock(sqlite3_file *id, int eFileLock){ + assert( eFileLock==SHARED_LOCK || ((unixFile *)id)->nFetchOut==0 ); return posixUnlock(id, eFileLock, 0); } +static int unixMapfile(unixFile *pFd, i64 nByte); +static void unixUnmapfile(unixFile *pFd); + /* ** This function performs the parts of the "close file" operation ** common to all locking schemes. It closes the directory and file @@ -1810,6 +1893,7 @@ static int unixUnlock(sqlite3_file *id, int eFileLock){ */ static int closeUnixFile(sqlite3_file *id){ unixFile *pFile = (unixFile*)id; + unixUnmapfile(pFile); if( pFile->h>=0 ){ robust_close(pFile, pFile->h, __LINE__); pFile->h = -1; @@ -1836,6 +1920,7 @@ static int closeUnixFile(sqlite3_file *id){ static int unixClose(sqlite3_file *id){ int rc = SQLITE_OK; unixFile *pFile = (unixFile *)id; + verifyDbFile(pFile); unixUnlock(id, NO_LOCK); unixEnterMutex(); @@ -1904,7 +1989,7 @@ static int nolockClose(sqlite3_file *id) { /****************************************************************************** ************************* Begin dot-file Locking ****************************** ** -** The dotfile locking implementation uses the existance of separate lock +** The dotfile locking implementation uses the existence of separate lock ** files (really a directory) to control access to the database. This works ** on just about every filesystem imaginable. But there are serious downsides: ** @@ -1919,7 +2004,7 @@ static int nolockClose(sqlite3_file *id) { ** ** Dotfile locking works by creating a subdirectory in the same directory as ** the database and with the same name but with a ".lock" extension added. -** The existance of a lock directory implies an EXCLUSIVE lock. All other +** The existence of a lock directory implies an EXCLUSIVE lock. All other ** lock types (SHARED, RESERVED, PENDING) are mapped into EXCLUSIVE. */ @@ -2086,13 +2171,13 @@ static int dotlockUnlock(sqlite3_file *id, int eFileLock) { ** Close a file. Make sure the lock has been released before closing. */ static int dotlockClose(sqlite3_file *id) { - int rc; + int rc = SQLITE_OK; if( id ){ unixFile *pFile = (unixFile*)id; dotlockUnlock(id, NO_LOCK); sqlite3_free(pFile->lockingContext); + rc = closeUnixFile(id); } - rc = closeUnixFile(id); return rc; } /****************** End of the dot-file lock implementation ******************* @@ -2296,10 +2381,12 @@ static int flockUnlock(sqlite3_file *id, int eFileLock) { ** Close a file. */ static int flockClose(sqlite3_file *id) { + int rc = SQLITE_OK; if( id ){ flockUnlock(id, NO_LOCK); + rc = closeUnixFile(id); } - return closeUnixFile(id); + return rc; } #endif /* SQLITE_ENABLE_LOCKING_STYLE && !OS_VXWORK */ @@ -3010,6 +3097,8 @@ static int seekAndRead(unixFile *id, sqlite3_int64 offset, void *pBuf, int cnt){ i64 newOffset; #endif TIMER_START; + assert( cnt==(cnt&0x1ffff) ); + cnt &= 0x1ffff; do{ #if defined(USE_PREAD) got = osPread(id->h, pBuf, cnt, offset); @@ -3063,6 +3152,8 @@ static int unixRead( unixFile *pFile = (unixFile *)id; int got; assert( id ); + assert( offset>=0 ); + assert( amt>0 ); /* If this is a database file (not a journal, master-journal or temp ** file), the bytes in the locking range should never be read or written. */ @@ -3073,6 +3164,23 @@ static int unixRead( ); #endif +#if SQLITE_MAX_MMAP_SIZE>0 + /* Deal with as much of this read request as possible by transfering + ** data from the memory mapping using memcpy(). */ + if( offsetmmapSize ){ + if( offset+amt <= pFile->mmapSize ){ + memcpy(pBuf, &((u8 *)(pFile->pMapRegion))[offset], amt); + return SQLITE_OK; + }else{ + int nCopy = pFile->mmapSize - offset; + memcpy(pBuf, &((u8 *)(pFile->pMapRegion))[offset], nCopy); + pBuf = &((u8 *)pBuf)[nCopy]; + amt -= nCopy; + offset += nCopy; + } + } +#endif + got = seekAndRead(pFile, offset, pBuf, amt); if( got==amt ){ return SQLITE_OK; @@ -3088,44 +3196,59 @@ static int unixRead( } /* -** Seek to the offset in id->offset then read cnt bytes into pBuf. -** Return the number of bytes actually read. Update the offset. -** -** To avoid stomping the errno value on a failed write the lastErrno value -** is set before returning. +** Attempt to seek the file-descriptor passed as the first argument to +** absolute offset iOff, then attempt to write nBuf bytes of data from +** pBuf to it. If an error occurs, return -1 and set *piErrno. Otherwise, +** return the actual number of bytes written (which may be less than +** nBuf). */ -static int seekAndWrite(unixFile *id, i64 offset, const void *pBuf, int cnt){ - int got; -#if (!defined(USE_PREAD) && !defined(USE_PREAD64)) - i64 newOffset; -#endif +static int seekAndWriteFd( + int fd, /* File descriptor to write to */ + i64 iOff, /* File offset to begin writing at */ + const void *pBuf, /* Copy data from this buffer to the file */ + int nBuf, /* Size of buffer pBuf in bytes */ + int *piErrno /* OUT: Error number if error occurs */ +){ + int rc = 0; /* Value returned by system call */ + + assert( nBuf==(nBuf&0x1ffff) ); + nBuf &= 0x1ffff; TIMER_START; + #if defined(USE_PREAD) - do{ got = osPwrite(id->h, pBuf, cnt, offset); }while( got<0 && errno==EINTR ); + do{ rc = osPwrite(fd, pBuf, nBuf, iOff); }while( rc<0 && errno==EINTR ); #elif defined(USE_PREAD64) - do{ got = osPwrite64(id->h, pBuf, cnt, offset);}while( got<0 && errno==EINTR); + do{ rc = osPwrite64(fd, pBuf, nBuf, iOff);}while( rc<0 && errno==EINTR); #else do{ - newOffset = lseek(id->h, offset, SEEK_SET); - SimulateIOError( newOffset-- ); - if( newOffset!=offset ){ - if( newOffset == -1 ){ - ((unixFile*)id)->lastErrno = errno; - }else{ - ((unixFile*)id)->lastErrno = 0; - } + i64 iSeek = lseek(fd, iOff, SEEK_SET); + SimulateIOError( iSeek-- ); + + if( iSeek!=iOff ){ + if( piErrno ) *piErrno = (iSeek==-1 ? errno : 0); return -1; } - got = osWrite(id->h, pBuf, cnt); - }while( got<0 && errno==EINTR ); + rc = osWrite(fd, pBuf, nBuf); + }while( rc<0 && errno==EINTR ); #endif + TIMER_END; - if( got<0 ){ - ((unixFile*)id)->lastErrno = errno; - } + OSTRACE(("WRITE %-3d %5d %7lld %llu\n", fd, rc, iOff, TIMER_ELAPSED)); - OSTRACE(("WRITE %-3d %5d %7lld %llu\n", id->h, got, offset, TIMER_ELAPSED)); - return got; + if( rc<0 && piErrno ) *piErrno = errno; + return rc; +} + + +/* +** Seek to the offset in id->offset then read cnt bytes into pBuf. +** Return the number of bytes actually read. Update the offset. +** +** To avoid stomping the errno value on a failed write the lastErrno value +** is set before returning. +*/ +static int seekAndWrite(unixFile *id, i64 offset, const void *pBuf, int cnt){ + return seekAndWriteFd(id->h, offset, pBuf, cnt, &id->lastErrno); } @@ -3175,6 +3298,23 @@ static int unixWrite( } #endif +#if SQLITE_MAX_MMAP_SIZE>0 + /* Deal with as much of this write request as possible by transfering + ** data from the memory mapping using memcpy(). */ + if( offsetmmapSize ){ + if( offset+amt <= pFile->mmapSize ){ + memcpy(&((u8 *)(pFile->pMapRegion))[offset], pBuf, amt); + return SQLITE_OK; + }else{ + int nCopy = pFile->mmapSize - offset; + memcpy(&((u8 *)(pFile->pMapRegion))[offset], pBuf, nCopy); + pBuf = &((u8 *)pBuf)[nCopy]; + amt -= nCopy; + offset += nCopy; + } + } +#endif + while( amt>0 && (wrote = seekAndWrite(pFile, offset, pBuf, amt))>0 ){ amt -= wrote; offset += wrote; @@ -3402,7 +3542,7 @@ static int unixSync(sqlite3_file *id, int flags){ } /* Also fsync the directory containing the file if the DIRSYNC flag - ** is set. This is a one-time occurrance. Many systems (examples: AIX) + ** is set. This is a one-time occurrence. Many systems (examples: AIX) ** are unable to fsync a directory, so ignore errors on the fsync. */ if( pFile->ctrlFlags & UNIXFILE_DIRSYNC ){ @@ -3457,6 +3597,14 @@ static int unixTruncate(sqlite3_file *id, i64 nByte){ } #endif + /* If the file was just truncated to a size smaller than the currently + ** mapped region, reduce the effective mapping size as well. SQLite will + ** use read() and write() to access data beyond this point from now on. + */ + if( nBytemmapSize ){ + pFile->mmapSize = nByte; + } + return SQLITE_OK; } } @@ -3545,6 +3693,19 @@ static int fcntlSizeHint(unixFile *pFile, i64 nByte){ } } + if( pFile->mmapSizeMax>0 && nByte>pFile->mmapSize ){ + int rc; + if( pFile->szChunk<=0 ){ + if( robust_ftruncate(pFile->h, nByte) ){ + pFile->lastErrno = errno; + return unixLogError(SQLITE_IOERR_TRUNCATE, "ftruncate", pFile->zPath); + } + } + + rc = unixMapfile(pFile, nByte); + return rc; + } + return SQLITE_OK; } @@ -3564,6 +3725,9 @@ static void unixModeBit(unixFile *pFile, unsigned char mask, int *pArg){ } } +/* Forward declaration */ +static int unixGetTempname(int nBuf, char *zBuf); + /* ** Information and control of an open file handle. */ @@ -3601,6 +3765,26 @@ static int unixFileControl(sqlite3_file *id, int op, void *pArg){ *(char**)pArg = sqlite3_mprintf("%s", pFile->pVfs->zName); return SQLITE_OK; } + case SQLITE_FCNTL_TEMPFILENAME: { + char *zTFile = sqlite3_malloc( pFile->pVfs->mxPathname ); + if( zTFile ){ + unixGetTempname(pFile->pVfs->mxPathname, zTFile); + *(char**)pArg = zTFile; + } + return SQLITE_OK; + } + case SQLITE_FCNTL_MMAP_SIZE: { + i64 newLimit = *(i64*)pArg; + if( newLimit>sqlite3GlobalConfig.mxMmap ){ + newLimit = sqlite3GlobalConfig.mxMmap; + } + *(i64*)pArg = pFile->mmapSizeMax; + if( newLimit>=0 ){ + pFile->mmapSizeMax = newLimit; + if( newLimitmmapSize ) pFile->mmapSize = newLimit; + } + return SQLITE_OK; + } #ifdef SQLITE_DEBUG /* The pager calls this method to signal that it has done ** a rollback and that the database is therefore unchanged and @@ -3632,10 +3816,92 @@ static int unixFileControl(sqlite3_file *id, int op, void *pArg){ ** a database and its journal file) that the sector size will be the ** same for both. */ -static int unixSectorSize(sqlite3_file *pFile){ - (void)pFile; +#ifndef __QNXNTO__ +static int unixSectorSize(sqlite3_file *NotUsed){ + UNUSED_PARAMETER(NotUsed); return SQLITE_DEFAULT_SECTOR_SIZE; } +#endif + +/* +** The following version of unixSectorSize() is optimized for QNX. +*/ +#ifdef __QNXNTO__ +#include +#include +static int unixSectorSize(sqlite3_file *id){ + unixFile *pFile = (unixFile*)id; + if( pFile->sectorSize == 0 ){ + struct statvfs fsInfo; + + /* Set defaults for non-supported filesystems */ + pFile->sectorSize = SQLITE_DEFAULT_SECTOR_SIZE; + pFile->deviceCharacteristics = 0; + if( fstatvfs(pFile->h, &fsInfo) == -1 ) { + return pFile->sectorSize; + } + + if( !strcmp(fsInfo.f_basetype, "tmp") ) { + pFile->sectorSize = fsInfo.f_bsize; + pFile->deviceCharacteristics = + SQLITE_IOCAP_ATOMIC4K | /* All ram filesystem writes are atomic */ + SQLITE_IOCAP_SAFE_APPEND | /* growing the file does not occur until + ** the write succeeds */ + SQLITE_IOCAP_SEQUENTIAL | /* The ram filesystem has no write behind + ** so it is ordered */ + 0; + }else if( strstr(fsInfo.f_basetype, "etfs") ){ + pFile->sectorSize = fsInfo.f_bsize; + pFile->deviceCharacteristics = + /* etfs cluster size writes are atomic */ + (pFile->sectorSize / 512 * SQLITE_IOCAP_ATOMIC512) | + SQLITE_IOCAP_SAFE_APPEND | /* growing the file does not occur until + ** the write succeeds */ + SQLITE_IOCAP_SEQUENTIAL | /* The ram filesystem has no write behind + ** so it is ordered */ + 0; + }else if( !strcmp(fsInfo.f_basetype, "qnx6") ){ + pFile->sectorSize = fsInfo.f_bsize; + pFile->deviceCharacteristics = + SQLITE_IOCAP_ATOMIC | /* All filesystem writes are atomic */ + SQLITE_IOCAP_SAFE_APPEND | /* growing the file does not occur until + ** the write succeeds */ + SQLITE_IOCAP_SEQUENTIAL | /* The ram filesystem has no write behind + ** so it is ordered */ + 0; + }else if( !strcmp(fsInfo.f_basetype, "qnx4") ){ + pFile->sectorSize = fsInfo.f_bsize; + pFile->deviceCharacteristics = + /* full bitset of atomics from max sector size and smaller */ + ((pFile->sectorSize / 512 * SQLITE_IOCAP_ATOMIC512) << 1) - 2 | + SQLITE_IOCAP_SEQUENTIAL | /* The ram filesystem has no write behind + ** so it is ordered */ + 0; + }else if( strstr(fsInfo.f_basetype, "dos") ){ + pFile->sectorSize = fsInfo.f_bsize; + pFile->deviceCharacteristics = + /* full bitset of atomics from max sector size and smaller */ + ((pFile->sectorSize / 512 * SQLITE_IOCAP_ATOMIC512) << 1) - 2 | + SQLITE_IOCAP_SEQUENTIAL | /* The ram filesystem has no write behind + ** so it is ordered */ + 0; + }else{ + pFile->deviceCharacteristics = + SQLITE_IOCAP_ATOMIC512 | /* blocks are atomic */ + SQLITE_IOCAP_SAFE_APPEND | /* growing the file does not occur until + ** the write succeeds */ + 0; + } + } + /* Last chance verification. If the sector size isn't a multiple of 512 + ** then it isn't valid.*/ + if( pFile->sectorSize % 512 != 0 ){ + pFile->deviceCharacteristics = 0; + pFile->sectorSize = SQLITE_DEFAULT_SECTOR_SIZE; + } + return pFile->sectorSize; +} +#endif /* __QNXNTO__ */ /* ** Return the device characteristics for the file. @@ -3652,11 +3918,15 @@ static int unixSectorSize(sqlite3_file *pFile){ */ static int unixDeviceCharacteristics(sqlite3_file *id){ unixFile *p = (unixFile*)id; + int rc = 0; +#ifdef __QNXNTO__ + if( p->sectorSize==0 ) unixSectorSize(id); + rc = p->deviceCharacteristics; +#endif if( p->ctrlFlags & UNIXFILE_PSOW ){ - return SQLITE_IOCAP_POWERSAFE_OVERWRITE; - }else{ - return 0; + rc |= SQLITE_IOCAP_POWERSAFE_OVERWRITE; } + return rc; } #ifndef SQLITE_OMIT_WAL @@ -3827,7 +4097,7 @@ static void unixShmPurge(unixFile *pFd){ sqlite3_mutex_free(p->mutex); for(i=0; inRegion; i++){ if( p->h>=0 ){ - munmap(p->apRegion[i], p->szRegion); + osMunmap(p->apRegion[i], p->szRegion); }else{ sqlite3_free(p->apRegion[i]); } @@ -4067,16 +4337,32 @@ static int unixShmMap( if( sStat.st_sizeh, nByte) ){ - rc = unixLogError(SQLITE_IOERR_SHMSIZE, "ftruncate", - pShmNode->zFilename); + if( !bExtend ){ goto shmpage_out; } + + /* Alternatively, if bExtend is true, extend the file. Do this by + ** writing a single byte to the end of each (OS) page being + ** allocated or extended. Technically, we need only write to the + ** last page in order to extend the file. But writing to all new + ** pages forces the OS to allocate them immediately, which reduces + ** the chances of SIGBUS while accessing the mapped region later on. + */ + else{ + static const int pgsz = 4096; + int iPg; + + /* Write to the last byte of each newly allocated or extended page */ + assert( (nByte % pgsz)==0 ); + for(iPg=(sStat.st_size/pgsz); iPg<(nByte/pgsz); iPg++){ + if( seekAndWriteFd(pShmNode->h, iPg*pgsz + pgsz-1, "", 1, 0)!=1 ){ + const char *zFile = pShmNode->zFilename; + rc = unixLogError(SQLITE_IOERR_SHMSIZE, "write", zFile); + goto shmpage_out; + } + } + } } } @@ -4092,9 +4378,9 @@ static int unixShmMap( while(pShmNode->nRegion<=iRegion){ void *pMem; if( pShmNode->h>=0 ){ - pMem = mmap(0, szRegion, + pMem = osMmap(0, szRegion, pShmNode->isReadonly ? PROT_READ : PROT_READ|PROT_WRITE, - MAP_SHARED, pShmNode->h, pShmNode->nRegion*szRegion + MAP_SHARED, pShmNode->h, szRegion*(i64)pShmNode->nRegion ); if( pMem==MAP_FAILED ){ rc = unixLogError(SQLITE_IOERR_SHMMAP, "mmap", pShmNode->zFilename); @@ -4309,6 +4595,236 @@ static int unixShmUnmap( # define unixShmUnmap 0 #endif /* #ifndef SQLITE_OMIT_WAL */ +/* +** If it is currently memory mapped, unmap file pFd. +*/ +static void unixUnmapfile(unixFile *pFd){ + assert( pFd->nFetchOut==0 ); +#if SQLITE_MAX_MMAP_SIZE>0 + if( pFd->pMapRegion ){ + osMunmap(pFd->pMapRegion, pFd->mmapSizeActual); + pFd->pMapRegion = 0; + pFd->mmapSize = 0; + pFd->mmapSizeActual = 0; + } +#endif +} + +#if SQLITE_MAX_MMAP_SIZE>0 +/* +** Return the system page size. +*/ +static int unixGetPagesize(void){ +#if HAVE_MREMAP + return 512; +#elif defined(_BSD_SOURCE) + return getpagesize(); +#else + return (int)sysconf(_SC_PAGESIZE); +#endif +} +#endif /* SQLITE_MAX_MMAP_SIZE>0 */ + +#if SQLITE_MAX_MMAP_SIZE>0 +/* +** Attempt to set the size of the memory mapping maintained by file +** descriptor pFd to nNew bytes. Any existing mapping is discarded. +** +** If successful, this function sets the following variables: +** +** unixFile.pMapRegion +** unixFile.mmapSize +** unixFile.mmapSizeActual +** +** If unsuccessful, an error message is logged via sqlite3_log() and +** the three variables above are zeroed. In this case SQLite should +** continue accessing the database using the xRead() and xWrite() +** methods. +*/ +static void unixRemapfile( + unixFile *pFd, /* File descriptor object */ + i64 nNew /* Required mapping size */ +){ + const char *zErr = "mmap"; + int h = pFd->h; /* File descriptor open on db file */ + u8 *pOrig = (u8 *)pFd->pMapRegion; /* Pointer to current file mapping */ + i64 nOrig = pFd->mmapSizeActual; /* Size of pOrig region in bytes */ + u8 *pNew = 0; /* Location of new mapping */ + int flags = PROT_READ; /* Flags to pass to mmap() */ + + assert( pFd->nFetchOut==0 ); + assert( nNew>pFd->mmapSize ); + assert( nNew<=pFd->mmapSizeMax ); + assert( nNew>0 ); + assert( pFd->mmapSizeActual>=pFd->mmapSize ); + assert( MAP_FAILED!=0 ); + + if( (pFd->ctrlFlags & UNIXFILE_RDONLY)==0 ) flags |= PROT_WRITE; + + if( pOrig ){ + const int szSyspage = unixGetPagesize(); + i64 nReuse = (pFd->mmapSize & ~(szSyspage-1)); + u8 *pReq = &pOrig[nReuse]; + + /* Unmap any pages of the existing mapping that cannot be reused. */ + if( nReuse!=nOrig ){ + osMunmap(pReq, nOrig-nReuse); + } + +#if HAVE_MREMAP + pNew = osMremap(pOrig, nReuse, nNew, MREMAP_MAYMOVE); + zErr = "mremap"; +#else + pNew = osMmap(pReq, nNew-nReuse, flags, MAP_SHARED, h, nReuse); + if( pNew!=MAP_FAILED ){ + if( pNew!=pReq ){ + osMunmap(pNew, nNew - nReuse); + pNew = 0; + }else{ + pNew = pOrig; + } + } +#endif + + /* The attempt to extend the existing mapping failed. Free it. */ + if( pNew==MAP_FAILED || pNew==0 ){ + osMunmap(pOrig, nReuse); + } + } + + /* If pNew is still NULL, try to create an entirely new mapping. */ + if( pNew==0 ){ + pNew = osMmap(0, nNew, flags, MAP_SHARED, h, 0); + } + + if( pNew==MAP_FAILED ){ + pNew = 0; + nNew = 0; + unixLogError(SQLITE_OK, zErr, pFd->zPath); + + /* If the mmap() above failed, assume that all subsequent mmap() calls + ** will probably fail too. Fall back to using xRead/xWrite exclusively + ** in this case. */ + pFd->mmapSizeMax = 0; + } + pFd->pMapRegion = (void *)pNew; + pFd->mmapSize = pFd->mmapSizeActual = nNew; +} +#endif + +/* +** Memory map or remap the file opened by file-descriptor pFd (if the file +** is already mapped, the existing mapping is replaced by the new). Or, if +** there already exists a mapping for this file, and there are still +** outstanding xFetch() references to it, this function is a no-op. +** +** If parameter nByte is non-negative, then it is the requested size of +** the mapping to create. Otherwise, if nByte is less than zero, then the +** requested size is the size of the file on disk. The actual size of the +** created mapping is either the requested size or the value configured +** using SQLITE_FCNTL_MMAP_LIMIT, whichever is smaller. +** +** SQLITE_OK is returned if no error occurs (even if the mapping is not +** recreated as a result of outstanding references) or an SQLite error +** code otherwise. +*/ +static int unixMapfile(unixFile *pFd, i64 nByte){ +#if SQLITE_MAX_MMAP_SIZE>0 + i64 nMap = nByte; + int rc; + + assert( nMap>=0 || pFd->nFetchOut==0 ); + if( pFd->nFetchOut>0 ) return SQLITE_OK; + + if( nMap<0 ){ + struct stat statbuf; /* Low-level file information */ + rc = osFstat(pFd->h, &statbuf); + if( rc!=SQLITE_OK ){ + return SQLITE_IOERR_FSTAT; + } + nMap = statbuf.st_size; + } + if( nMap>pFd->mmapSizeMax ){ + nMap = pFd->mmapSizeMax; + } + + if( nMap!=pFd->mmapSize ){ + if( nMap>0 ){ + unixRemapfile(pFd, nMap); + }else{ + unixUnmapfile(pFd); + } + } +#endif + + return SQLITE_OK; +} + +/* +** If possible, return a pointer to a mapping of file fd starting at offset +** iOff. The mapping must be valid for at least nAmt bytes. +** +** If such a pointer can be obtained, store it in *pp and return SQLITE_OK. +** Or, if one cannot but no error occurs, set *pp to 0 and return SQLITE_OK. +** Finally, if an error does occur, return an SQLite error code. The final +** value of *pp is undefined in this case. +** +** If this function does return a pointer, the caller must eventually +** release the reference by calling unixUnfetch(). +*/ +static int unixFetch(sqlite3_file *fd, i64 iOff, int nAmt, void **pp){ +#if SQLITE_MAX_MMAP_SIZE>0 + unixFile *pFd = (unixFile *)fd; /* The underlying database file */ +#endif + *pp = 0; + +#if SQLITE_MAX_MMAP_SIZE>0 + if( pFd->mmapSizeMax>0 ){ + if( pFd->pMapRegion==0 ){ + int rc = unixMapfile(pFd, -1); + if( rc!=SQLITE_OK ) return rc; + } + if( pFd->mmapSize >= iOff+nAmt ){ + *pp = &((u8 *)pFd->pMapRegion)[iOff]; + pFd->nFetchOut++; + } + } +#endif + return SQLITE_OK; +} + +/* +** If the third argument is non-NULL, then this function releases a +** reference obtained by an earlier call to unixFetch(). The second +** argument passed to this function must be the same as the corresponding +** argument that was passed to the unixFetch() invocation. +** +** Or, if the third argument is NULL, then this function is being called +** to inform the VFS layer that, according to POSIX, any existing mapping +** may now be invalid and should be unmapped. +*/ +static int unixUnfetch(sqlite3_file *fd, i64 iOff, void *p){ + unixFile *pFd = (unixFile *)fd; /* The underlying database file */ + UNUSED_PARAMETER(iOff); + + /* If p==0 (unmap the entire file) then there must be no outstanding + ** xFetch references. Or, if p!=0 (meaning it is an xFetch reference), + ** then there must be at least one outstanding. */ + assert( (p==0)==(pFd->nFetchOut==0) ); + + /* If p!=0, it must match the iOff value. */ + assert( p==0 || p==&((u8 *)pFd->pMapRegion)[iOff] ); + + if( p ){ + pFd->nFetchOut--; + }else{ + unixUnmapfile(pFd); + } + + assert( pFd->nFetchOut>=0 ); + return SQLITE_OK; +} + /* ** Here ends the implementation of all sqlite3_file methods. ** @@ -4367,7 +4883,9 @@ static const sqlite3_io_methods METHOD = { \ unixShmMap, /* xShmMap */ \ unixShmLock, /* xShmLock */ \ unixShmBarrier, /* xShmBarrier */ \ - unixShmUnmap /* xShmUnmap */ \ + unixShmUnmap, /* xShmUnmap */ \ + unixFetch, /* xFetch */ \ + unixUnfetch, /* xUnfetch */ \ }; \ static const sqlite3_io_methods *FINDER##Impl(const char *z, unixFile *p){ \ UNUSED_PARAMETER(z); UNUSED_PARAMETER(p); \ @@ -4384,7 +4902,7 @@ static const sqlite3_io_methods *(*const FINDER)(const char*,unixFile *p) \ IOMETHODS( posixIoFinder, /* Finder function name */ posixIoMethods, /* sqlite3_io_methods object name */ - 2, /* shared memory is enabled */ + 3, /* shared memory and mmap are enabled */ unixClose, /* xClose method */ unixLock, /* xLock method */ unixUnlock, /* xUnlock method */ @@ -4635,11 +5153,12 @@ static int fillInUnixFile( pNew->pVfs = pVfs; pNew->zPath = zFilename; pNew->ctrlFlags = (u8)ctrlFlags; + pNew->mmapSizeMax = sqlite3GlobalConfig.szMmap; if( sqlite3_uri_boolean(((ctrlFlags & UNIXFILE_URI) ? zFilename : 0), "psow", SQLITE_POWERSAFE_OVERWRITE) ){ pNew->ctrlFlags |= UNIXFILE_PSOW; } - if( memcmp(pVfs->zName,"unix-excl",10)==0 ){ + if( strcmp(pVfs->zName,"unix-excl")==0 ){ pNew->ctrlFlags |= UNIXFILE_EXCL; } @@ -4671,7 +5190,7 @@ static int fillInUnixFile( unixEnterMutex(); rc = findInodeInfo(pNew, &pNew->pInode); if( rc!=SQLITE_OK ){ - /* If an error occured in findInodeInfo(), close the file descriptor + /* If an error occurred in findInodeInfo(), close the file descriptor ** immediately, before releasing the mutex. findInodeInfo() may fail ** in two scenarios: ** @@ -4770,15 +5289,15 @@ static int fillInUnixFile( if( h>=0 ) robust_close(pNew, h, __LINE__); h = -1; osUnlink(zFilename); - isDelete = 0; + pNew->ctrlFlags |= UNIXFILE_DELETE; } - if( isDelete ) pNew->ctrlFlags |= UNIXFILE_DELETE; #endif if( rc!=SQLITE_OK ){ if( h>=0 ) robust_close(pNew, h, __LINE__); }else{ pNew->pMethod = pLockingStyle; OpenCounter(+1); + verifyDbFile(pNew); } return rc; } @@ -5278,8 +5797,13 @@ static int unixDelete( int rc = SQLITE_OK; UNUSED_PARAMETER(NotUsed); SimulateIOError(return SQLITE_IOERR_DELETE); - if( osUnlink(zPath)==(-1) && errno!=ENOENT ){ - return unixLogError(SQLITE_IOERR_DELETE, "unlink", zPath); + if( osUnlink(zPath)==(-1) ){ + if( errno==ENOENT ){ + rc = SQLITE_IOERR_DELETE_NOENT; + }else{ + rc = unixLogError(SQLITE_IOERR_DELETE, "unlink", zPath); + } + return rc; } #ifndef SQLITE_DISABLE_DIRSYNC if( (dirSync & 1)!=0 ){ @@ -5304,7 +5828,7 @@ static int unixDelete( } /* -** Test the existance of or access permissions of file zPath. The +** Test the existence of or access permissions of file zPath. The ** test performed depends on the value of flags: ** ** SQLITE_ACCESS_EXISTS: Return 1 if the file exists @@ -6867,7 +7391,7 @@ int sqlite3_os_init(void){ /* Double-check that the aSyscall[] array has been constructed ** correctly. See ticket [bb3a86e890c8e96ab] */ - assert( ArraySize(aSyscall)==22 ); + assert( ArraySize(aSyscall)==24 ); /* Register all VFSes defined in the aVfs[] array */ for(i=0; i<(sizeof(aVfs)/sizeof(sqlite3_vfs)); i++){ -- cgit v1.2.3