summaryrefslogtreecommitdiff
path: root/src/os_unix.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/os_unix.c')
-rw-r--r--src/os_unix.c684
1 files changed, 604 insertions, 80 deletions
diff --git a/src/os_unix.c b/src/os_unix.c
index c0df66e..abc23a4 100644
--- a/src/os_unix.c
+++ b/src/os_unix.c
@@ -46,6 +46,13 @@
#include "sqliteInt.h"
#if SQLITE_OS_UNIX /* This file is used on unix only */
+/* Use posix_fallocate() if it is available
+*/
+#if !defined(HAVE_POSIX_FALLOCATE) \
+ && (_XOPEN_SOURCE >= 600 || _POSIX_C_SOURCE >= 200112L)
+# define HAVE_POSIX_FALLOCATE 1
+#endif
+
/*
** There are various methods for file locking used for concurrency
** control:
@@ -119,7 +126,7 @@
#include <time.h>
#include <sys/time.h>
#include <errno.h>
-#ifndef SQLITE_OMIT_WAL
+#if !defined(SQLITE_OMIT_WAL) || SQLITE_MAX_MMAP_SIZE>0
#include <sys/mman.h>
#endif
@@ -218,6 +225,15 @@ struct unixFile {
const char *zPath; /* Name of the file */
unixShm *pShm; /* Shared memory segment information */
int szChunk; /* Configured by FCNTL_CHUNK_SIZE */
+ int nFetchOut; /* Number of outstanding xFetch refs */
+ sqlite3_int64 mmapSize; /* Usable size of mapping at pMapRegion */
+ sqlite3_int64 mmapSizeActual; /* Actual size of mapping at pMapRegion */
+ sqlite3_int64 mmapSizeMax; /* Configured FCNTL_MMAP_SIZE value */
+ void *pMapRegion; /* Memory mapped region */
+#ifdef __QNXNTO__
+ int sectorSize; /* Device sector size */
+ int deviceCharacteristics; /* Precomputed device characteristics */
+#endif
#if SQLITE_ENABLE_LOCKING_STYLE
int openFlags; /* The flags specified at open() */
#endif
@@ -238,7 +254,9 @@ struct unixFile {
unsigned char transCntrChng; /* True if the transaction counter changed */
unsigned char dbUpdate; /* True if any part of database file changed */
unsigned char inNormalWrite; /* True if in a normal write operation */
+
#endif
+
#ifdef SQLITE_TEST
/* In test mode, increase the size of this structure a bit so that
** it is larger than the struct CrashFile defined in test6.c.
@@ -262,6 +280,7 @@ struct unixFile {
#define UNIXFILE_DELETE 0x20 /* Delete on close */
#define UNIXFILE_URI 0x40 /* Filename might have query parameters */
#define UNIXFILE_NOLOCK 0x80 /* Do no file locking */
+#define UNIXFILE_WARNED 0x0100 /* verifyDbFile() warnings have been issued */
/*
** Include code that is common to all os_*.c files
@@ -296,6 +315,17 @@ struct unixFile {
#endif
/*
+** HAVE_MREMAP defaults to true on Linux with _GNU_SOURCE, false otherwise.
+*/
+#if !defined(HAVE_MREMAP)
+# if defined(__linux__) && defined(_GNU_SOURCE)
+# define HAVE_MREMAP 1
+# else
+# define HAVE_MREMAP 0
+# endif
+#endif
+
+/*
** Different Unix systems declare open() in different ways. Some use
** open(const char*,int,mode_t). Others use open(const char*,int,...).
** The difference is important when using a pointer to the function.
@@ -326,7 +356,7 @@ static int openDirectory(const char*, int*);
** to all overrideable system calls.
*/
static struct unix_syscall {
- const char *zName; /* Name of the sytem call */
+ const char *zName; /* Name of the system call */
sqlite3_syscall_ptr pCurrent; /* Current value of the system call */
sqlite3_syscall_ptr pDefault; /* Default value */
} aSyscall[] = {
@@ -401,11 +431,7 @@ static struct unix_syscall {
#define osPwrite64 ((ssize_t(*)(int,const void*,size_t,off_t))\
aSyscall[13].pCurrent)
-#if SQLITE_ENABLE_LOCKING_STYLE
{ "fchmod", (sqlite3_syscall_ptr)fchmod, 0 },
-#else
- { "fchmod", (sqlite3_syscall_ptr)0, 0 },
-#endif
#define osFchmod ((int(*)(int,mode_t))aSyscall[14].pCurrent)
#if defined(HAVE_POSIX_FALLOCATE) && HAVE_POSIX_FALLOCATE
@@ -430,8 +456,18 @@ static struct unix_syscall {
{ "fchown", (sqlite3_syscall_ptr)posixFchown, 0 },
#define osFchown ((int(*)(int,uid_t,gid_t))aSyscall[20].pCurrent)
- { "umask", (sqlite3_syscall_ptr)umask, 0 },
-#define osUmask ((mode_t(*)(mode_t))aSyscall[21].pCurrent)
+ { "mmap", (sqlite3_syscall_ptr)mmap, 0 },
+#define osMmap ((void*(*)(void*,size_t,int,int,int,off_t))aSyscall[21].pCurrent)
+
+ { "munmap", (sqlite3_syscall_ptr)munmap, 0 }, /* munmap() returns int */
+#define osMunmap ((int(*)(void*,size_t))aSyscall[22].pCurrent)
+
+#if HAVE_MREMAP
+ { "mremap", (sqlite3_syscall_ptr)mremap, 0 },
+#else
+ { "mremap", (sqlite3_syscall_ptr)0, 0 },
+#endif
+#define osMremap ((void*(*)(void*,size_t,size_t,int,...))aSyscall[23].pCurrent)
}; /* End of the overrideable system calls */
@@ -537,14 +573,7 @@ static const char *unixNextSystemCall(sqlite3_vfs *p, const char *zName){
*/
static int robust_open(const char *z, int f, mode_t m){
int fd;
- mode_t m2;
- mode_t origM = 0;
- if( m==0 ){
- m2 = SQLITE_DEFAULT_FILE_PERMISSIONS;
- }else{
- m2 = m;
- origM = osUmask(0);
- }
+ mode_t m2 = m ? m : SQLITE_DEFAULT_FILE_PERMISSIONS;
do{
#if defined(O_CLOEXEC)
fd = osOpen(z,f|O_CLOEXEC,m2);
@@ -552,12 +581,20 @@ static int robust_open(const char *z, int f, mode_t m){
fd = osOpen(z,f,m2);
#endif
}while( fd<0 && errno==EINTR );
- if( m ){
- osUmask(origM);
- }
+ if( fd>=0 ){
+ if( m!=0 ){
+ struct stat statbuf;
+ if( osFstat(fd, &statbuf)==0
+ && statbuf.st_size==0
+ && (statbuf.st_mode&0777)!=m
+ ){
+ osFchmod(fd, m);
+ }
+ }
#if defined(FD_CLOEXEC) && (!defined(O_CLOEXEC) || O_CLOEXEC==0)
- if( fd>=0 ) osFcntl(fd, F_SETFD, osFcntl(fd, F_GETFD, 0) | FD_CLOEXEC);
+ osFcntl(fd, F_SETFD, osFcntl(fd, F_GETFD, 0) | FD_CLOEXEC);
#endif
+ }
return fd;
}
@@ -763,7 +800,6 @@ static int sqliteErrorFromPosixError(int posixError, int sqliteIOErr) {
}
-
/******************************************************************************
****************** Begin Unique File ID Utility Used By VxWorks ***************
**
@@ -1099,7 +1135,6 @@ static int unixLogErrorAtLine(
zErr = strerror(iErrno);
#endif
- assert( errcode!=SQLITE_OK );
if( zPath==0 ) zPath = "";
sqlite3_log(errcode,
"os_unix.c:%d: (%d) %s(%s) - %s",
@@ -1266,6 +1301,50 @@ static int findInodeInfo(
/*
+** Check a unixFile that is a database. Verify the following:
+**
+** (1) There is exactly one hard link on the file
+** (2) The file is not a symbolic link
+** (3) The file has not been renamed or unlinked
+**
+** Issue sqlite3_log(SQLITE_WARNING,...) messages if anything is not right.
+*/
+static void verifyDbFile(unixFile *pFile){
+ struct stat buf; /* Filled by fstat() on the handle, later by stat() on the path */
+ int rc;
+ if( pFile->ctrlFlags & UNIXFILE_WARNED ){
+ /* One or more of the following warnings have already been issued. Do not
+ ** repeat them so as not to clutter the error log. Every branch below
+ ** sets UNIXFILE_WARNED and returns, so at most one warning is logged
+ ** per unixFile. */
+ return;
+ }
+ rc = osFstat(pFile->h, &buf); /* Inspect the open descriptor itself */
+ if( rc!=0 ){
+ sqlite3_log(SQLITE_WARNING, "cannot fstat db file %s", pFile->zPath);
+ pFile->ctrlFlags |= UNIXFILE_WARNED;
+ return;
+ }
+ if( buf.st_nlink==0 && (pFile->ctrlFlags & UNIXFILE_DELETE)==0 ){ /* Delete-on-close files are exempt */
+ sqlite3_log(SQLITE_WARNING, "file unlinked while open: %s", pFile->zPath);
+ pFile->ctrlFlags |= UNIXFILE_WARNED;
+ return;
+ }
+ if( buf.st_nlink>1 ){
+ sqlite3_log(SQLITE_WARNING, "multiple links to file: %s", pFile->zPath);
+ pFile->ctrlFlags |= UNIXFILE_WARNED;
+ return;
+ }
+ if( pFile->pInode!=0 /* Rename check is skipped when no inode info is attached */
+ && ((rc = osStat(pFile->zPath, &buf))!=0 /* Path no longer resolves, or... */
+ || buf.st_ino!=pFile->pInode->fileId.ino) /* ...resolves to a different inode */
+ ){
+ sqlite3_log(SQLITE_WARNING, "file renamed while open: %s", pFile->zPath);
+ pFile->ctrlFlags |= UNIXFILE_WARNED;
+ return;
+ }
+}
+
+
+/*
** This routine checks if there is a RESERVED lock held on the specified
** file by this or any other process. If such a lock is held, set *pResOut
** to a non-zero value otherwise *pResOut is set to zero. The return value
@@ -1795,9 +1874,13 @@ end_unlock:
** the requested locking level, this routine is a no-op.
*/
static int unixUnlock(sqlite3_file *id, int eFileLock){
+ assert( eFileLock==SHARED_LOCK || ((unixFile *)id)->nFetchOut==0 ); /* Any level other than SHARED_LOCK requires no outstanding xFetch refs */
return posixUnlock(id, eFileLock, 0);
}
+static int unixMapfile(unixFile *pFd, i64 nByte);
+static void unixUnmapfile(unixFile *pFd);
+
/*
** This function performs the parts of the "close file" operation
** common to all locking schemes. It closes the directory and file
@@ -1810,6 +1893,7 @@ static int unixUnlock(sqlite3_file *id, int eFileLock){
*/
static int closeUnixFile(sqlite3_file *id){
unixFile *pFile = (unixFile*)id;
+ unixUnmapfile(pFile);
if( pFile->h>=0 ){
robust_close(pFile, pFile->h, __LINE__);
pFile->h = -1;
@@ -1836,6 +1920,7 @@ static int closeUnixFile(sqlite3_file *id){
static int unixClose(sqlite3_file *id){
int rc = SQLITE_OK;
unixFile *pFile = (unixFile *)id;
+ verifyDbFile(pFile);
unixUnlock(id, NO_LOCK);
unixEnterMutex();
@@ -1904,7 +1989,7 @@ static int nolockClose(sqlite3_file *id) {
/******************************************************************************
************************* Begin dot-file Locking ******************************
**
-** The dotfile locking implementation uses the existance of separate lock
+** The dotfile locking implementation uses the existence of separate lock
** files (really a directory) to control access to the database. This works
** on just about every filesystem imaginable. But there are serious downsides:
**
@@ -1919,7 +2004,7 @@ static int nolockClose(sqlite3_file *id) {
**
** Dotfile locking works by creating a subdirectory in the same directory as
** the database and with the same name but with a ".lock" extension added.
-** The existance of a lock directory implies an EXCLUSIVE lock. All other
+** The existence of a lock directory implies an EXCLUSIVE lock. All other
** lock types (SHARED, RESERVED, PENDING) are mapped into EXCLUSIVE.
*/
@@ -2086,13 +2171,13 @@ static int dotlockUnlock(sqlite3_file *id, int eFileLock) {
** Close a file. Make sure the lock has been released before closing.
*/
static int dotlockClose(sqlite3_file *id) {
- int rc;
+ int rc = SQLITE_OK;
if( id ){
unixFile *pFile = (unixFile*)id;
dotlockUnlock(id, NO_LOCK);
sqlite3_free(pFile->lockingContext);
+ rc = closeUnixFile(id);
}
- rc = closeUnixFile(id);
return rc;
}
/****************** End of the dot-file lock implementation *******************
@@ -2296,10 +2381,12 @@ static int flockUnlock(sqlite3_file *id, int eFileLock) {
** Close a file.
*/
static int flockClose(sqlite3_file *id) {
+ int rc = SQLITE_OK;
if( id ){
flockUnlock(id, NO_LOCK);
+ rc = closeUnixFile(id);
}
- return closeUnixFile(id);
+ return rc;
}
#endif /* SQLITE_ENABLE_LOCKING_STYLE && !OS_VXWORK */
@@ -3010,6 +3097,8 @@ static int seekAndRead(unixFile *id, sqlite3_int64 offset, void *pBuf, int cnt){
i64 newOffset;
#endif
TIMER_START;
+ assert( cnt==(cnt&0x1ffff) );
+ cnt &= 0x1ffff;
do{
#if defined(USE_PREAD)
got = osPread(id->h, pBuf, cnt, offset);
@@ -3063,6 +3152,8 @@ static int unixRead(
unixFile *pFile = (unixFile *)id;
int got;
assert( id );
+ assert( offset>=0 );
+ assert( amt>0 );
/* If this is a database file (not a journal, master-journal or temp
** file), the bytes in the locking range should never be read or written. */
@@ -3073,6 +3164,23 @@ static int unixRead(
);
#endif
+#if SQLITE_MAX_MMAP_SIZE>0
+ /* Deal with as much of this read request as possible by transferring
+ ** data from the memory mapping using memcpy(). */
+ if( offset<pFile->mmapSize ){
+ if( offset+amt <= pFile->mmapSize ){
+ memcpy(pBuf, &((u8 *)(pFile->pMapRegion))[offset], amt);
+ return SQLITE_OK;
+ }else{
+ int nCopy = pFile->mmapSize - offset;
+ memcpy(pBuf, &((u8 *)(pFile->pMapRegion))[offset], nCopy);
+ pBuf = &((u8 *)pBuf)[nCopy];
+ amt -= nCopy;
+ offset += nCopy;
+ }
+ }
+#endif
+
got = seekAndRead(pFile, offset, pBuf, amt);
if( got==amt ){
return SQLITE_OK;
@@ -3088,44 +3196,59 @@ static int unixRead(
}
/*
-** Seek to the offset in id->offset then read cnt bytes into pBuf.
-** Return the number of bytes actually read. Update the offset.
-**
-** To avoid stomping the errno value on a failed write the lastErrno value
-** is set before returning.
+** Attempt to seek the file-descriptor passed as the first argument to
+** absolute offset iOff, then attempt to write nBuf bytes of data from
+** pBuf to it. If an error occurs, return -1 and set *piErrno. Otherwise,
+** return the actual number of bytes written (which may be less than
+** nBuf).
*/
-static int seekAndWrite(unixFile *id, i64 offset, const void *pBuf, int cnt){
- int got;
-#if (!defined(USE_PREAD) && !defined(USE_PREAD64))
- i64 newOffset;
-#endif
+static int seekAndWriteFd(
+ int fd, /* File descriptor to write to */
+ i64 iOff, /* File offset to begin writing at */
+ const void *pBuf, /* Copy data from this buffer to the file */
+ int nBuf, /* Size of buffer pBuf in bytes */
+ int *piErrno /* OUT: Error number if error occurs */
+){
+ int rc = 0; /* Value returned by system call */
+
+ assert( nBuf==(nBuf&0x1ffff) );
+ nBuf &= 0x1ffff;
TIMER_START;
+
#if defined(USE_PREAD)
- do{ got = osPwrite(id->h, pBuf, cnt, offset); }while( got<0 && errno==EINTR );
+ do{ rc = osPwrite(fd, pBuf, nBuf, iOff); }while( rc<0 && errno==EINTR );
#elif defined(USE_PREAD64)
- do{ got = osPwrite64(id->h, pBuf, cnt, offset);}while( got<0 && errno==EINTR);
+ do{ rc = osPwrite64(fd, pBuf, nBuf, iOff);}while( rc<0 && errno==EINTR);
#else
do{
- newOffset = lseek(id->h, offset, SEEK_SET);
- SimulateIOError( newOffset-- );
- if( newOffset!=offset ){
- if( newOffset == -1 ){
- ((unixFile*)id)->lastErrno = errno;
- }else{
- ((unixFile*)id)->lastErrno = 0;
- }
+ i64 iSeek = lseek(fd, iOff, SEEK_SET);
+ SimulateIOError( iSeek-- );
+
+ if( iSeek!=iOff ){
+ if( piErrno ) *piErrno = (iSeek==-1 ? errno : 0);
return -1;
}
- got = osWrite(id->h, pBuf, cnt);
- }while( got<0 && errno==EINTR );
+ rc = osWrite(fd, pBuf, nBuf);
+ }while( rc<0 && errno==EINTR );
#endif
+
TIMER_END;
- if( got<0 ){
- ((unixFile*)id)->lastErrno = errno;
- }
+ OSTRACE(("WRITE %-3d %5d %7lld %llu\n", fd, rc, iOff, TIMER_ELAPSED));
- OSTRACE(("WRITE %-3d %5d %7lld %llu\n", id->h, got, offset, TIMER_ELAPSED));
- return got;
+ if( rc<0 && piErrno ) *piErrno = errno;
+ return rc;
+}
+
+
+/*
+** Write cnt bytes from pBuf to file id at the given offset by delegating
+** to seekAndWriteFd(). Return whatever seekAndWriteFd() returns.
+**
+** The error number reported by seekAndWriteFd() is stored in id->lastErrno
+** so that the errno value of a failed write is not lost.
+*/
+static int seekAndWrite(unixFile *id, i64 offset, const void *pBuf, int cnt){
+ return seekAndWriteFd(id->h, offset, pBuf, cnt, &id->lastErrno);
+}
@@ -3175,6 +3298,23 @@ static int unixWrite(
}
#endif
+#if SQLITE_MAX_MMAP_SIZE>0
+ /* Deal with as much of this write request as possible by transferring
+ ** data to the memory mapping using memcpy(). */
+ if( offset<pFile->mmapSize ){
+ if( offset+amt <= pFile->mmapSize ){
+ memcpy(&((u8 *)(pFile->pMapRegion))[offset], pBuf, amt);
+ return SQLITE_OK;
+ }else{
+ int nCopy = pFile->mmapSize - offset;
+ memcpy(&((u8 *)(pFile->pMapRegion))[offset], pBuf, nCopy);
+ pBuf = &((u8 *)pBuf)[nCopy];
+ amt -= nCopy;
+ offset += nCopy;
+ }
+ }
+#endif
+
while( amt>0 && (wrote = seekAndWrite(pFile, offset, pBuf, amt))>0 ){
amt -= wrote;
offset += wrote;
@@ -3402,7 +3542,7 @@ static int unixSync(sqlite3_file *id, int flags){
}
/* Also fsync the directory containing the file if the DIRSYNC flag
- ** is set. This is a one-time occurrance. Many systems (examples: AIX)
+ ** is set. This is a one-time occurrence. Many systems (examples: AIX)
** are unable to fsync a directory, so ignore errors on the fsync.
*/
if( pFile->ctrlFlags & UNIXFILE_DIRSYNC ){
@@ -3457,6 +3597,14 @@ static int unixTruncate(sqlite3_file *id, i64 nByte){
}
#endif
+ /* If the file was just truncated to a size smaller than the currently
+ ** mapped region, reduce the effective mapping size as well. SQLite will
+ ** use read() and write() to access data beyond this point from now on.
+ */
+ if( nByte<pFile->mmapSize ){
+ pFile->mmapSize = nByte;
+ }
+
return SQLITE_OK;
}
}
@@ -3545,6 +3693,19 @@ static int fcntlSizeHint(unixFile *pFile, i64 nByte){
}
}
+ if( pFile->mmapSizeMax>0 && nByte>pFile->mmapSize ){
+ int rc;
+ if( pFile->szChunk<=0 ){
+ if( robust_ftruncate(pFile->h, nByte) ){
+ pFile->lastErrno = errno;
+ return unixLogError(SQLITE_IOERR_TRUNCATE, "ftruncate", pFile->zPath);
+ }
+ }
+
+ rc = unixMapfile(pFile, nByte);
+ return rc;
+ }
+
return SQLITE_OK;
}
@@ -3564,6 +3725,9 @@ static void unixModeBit(unixFile *pFile, unsigned char mask, int *pArg){
}
}
+/* Forward declaration */
+static int unixGetTempname(int nBuf, char *zBuf);
+
/*
** Information and control of an open file handle.
*/
@@ -3601,6 +3765,26 @@ static int unixFileControl(sqlite3_file *id, int op, void *pArg){
*(char**)pArg = sqlite3_mprintf("%s", pFile->pVfs->zName);
return SQLITE_OK;
}
+ case SQLITE_FCNTL_TEMPFILENAME: {
+ char *zTFile = sqlite3_malloc( pFile->pVfs->mxPathname );
+ if( zTFile ){
+ unixGetTempname(pFile->pVfs->mxPathname, zTFile);
+ *(char**)pArg = zTFile;
+ }
+ return SQLITE_OK;
+ }
+ case SQLITE_FCNTL_MMAP_SIZE: {
+ i64 newLimit = *(i64*)pArg;
+ if( newLimit>sqlite3GlobalConfig.mxMmap ){
+ newLimit = sqlite3GlobalConfig.mxMmap;
+ }
+ *(i64*)pArg = pFile->mmapSizeMax;
+ if( newLimit>=0 ){
+ pFile->mmapSizeMax = newLimit;
+ if( newLimit<pFile->mmapSize ) pFile->mmapSize = newLimit;
+ }
+ return SQLITE_OK;
+ }
#ifdef SQLITE_DEBUG
/* The pager calls this method to signal that it has done
** a rollback and that the database is therefore unchanged and
@@ -3632,10 +3816,92 @@ static int unixFileControl(sqlite3_file *id, int op, void *pArg){
** a database and its journal file) that the sector size will be the
** same for both.
*/
-static int unixSectorSize(sqlite3_file *pFile){
- (void)pFile;
+#ifndef __QNXNTO__
+static int unixSectorSize(sqlite3_file *NotUsed){
+ UNUSED_PARAMETER(NotUsed);
return SQLITE_DEFAULT_SECTOR_SIZE;
}
+#endif
+
+/*
+** The following version of unixSectorSize() is optimized for QNX.
+*/
+#ifdef __QNXNTO__
+#include <sys/dcmd_blk.h>
+#include <sys/statvfs.h>
+static int unixSectorSize(sqlite3_file *id){
+ unixFile *pFile = (unixFile*)id;
+ if( pFile->sectorSize == 0 ){ /* Zero means not probed yet; results are cached in pFile */
+ struct statvfs fsInfo;
+
+ /* Set defaults for non-supported filesystems */
+ pFile->sectorSize = SQLITE_DEFAULT_SECTOR_SIZE;
+ pFile->deviceCharacteristics = 0;
+ if( fstatvfs(pFile->h, &fsInfo) == -1 ) {
+ return pFile->sectorSize; /* fstatvfs() failed: keep the defaults set above */
+ }
+
+ if( !strcmp(fsInfo.f_basetype, "tmp") ) {
+ pFile->sectorSize = fsInfo.f_bsize;
+ pFile->deviceCharacteristics =
+ SQLITE_IOCAP_ATOMIC4K | /* All ram filesystem writes are atomic */
+ SQLITE_IOCAP_SAFE_APPEND | /* growing the file does not occur until
+ ** the write succeeds */
+ SQLITE_IOCAP_SEQUENTIAL | /* The ram filesystem has no write behind
+ ** so it is ordered */
+ 0;
+ }else if( strstr(fsInfo.f_basetype, "etfs") ){ /* Substring match, unlike the strcmp() cases */
+ pFile->sectorSize = fsInfo.f_bsize;
+ pFile->deviceCharacteristics =
+ /* etfs cluster size writes are atomic */
+ (pFile->sectorSize / 512 * SQLITE_IOCAP_ATOMIC512) |
+ SQLITE_IOCAP_SAFE_APPEND | /* growing the file does not occur until
+ ** the write succeeds */
+ SQLITE_IOCAP_SEQUENTIAL | /* The ram filesystem has no write behind
+ ** so it is ordered */
+ 0;
+ }else if( !strcmp(fsInfo.f_basetype, "qnx6") ){
+ pFile->sectorSize = fsInfo.f_bsize;
+ pFile->deviceCharacteristics =
+ SQLITE_IOCAP_ATOMIC | /* All filesystem writes are atomic */
+ SQLITE_IOCAP_SAFE_APPEND | /* growing the file does not occur until
+ ** the write succeeds */
+ SQLITE_IOCAP_SEQUENTIAL | /* The ram filesystem has no write behind
+ ** so it is ordered */
+ 0;
+ }else if( !strcmp(fsInfo.f_basetype, "qnx4") ){
+ pFile->sectorSize = fsInfo.f_bsize;
+ pFile->deviceCharacteristics =
+ /* full bitset of atomics from max sector size and smaller */
+ ((pFile->sectorSize / 512 * SQLITE_IOCAP_ATOMIC512) << 1) - 2 |
+ SQLITE_IOCAP_SEQUENTIAL | /* The ram filesystem has no write behind
+ ** so it is ordered */
+ 0;
+ }else if( strstr(fsInfo.f_basetype, "dos") ){ /* Substring match */
+ pFile->sectorSize = fsInfo.f_bsize;
+ pFile->deviceCharacteristics =
+ /* full bitset of atomics from max sector size and smaller */
+ ((pFile->sectorSize / 512 * SQLITE_IOCAP_ATOMIC512) << 1) - 2 |
+ SQLITE_IOCAP_SEQUENTIAL | /* The ram filesystem has no write behind
+ ** so it is ordered */
+ 0;
+ }else{
+ pFile->deviceCharacteristics = /* sectorSize keeps the default assigned above */
+ SQLITE_IOCAP_ATOMIC512 | /* blocks are atomic */
+ SQLITE_IOCAP_SAFE_APPEND | /* growing the file does not occur until
+ ** the write succeeds */
+ 0;
+ }
+ }
+ /* Last chance verification. If the sector size isn't a multiple of 512
+ ** then it isn't valid. This check sits outside the probe above, so it
+ ** runs on every call, not only the first. */
+ if( pFile->sectorSize % 512 != 0 ){
+ pFile->deviceCharacteristics = 0;
+ pFile->sectorSize = SQLITE_DEFAULT_SECTOR_SIZE;
+ }
+ return pFile->sectorSize;
+}
+#endif /* __QNXNTO__ */
/*
** Return the device characteristics for the file.
@@ -3652,11 +3918,15 @@ static int unixSectorSize(sqlite3_file *pFile){
*/
static int unixDeviceCharacteristics(sqlite3_file *id){
unixFile *p = (unixFile*)id;
+ int rc = 0;
+#ifdef __QNXNTO__
+ if( p->sectorSize==0 ) unixSectorSize(id);
+ rc = p->deviceCharacteristics;
+#endif
if( p->ctrlFlags & UNIXFILE_PSOW ){
- return SQLITE_IOCAP_POWERSAFE_OVERWRITE;
- }else{
- return 0;
+ rc |= SQLITE_IOCAP_POWERSAFE_OVERWRITE;
}
+ return rc;
}
#ifndef SQLITE_OMIT_WAL
@@ -3827,7 +4097,7 @@ static void unixShmPurge(unixFile *pFd){
sqlite3_mutex_free(p->mutex);
for(i=0; i<p->nRegion; i++){
if( p->h>=0 ){
- munmap(p->apRegion[i], p->szRegion);
+ osMunmap(p->apRegion[i], p->szRegion);
}else{
sqlite3_free(p->apRegion[i]);
}
@@ -4067,16 +4337,32 @@ static int unixShmMap(
if( sStat.st_size<nByte ){
/* The requested memory region does not exist. If bExtend is set to
** false, exit early. *pp will be set to NULL and SQLITE_OK returned.
- **
- ** Alternatively, if bExtend is true, use ftruncate() to allocate
- ** the requested memory region.
*/
- if( !bExtend ) goto shmpage_out;
- if( robust_ftruncate(pShmNode->h, nByte) ){
- rc = unixLogError(SQLITE_IOERR_SHMSIZE, "ftruncate",
- pShmNode->zFilename);
+ if( !bExtend ){
goto shmpage_out;
}
+
+ /* Alternatively, if bExtend is true, extend the file. Do this by
+ ** writing a single byte to the end of each (OS) page being
+ ** allocated or extended. Technically, we need only write to the
+ ** last page in order to extend the file. But writing to all new
+ ** pages forces the OS to allocate them immediately, which reduces
+ ** the chances of SIGBUS while accessing the mapped region later on.
+ */
+ else{
+ static const int pgsz = 4096;
+ int iPg;
+
+ /* Write to the last byte of each newly allocated or extended page */
+ assert( (nByte % pgsz)==0 );
+ for(iPg=(sStat.st_size/pgsz); iPg<(nByte/pgsz); iPg++){
+ if( seekAndWriteFd(pShmNode->h, iPg*pgsz + pgsz-1, "", 1, 0)!=1 ){
+ const char *zFile = pShmNode->zFilename;
+ rc = unixLogError(SQLITE_IOERR_SHMSIZE, "write", zFile);
+ goto shmpage_out;
+ }
+ }
+ }
}
}
@@ -4092,9 +4378,9 @@ static int unixShmMap(
while(pShmNode->nRegion<=iRegion){
void *pMem;
if( pShmNode->h>=0 ){
- pMem = mmap(0, szRegion,
+ pMem = osMmap(0, szRegion,
pShmNode->isReadonly ? PROT_READ : PROT_READ|PROT_WRITE,
- MAP_SHARED, pShmNode->h, pShmNode->nRegion*szRegion
+ MAP_SHARED, pShmNode->h, szRegion*(i64)pShmNode->nRegion
);
if( pMem==MAP_FAILED ){
rc = unixLogError(SQLITE_IOERR_SHMMAP, "mmap", pShmNode->zFilename);
@@ -4310,6 +4596,236 @@ static int unixShmUnmap(
#endif /* #ifndef SQLITE_OMIT_WAL */
/*
+** If it is currently memory mapped, unmap file pFd.
+*/
+static void unixUnmapfile(unixFile *pFd){
+ assert( pFd->nFetchOut==0 ); /* Must not unmap while xFetch references are outstanding */
+#if SQLITE_MAX_MMAP_SIZE>0
+ if( pFd->pMapRegion ){
+ osMunmap(pFd->pMapRegion, pFd->mmapSizeActual); /* Release the actual extent, not just the usable size */
+ pFd->pMapRegion = 0; /* Reset all three mapping fields to the "not mapped" state */
+ pFd->mmapSize = 0;
+ pFd->mmapSizeActual = 0;
+ }
+#endif
+}
+
+#if SQLITE_MAX_MMAP_SIZE>0
+/*
+** Return the granularity, in bytes, that unixRemapfile() rounds down to
+** when deciding how much of an existing mapping can be kept: a fixed 512
+** when HAVE_MREMAP is set, otherwise the system page size.
+**
+** NOTE(review): the reason 512 is sufficient in the mremap() case is not
+** stated in this file - confirm before changing it.
+*/
+static int unixGetPagesize(void){
+#if HAVE_MREMAP
+ return 512;
+#elif defined(_BSD_SOURCE)
+ return getpagesize();
+#else
+ return (int)sysconf(_SC_PAGESIZE);
+#endif
+}
+#endif /* SQLITE_MAX_MMAP_SIZE>0 */
+
+#if SQLITE_MAX_MMAP_SIZE>0
+/*
+** Attempt to set the size of the memory mapping maintained by file
+** descriptor pFd to nNew bytes. Any existing mapping is discarded.
+**
+** If successful, this function sets the following variables:
+**
+** unixFile.pMapRegion
+** unixFile.mmapSize
+** unixFile.mmapSizeActual
+**
+** If unsuccessful, an error message is logged via sqlite3_log() and
+** the three variables above are zeroed. In this case SQLite should
+** continue accessing the database using the xRead() and xWrite()
+** methods.
+*/
+static void unixRemapfile(
+ unixFile *pFd, /* File descriptor object */
+ i64 nNew /* Required mapping size */
+){
+ const char *zErr = "mmap"; /* Name of the call to report if mapping fails */
+ int h = pFd->h; /* File descriptor open on db file */
+ u8 *pOrig = (u8 *)pFd->pMapRegion; /* Pointer to current file mapping */
+ i64 nOrig = pFd->mmapSizeActual; /* Size of pOrig region in bytes */
+ u8 *pNew = 0; /* Location of new mapping */
+ int flags = PROT_READ; /* Protection bits (third argument) for mmap() */
+
+ assert( pFd->nFetchOut==0 );
+ assert( nNew>pFd->mmapSize );
+ assert( nNew<=pFd->mmapSizeMax );
+ assert( nNew>0 );
+ assert( pFd->mmapSizeActual>=pFd->mmapSize );
+ assert( MAP_FAILED!=0 ); /* pNew==0 ("nothing mapped yet") must stay distinct from MAP_FAILED */
+
+ if( (pFd->ctrlFlags & UNIXFILE_RDONLY)==0 ) flags |= PROT_WRITE;
+
+ if( pOrig ){
+ const int szSyspage = unixGetPagesize();
+ i64 nReuse = (pFd->mmapSize & ~(szSyspage-1)); /* Usable size rounded down; assumes szSyspage is a power of two */
+ u8 *pReq = &pOrig[nReuse]; /* First byte past the part being kept */
+
+ /* Unmap any pages of the existing mapping that cannot be reused. */
+ if( nReuse!=nOrig ){
+ osMunmap(pReq, nOrig-nReuse);
+ }
+
+#if HAVE_MREMAP
+ pNew = osMremap(pOrig, nReuse, nNew, MREMAP_MAYMOVE);
+ zErr = "mremap";
+#else
+ pNew = osMmap(pReq, nNew-nReuse, flags, MAP_SHARED, h, nReuse); /* Try to map the remainder directly after the kept part */
+ if( pNew!=MAP_FAILED ){
+ if( pNew!=pReq ){
+ osMunmap(pNew, nNew - nReuse); /* Landed elsewhere, so not contiguous: discard it */
+ pNew = 0;
+ }else{
+ pNew = pOrig; /* Contiguous: the combined region starts at pOrig */
+ }
+ }
+#endif
+
+ /* The attempt to extend the existing mapping failed. Free it. */
+ if( pNew==MAP_FAILED || pNew==0 ){
+ osMunmap(pOrig, nReuse);
+ }
+ }
+
+ /* If pNew is still NULL, try to create an entirely new mapping. */
+ if( pNew==0 ){
+ pNew = osMmap(0, nNew, flags, MAP_SHARED, h, 0);
+ }
+
+ if( pNew==MAP_FAILED ){
+ pNew = 0;
+ nNew = 0;
+ unixLogError(SQLITE_OK, zErr, pFd->zPath); /* Log only; this function is void and reports no error */
+
+ /* If the mmap() above failed, assume that all subsequent mmap() calls
+ ** will probably fail too. Fall back to using xRead/xWrite exclusively
+ ** in this case. */
+ pFd->mmapSizeMax = 0;
+ }
+ pFd->pMapRegion = (void *)pNew;
+ pFd->mmapSize = pFd->mmapSizeActual = nNew; /* Both zero on failure, both nNew on success */
+}
+#endif
+
+/*
+** Memory map or remap the file opened by file-descriptor pFd (if the file
+** is already mapped, the existing mapping is replaced by the new). Or, if
+** there already exists a mapping for this file, and there are still
+** outstanding xFetch() references to it, this function is a no-op.
+**
+** If parameter nByte is non-negative, then it is the requested size of
+** the mapping to create. Otherwise, if nByte is less than zero, then the
+** requested size is the size of the file on disk. The actual size of the
+** created mapping is either the requested size or the value configured
+** using SQLITE_FCNTL_MMAP_SIZE, whichever is smaller.
+**
+** SQLITE_OK is returned if no error occurs (even if the mapping is not
+** recreated as a result of outstanding references) or an SQLite error
+** code otherwise.
+*/
+static int unixMapfile(unixFile *pFd, i64 nByte){
+#if SQLITE_MAX_MMAP_SIZE>0
+ i64 nMap = nByte; /* Size of the mapping to create, before clamping */
+ int rc;
+
+ assert( nMap>=0 || pFd->nFetchOut==0 );
+ if( pFd->nFetchOut>0 ) return SQLITE_OK; /* Leave the mapping alone while xFetch refs are outstanding */
+
+ if( nMap<0 ){
+ struct stat statbuf; /* Low-level file information */
+ rc = osFstat(pFd->h, &statbuf);
+ if( rc!=SQLITE_OK ){
+ return SQLITE_IOERR_FSTAT;
+ }
+ nMap = statbuf.st_size; /* Negative request: use the current file size */
+ }
+ if( nMap>pFd->mmapSizeMax ){
+ nMap = pFd->mmapSizeMax; /* Never map more than the configured maximum */
+ }
+
+ if( nMap!=pFd->mmapSize ){
+ if( nMap>0 ){
+ unixRemapfile(pFd, nMap); /* Returns void: a failed remap is logged and SQLITE_OK is still returned. NOTE(review): it asserts nMap>mmapSize - confirm callers guarantee this */
+ }else{
+ unixUnmapfile(pFd);
+ }
+ }
+#endif
+
+ return SQLITE_OK;
+}
+
+/*
+** If possible, return a pointer to a mapping of file fd starting at offset
+** iOff. The mapping must be valid for at least nAmt bytes.
+**
+** If such a pointer can be obtained, store it in *pp and return SQLITE_OK.
+** Or, if one cannot but no error occurs, set *pp to 0 and return SQLITE_OK.
+** Finally, if an error does occur, return an SQLite error code. The final
+** value of *pp is undefined in this case.
+**
+** If this function does return a pointer, the caller must eventually
+** release the reference by calling unixUnfetch().
+*/
+static int unixFetch(sqlite3_file *fd, i64 iOff, int nAmt, void **pp){
+#if SQLITE_MAX_MMAP_SIZE>0
+ unixFile *pFd = (unixFile *)fd; /* The underlying database file */
+#endif
+ *pp = 0; /* Default result: no pointer available */
+
+#if SQLITE_MAX_MMAP_SIZE>0
+ if( pFd->mmapSizeMax>0 ){
+ if( pFd->pMapRegion==0 ){
+ int rc = unixMapfile(pFd, -1); /* -1: size the mapping from the file size, capped by mmapSizeMax */
+ if( rc!=SQLITE_OK ) return rc;
+ }
+ if( pFd->mmapSize >= iOff+nAmt ){ /* Whole requested range lies inside the usable mapping */
+ *pp = &((u8 *)pFd->pMapRegion)[iOff];
+ pFd->nFetchOut++; /* Counted so the mapping is not replaced or unmapped while in use */
+ }
+ }
+#endif
+ return SQLITE_OK;
+}
+
+/*
+** If the third argument is non-NULL, then this function releases a
+** reference obtained by an earlier call to unixFetch(). The second
+** argument passed to this function must be the same as the corresponding
+** argument that was passed to the unixFetch() invocation.
+**
+** Or, if the third argument is NULL, then this function is being called
+** to inform the VFS layer that, according to POSIX, any existing mapping
+** may now be invalid and should be unmapped.
+*/
+static int unixUnfetch(sqlite3_file *fd, i64 iOff, void *p){
+ unixFile *pFd = (unixFile *)fd; /* The underlying database file */
+ UNUSED_PARAMETER(iOff); /* iOff is referenced only inside an assert() below */
+
+ /* If p==0 (unmap the entire file) then there must be no outstanding
+ ** xFetch references. Or, if p!=0 (meaning it is an xFetch reference),
+ ** then there must be at least one outstanding. */
+ assert( (p==0)==(pFd->nFetchOut==0) );
+
+ /* If p!=0, it must match the iOff value. */
+ assert( p==0 || p==&((u8 *)pFd->pMapRegion)[iOff] );
+
+ if( p ){
+ pFd->nFetchOut--; /* Release one reference; the mapping itself stays in place */
+ }else{
+ unixUnmapfile(pFd); /* p==0: drop the whole mapping */
+ }
+
+ assert( pFd->nFetchOut>=0 );
+ return SQLITE_OK;
+}
+
+/*
** Here ends the implementation of all sqlite3_file methods.
**
********************** End sqlite3_file Methods *******************************
@@ -4367,7 +4883,9 @@ static const sqlite3_io_methods METHOD = { \
unixShmMap, /* xShmMap */ \
unixShmLock, /* xShmLock */ \
unixShmBarrier, /* xShmBarrier */ \
- unixShmUnmap /* xShmUnmap */ \
+ unixShmUnmap, /* xShmUnmap */ \
+ unixFetch, /* xFetch */ \
+ unixUnfetch, /* xUnfetch */ \
}; \
static const sqlite3_io_methods *FINDER##Impl(const char *z, unixFile *p){ \
UNUSED_PARAMETER(z); UNUSED_PARAMETER(p); \
@@ -4384,7 +4902,7 @@ static const sqlite3_io_methods *(*const FINDER)(const char*,unixFile *p) \
IOMETHODS(
posixIoFinder, /* Finder function name */
posixIoMethods, /* sqlite3_io_methods object name */
- 2, /* shared memory is enabled */
+ 3, /* shared memory and mmap are enabled */
unixClose, /* xClose method */
unixLock, /* xLock method */
unixUnlock, /* xUnlock method */
@@ -4635,11 +5153,12 @@ static int fillInUnixFile(
pNew->pVfs = pVfs;
pNew->zPath = zFilename;
pNew->ctrlFlags = (u8)ctrlFlags;
+ pNew->mmapSizeMax = sqlite3GlobalConfig.szMmap;
if( sqlite3_uri_boolean(((ctrlFlags & UNIXFILE_URI) ? zFilename : 0),
"psow", SQLITE_POWERSAFE_OVERWRITE) ){
pNew->ctrlFlags |= UNIXFILE_PSOW;
}
- if( memcmp(pVfs->zName,"unix-excl",10)==0 ){
+ if( strcmp(pVfs->zName,"unix-excl")==0 ){
pNew->ctrlFlags |= UNIXFILE_EXCL;
}
@@ -4671,7 +5190,7 @@ static int fillInUnixFile(
unixEnterMutex();
rc = findInodeInfo(pNew, &pNew->pInode);
if( rc!=SQLITE_OK ){
- /* If an error occured in findInodeInfo(), close the file descriptor
+ /* If an error occurred in findInodeInfo(), close the file descriptor
** immediately, before releasing the mutex. findInodeInfo() may fail
** in two scenarios:
**
@@ -4770,15 +5289,15 @@ static int fillInUnixFile(
if( h>=0 ) robust_close(pNew, h, __LINE__);
h = -1;
osUnlink(zFilename);
- isDelete = 0;
+ pNew->ctrlFlags |= UNIXFILE_DELETE;
}
- if( isDelete ) pNew->ctrlFlags |= UNIXFILE_DELETE;
#endif
if( rc!=SQLITE_OK ){
if( h>=0 ) robust_close(pNew, h, __LINE__);
}else{
pNew->pMethod = pLockingStyle;
OpenCounter(+1);
+ verifyDbFile(pNew);
}
return rc;
}
@@ -5278,8 +5797,13 @@ static int unixDelete(
int rc = SQLITE_OK;
UNUSED_PARAMETER(NotUsed);
SimulateIOError(return SQLITE_IOERR_DELETE);
- if( osUnlink(zPath)==(-1) && errno!=ENOENT ){
- return unixLogError(SQLITE_IOERR_DELETE, "unlink", zPath);
+ if( osUnlink(zPath)==(-1) ){
+ if( errno==ENOENT ){
+ rc = SQLITE_IOERR_DELETE_NOENT;
+ }else{
+ rc = unixLogError(SQLITE_IOERR_DELETE, "unlink", zPath);
+ }
+ return rc;
}
#ifndef SQLITE_DISABLE_DIRSYNC
if( (dirSync & 1)!=0 ){
@@ -5304,7 +5828,7 @@ static int unixDelete(
}
/*
-** Test the existance of or access permissions of file zPath. The
+** Test the existence of or access permissions of file zPath. The
** test performed depends on the value of flags:
**
** SQLITE_ACCESS_EXISTS: Return 1 if the file exists
@@ -6867,7 +7391,7 @@ int sqlite3_os_init(void){
/* Double-check that the aSyscall[] array has been constructed
** correctly. See ticket [bb3a86e890c8e96ab] */
- assert( ArraySize(aSyscall)==22 );
+ assert( ArraySize(aSyscall)==24 );
/* Register all VFSes defined in the aVfs[] array */
for(i=0; i<(sizeof(aVfs)/sizeof(sqlite3_vfs)); i++){