summaryrefslogtreecommitdiff
path: root/src/wal.c
diff options
context:
space:
mode:
authorHans-Christoph Steiner <hans@eds.org>2012-09-20 18:34:38 -0400
committerHans-Christoph Steiner <hans@eds.org>2012-09-20 18:34:38 -0400
commit487e15dc239ccdb3344d1c99ce120e872bab4a74 (patch)
treec986d492f6092ca7b4401d91515f74daed17fae2 /src/wal.c
parent7bb481fda9ecb134804b49c2ce77ca28f7eea583 (diff)
Imported Upstream version 2.0.6
Diffstat (limited to 'src/wal.c')
-rw-r--r--src/wal.c303
1 files changed, 217 insertions, 86 deletions
diff --git a/src/wal.c b/src/wal.c
index f2b3187..b077d27 100644
--- a/src/wal.c
+++ b/src/wal.c
@@ -414,13 +414,18 @@ struct Wal {
u32 iCallback; /* Value to pass to log callback (or 0) */
i64 mxWalSize; /* Truncate WAL to this size upon reset */
int nWiData; /* Size of array apWiData */
+ int szFirstBlock; /* Size of first block written to WAL file */
volatile u32 **apWiData; /* Pointer to wal-index content in memory */
u32 szPage; /* Database page size */
i16 readLock; /* Which read lock is being held. -1 for none */
+ u8 syncFlags; /* Flags to use to sync header writes */
u8 exclusiveMode; /* Non-zero if connection is in exclusive mode */
u8 writeLock; /* True if in a write transaction */
u8 ckptLock; /* True if holding a checkpoint lock */
u8 readOnly; /* WAL_RDWR, WAL_RDONLY, or WAL_SHM_RDONLY */
+ u8 truncateOnCommit; /* True to truncate WAL file on commit */
+ u8 syncHeader; /* Fsync the WAL header if true */
+ u8 padToSectorBoundary; /* Pad transactions out to the next sector */
WalIndexHdr hdr; /* Wal-index header for current transaction */
const char *zWalName; /* Name of WAL file */
u32 nCkpt; /* Checkpoint sequence counter in the wal-header */
@@ -1093,6 +1098,7 @@ static int walIndexRecover(Wal *pWal){
int szPage; /* Page size according to the log */
u32 magic; /* Magic value read from WAL header */
u32 version; /* Magic value read from WAL header */
+ int isValid; /* True if this frame is valid */
/* Read in the WAL header. */
rc = sqlite3OsRead(pWal->pWalFd, aBuf, WAL_HDRSIZE, 0);
@@ -1151,14 +1157,14 @@ static int walIndexRecover(Wal *pWal){
for(iOffset=WAL_HDRSIZE; (iOffset+szFrame)<=nSize; iOffset+=szFrame){
u32 pgno; /* Database page number for frame */
u32 nTruncate; /* dbsize field from frame header */
- int isValid; /* True if this frame is valid */
/* Read and decode the next log frame. */
+ iFrame++;
rc = sqlite3OsRead(pWal->pWalFd, aFrame, szFrame, iOffset);
if( rc!=SQLITE_OK ) break;
isValid = walDecodeFrame(pWal, &pgno, &nTruncate, aData, aFrame);
if( !isValid ) break;
- rc = walIndexAppend(pWal, ++iFrame, pgno);
+ rc = walIndexAppend(pWal, iFrame, pgno);
if( rc!=SQLITE_OK ) break;
/* If nTruncate is non-zero, this is a commit record. */
@@ -1281,6 +1287,8 @@ int sqlite3WalOpen(
pRet->readLock = -1;
pRet->mxWalSize = mxWalSize;
pRet->zWalName = zWalName;
+ pRet->syncHeader = 1;
+ pRet->padToSectorBoundary = 1;
pRet->exclusiveMode = (bNoShm ? WAL_HEAPMEMORY_MODE: WAL_NORMAL_MODE);
/* Open file handle on the write-ahead log file. */
@@ -1295,6 +1303,11 @@ int sqlite3WalOpen(
sqlite3OsClose(pRet->pWalFd);
sqlite3_free(pRet);
}else{
+ int iDC = sqlite3OsDeviceCharacteristics(pRet->pWalFd);
+ if( iDC & SQLITE_IOCAP_SEQUENTIAL ){ pRet->syncHeader = 0; }
+ if( iDC & SQLITE_IOCAP_POWERSAFE_OVERWRITE ){
+ pRet->padToSectorBoundary = 0;
+ }
*ppWal = pRet;
WALTRACE(("WAL%d: opened\n", pRet));
}
@@ -1714,7 +1727,7 @@ static int walCheckpoint(
i64 nReq = ((i64)mxPage * szPage);
rc = sqlite3OsFileSize(pWal->pDbFd, &nSize);
if( rc==SQLITE_OK && nSize<nReq ){
- sqlite3OsFileControl(pWal->pDbFd, SQLITE_FCNTL_SIZE_HINT, &nReq);
+ sqlite3OsFileControlHint(pWal->pDbFd, SQLITE_FCNTL_SIZE_HINT, &nReq);
}
}
@@ -1782,6 +1795,24 @@ static int walCheckpoint(
}
/*
+** If the WAL file is currently larger than nMax bytes in size, truncate
+** it to exactly nMax bytes. If an error occurs while doing so, ignore it.
+*/
+static void walLimitSize(Wal *pWal, i64 nMax){
+ i64 sz;
+ int rx;
+ sqlite3BeginBenignMalloc();
+ rx = sqlite3OsFileSize(pWal->pWalFd, &sz);
+ if( rx==SQLITE_OK && (sz > nMax ) ){
+ rx = sqlite3OsTruncate(pWal->pWalFd, nMax);
+ }
+ sqlite3EndBenignMalloc();
+ if( rx ){
+ sqlite3_log(rx, "cannot limit WAL size: %s", pWal->zWalName);
+ }
+}
+
+/*
** Close a connection to a log file.
*/
int sqlite3WalClose(
@@ -1804,23 +1835,40 @@ int sqlite3WalClose(
*/
rc = sqlite3OsLock(pWal->pDbFd, SQLITE_LOCK_EXCLUSIVE);
if( rc==SQLITE_OK ){
- int bPersistWal = -1;
if( pWal->exclusiveMode==WAL_NORMAL_MODE ){
pWal->exclusiveMode = WAL_EXCLUSIVE_MODE;
}
rc = sqlite3WalCheckpoint(
pWal, SQLITE_CHECKPOINT_PASSIVE, 0, 0, sync_flags, nBuf, zBuf, 0, 0
);
- sqlite3OsFileControl(pWal->pDbFd, SQLITE_FCNTL_PERSIST_WAL, &bPersistWal);
- if( rc==SQLITE_OK && bPersistWal!=1 ){
- isDelete = 1;
+ if( rc==SQLITE_OK ){
+ int bPersist = -1;
+ sqlite3OsFileControlHint(
+ pWal->pDbFd, SQLITE_FCNTL_PERSIST_WAL, &bPersist
+ );
+ if( bPersist!=1 ){
+ /* Try to delete the WAL file if the checkpoint completed and
+ ** fsyned (rc==SQLITE_OK) and if we are not in persistent-wal
+ ** mode (!bPersist) */
+ isDelete = 1;
+ }else if( pWal->mxWalSize>=0 ){
+ /* Try to truncate the WAL file to zero bytes if the checkpoint
+ ** completed and fsynced (rc==SQLITE_OK) and we are in persistent
+ ** WAL mode (bPersist) and if the PRAGMA journal_size_limit is a
+ ** non-negative value (pWal->mxWalSize>=0). Note that we truncate
+ ** to zero bytes as truncating to the journal_size_limit might
+ ** leave a corrupt WAL file on disk. */
+ walLimitSize(pWal, 0);
+ }
}
}
walIndexClose(pWal, isDelete);
sqlite3OsClose(pWal->pWalFd);
if( isDelete ){
+ sqlite3BeginBenignMalloc();
sqlite3OsDelete(pWal->pVfs, pWal->zWalName, 0);
+ sqlite3EndBenignMalloc();
}
WALTRACE(("WAL%p: closed\n", pWal));
sqlite3_free((void *)pWal->apWiData);
@@ -2310,7 +2358,7 @@ int sqlite3WalRead(
for(iKey=walHash(pgno); aHash[iKey]; iKey=walNextHash(iKey)){
u32 iFrame = aHash[iKey] + iZero;
if( iFrame<=iLast && aPgno[aHash[iKey]]==pgno ){
- assert( iFrame>iRead );
+ /* assert( iFrame>iRead ); -- not true if there is corruption */
iRead = iFrame;
}
if( (nCollide--)==0 ){
@@ -2349,7 +2397,7 @@ int sqlite3WalRead(
iOffset = walFrameOffset(iRead, sz) + WAL_FRAME_HDRSIZE;
*pInWal = 1;
/* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL */
- return sqlite3OsRead(pWal->pWalFd, pOut, nOut, iOffset);
+ return sqlite3OsRead(pWal->pWalFd, pOut, (nOut>sz ? sz : nOut), iOffset);
}
*pInWal = 0;
@@ -2422,6 +2470,7 @@ int sqlite3WalEndWriteTransaction(Wal *pWal){
if( pWal->writeLock ){
walUnlockExclusive(pWal, WAL_WRITE_LOCK, 1);
pWal->writeLock = 0;
+ pWal->truncateOnCommit = 0;
}
return SQLITE_OK;
}
@@ -2518,6 +2567,7 @@ int sqlite3WalSavepointUndo(Wal *pWal, u32 *aWalData){
return rc;
}
+
/*
** This function is called just before writing a set of frames to the log
** file (see sqlite3WalFrames()). It checks to see if, instead of appending
@@ -2555,23 +2605,6 @@ static int walRestartLog(Wal *pWal){
int i; /* Loop counter */
u32 *aSalt = pWal->hdr.aSalt; /* Big-endian salt values */
- /* Limit the size of WAL file if the journal_size_limit PRAGMA is
- ** set to a non-negative value. Log errors encountered
- ** during the truncation attempt. */
- if( pWal->mxWalSize>=0 ){
- i64 sz;
- int rx;
- sqlite3BeginBenignMalloc();
- rx = sqlite3OsFileSize(pWal->pWalFd, &sz);
- if( rx==SQLITE_OK && (sz > pWal->mxWalSize) ){
- rx = sqlite3OsTruncate(pWal->pWalFd, pWal->mxWalSize);
- }
- sqlite3EndBenignMalloc();
- if( rx ){
- sqlite3_log(rx, "cannot limit WAL size: %s", pWal->zWalName);
- }
- }
-
pWal->nCkpt++;
pWal->hdr.mxFrame = 0;
sqlite3Put4byte((u8*)&aSalt[0], 1 + sqlite3Get4byte((u8*)&aSalt[0]));
@@ -2600,6 +2633,74 @@ static int walRestartLog(Wal *pWal){
return rc;
}
+/*
+** Information about the current state of the WAL file and where
+** the next fsync should occur - passed from sqlite3WalFrames() into
+** walWriteToLog().
+*/
+typedef struct WalWriter {
+ Wal *pWal; /* The complete WAL information */
+ sqlite3_file *pFd; /* The WAL file to which we write */
+ sqlite3_int64 iSyncPoint; /* Fsync at this offset */
+ int syncFlags; /* Flags for the fsync */
+ int szPage; /* Size of one page */
+} WalWriter;
+
+/*
+** Write iAmt bytes of content into the WAL file beginning at iOffset.
+** Do a sync when crossing the p->iSyncPoint boundary.
+**
+** In other words, if iSyncPoint is in between iOffset and iOffset+iAmt,
+** first write the part before iSyncPoint, then sync, then write the
+** rest.
+*/
+static int walWriteToLog(
+ WalWriter *p, /* WAL to write to */
+ void *pContent, /* Content to be written */
+ int iAmt, /* Number of bytes to write */
+ sqlite3_int64 iOffset /* Start writing at this offset */
+){
+ int rc;
+ if( iOffset<p->iSyncPoint && iOffset+iAmt>=p->iSyncPoint ){
+ int iFirstAmt = (int)(p->iSyncPoint - iOffset);
+ rc = sqlite3OsWrite(p->pFd, pContent, iFirstAmt, iOffset);
+ if( rc ) return rc;
+ iOffset += iFirstAmt;
+ iAmt -= iFirstAmt;
+ pContent = (void*)(iFirstAmt + (char*)pContent);
+ assert( p->syncFlags & (SQLITE_SYNC_NORMAL|SQLITE_SYNC_FULL) );
+ rc = sqlite3OsSync(p->pFd, p->syncFlags);
+ if( iAmt==0 || rc ) return rc;
+ }
+ rc = sqlite3OsWrite(p->pFd, pContent, iAmt, iOffset);
+ return rc;
+}
+
+/*
+** Write out a single frame of the WAL
+*/
+static int walWriteOneFrame(
+ WalWriter *p, /* Where to write the frame */
+ PgHdr *pPage, /* The page of the frame to be written */
+ int nTruncate, /* The commit flag. Usually 0. >0 for commit */
+ sqlite3_int64 iOffset /* Byte offset at which to write */
+){
+ int rc; /* Result code from subfunctions */
+ void *pData; /* Data actually written */
+ u8 aFrame[WAL_FRAME_HDRSIZE]; /* Buffer to assemble frame-header in */
+#if defined(SQLITE_HAS_CODEC)
+ if( (pData = sqlite3PagerCodec(pPage))==0 ) return SQLITE_NOMEM;
+#else
+ pData = pPage->pData;
+#endif
+ walEncodeFrame(p->pWal, pPage->pgno, nTruncate, pData, aFrame);
+ rc = walWriteToLog(p, aFrame, sizeof(aFrame), iOffset);
+ if( rc ) return rc;
+ /* Write the page data */
+ rc = walWriteToLog(p, pData, p->szPage, iOffset+sizeof(aFrame));
+ return rc;
+}
+
/*
** Write a set of frames to the log. The caller must hold the write-lock
** on the log file (obtained using sqlite3WalBeginWriteTransaction()).
@@ -2614,14 +2715,20 @@ int sqlite3WalFrames(
){
int rc; /* Used to catch return codes */
u32 iFrame; /* Next frame address */
- u8 aFrame[WAL_FRAME_HDRSIZE]; /* Buffer to assemble frame-header in */
PgHdr *p; /* Iterator to run through pList with. */
PgHdr *pLast = 0; /* Last frame in list */
- int nLast = 0; /* Number of extra copies of last page */
+ int nExtra = 0; /* Number of extra copies of last page */
+ int szFrame; /* The size of a single frame */
+ i64 iOffset; /* Next byte to write in WAL file */
+ WalWriter w; /* The writer */
assert( pList );
assert( pWal->writeLock );
+ /* If this frame set completes a transaction, then nTruncate>0. If
+ ** nTruncate==0 then this frame set does not complete the transaction. */
+ assert( (isCommit!=0)==(nTruncate!=0) );
+
#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG)
{ int cnt; for(cnt=0, p=pList; p; p=p->pDirty, cnt++){}
WALTRACE(("WAL%p: frame write begin. %d frames. mxFrame=%d. %s\n",
@@ -2649,7 +2756,7 @@ int sqlite3WalFrames(
sqlite3Put4byte(&aWalHdr[4], WAL_MAX_VERSION);
sqlite3Put4byte(&aWalHdr[8], szPage);
sqlite3Put4byte(&aWalHdr[12], pWal->nCkpt);
- sqlite3_randomness(8, pWal->hdr.aSalt);
+ if( pWal->nCkpt==0 ) sqlite3_randomness(8, pWal->hdr.aSalt);
memcpy(&aWalHdr[16], pWal->hdr.aSalt, 8);
walChecksumBytes(1, aWalHdr, WAL_HDRSIZE-2*4, 0, aCksum);
sqlite3Put4byte(&aWalHdr[24], aCksum[0]);
@@ -2659,77 +2766,89 @@ int sqlite3WalFrames(
pWal->hdr.bigEndCksum = SQLITE_BIGENDIAN;
pWal->hdr.aFrameCksum[0] = aCksum[0];
pWal->hdr.aFrameCksum[1] = aCksum[1];
+ pWal->truncateOnCommit = 1;
rc = sqlite3OsWrite(pWal->pWalFd, aWalHdr, sizeof(aWalHdr), 0);
WALTRACE(("WAL%p: wal-header write %s\n", pWal, rc ? "failed" : "ok"));
if( rc!=SQLITE_OK ){
return rc;
}
+
+ /* Sync the header (unless SQLITE_IOCAP_SEQUENTIAL is true or unless
+ ** all syncing is turned off by PRAGMA synchronous=OFF). Otherwise
+ ** an out-of-order write following a WAL restart could result in
+ ** database corruption. See the ticket:
+ **
+ ** http://localhost:591/sqlite/info/ff5be73dee
+ */
+ if( pWal->syncHeader && sync_flags ){
+ rc = sqlite3OsSync(pWal->pWalFd, sync_flags & SQLITE_SYNC_MASK);
+ if( rc ) return rc;
+ }
}
assert( (int)pWal->szPage==szPage );
- /* Write the log file. */
- for(p=pList; p; p=p->pDirty){
- u32 nDbsize; /* Db-size field for frame header */
- i64 iOffset; /* Write offset in log file */
- void *pData;
-
- iOffset = walFrameOffset(++iFrame, szPage);
- /* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL */
-
- /* Populate and write the frame header */
- nDbsize = (isCommit && p->pDirty==0) ? nTruncate : 0;
-#if defined(SQLITE_HAS_CODEC)
- if( (pData = sqlite3PagerCodec(p))==0 ) return SQLITE_NOMEM;
-#else
- pData = p->pData;
-#endif
- walEncodeFrame(pWal, p->pgno, nDbsize, pData, aFrame);
- rc = sqlite3OsWrite(pWal->pWalFd, aFrame, sizeof(aFrame), iOffset);
- if( rc!=SQLITE_OK ){
- return rc;
- }
+ /* Setup information needed to write frames into the WAL */
+ w.pWal = pWal;
+ w.pFd = pWal->pWalFd;
+ w.iSyncPoint = 0;
+ w.syncFlags = sync_flags;
+ w.szPage = szPage;
+ iOffset = walFrameOffset(iFrame+1, szPage);
+ szFrame = szPage + WAL_FRAME_HDRSIZE;
- /* Write the page data */
- rc = sqlite3OsWrite(pWal->pWalFd, pData, szPage, iOffset+sizeof(aFrame));
- if( rc!=SQLITE_OK ){
- return rc;
- }
+ /* Write all frames into the log file exactly once */
+ for(p=pList; p; p=p->pDirty){
+ int nDbSize; /* 0 normally. Positive == commit flag */
+ iFrame++;
+ assert( iOffset==walFrameOffset(iFrame, szPage) );
+ nDbSize = (isCommit && p->pDirty==0) ? nTruncate : 0;
+ rc = walWriteOneFrame(&w, p, nDbSize, iOffset);
+ if( rc ) return rc;
pLast = p;
+ iOffset += szFrame;
}
- /* Sync the log file if the 'isSync' flag was specified. */
- if( sync_flags ){
- i64 iSegment = sqlite3OsSectorSize(pWal->pWalFd);
- i64 iOffset = walFrameOffset(iFrame+1, szPage);
-
- assert( isCommit );
- assert( iSegment>0 );
-
- iSegment = (((iOffset+iSegment-1)/iSegment) * iSegment);
- while( iOffset<iSegment ){
- void *pData;
-#if defined(SQLITE_HAS_CODEC)
- if( (pData = sqlite3PagerCodec(pLast))==0 ) return SQLITE_NOMEM;
-#else
- pData = pLast->pData;
-#endif
- walEncodeFrame(pWal, pLast->pgno, nTruncate, pData, aFrame);
- /* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL */
- rc = sqlite3OsWrite(pWal->pWalFd, aFrame, sizeof(aFrame), iOffset);
- if( rc!=SQLITE_OK ){
- return rc;
- }
- iOffset += WAL_FRAME_HDRSIZE;
- rc = sqlite3OsWrite(pWal->pWalFd, pData, szPage, iOffset);
- if( rc!=SQLITE_OK ){
- return rc;
+ /* If this is the end of a transaction, then we might need to pad
+ ** the transaction and/or sync the WAL file.
+ **
+ ** Padding and syncing only occur if this set of frames complete a
+ ** transaction and if PRAGMA synchronous=FULL. If synchronous==NORMAL
+ ** or synchonous==OFF, then no padding or syncing are needed.
+ **
+ ** If SQLITE_IOCAP_POWERSAFE_OVERWRITE is defined, then padding is not
+ ** needed and only the sync is done. If padding is needed, then the
+ ** final frame is repeated (with its commit mark) until the next sector
+ ** boundary is crossed. Only the part of the WAL prior to the last
+ ** sector boundary is synced; the part of the last frame that extends
+ ** past the sector boundary is written after the sync.
+ */
+ if( isCommit && (sync_flags & WAL_SYNC_TRANSACTIONS)!=0 ){
+ if( pWal->padToSectorBoundary ){
+ int sectorSize = sqlite3OsSectorSize(pWal->pWalFd);
+ w.iSyncPoint = ((iOffset+sectorSize-1)/sectorSize)*sectorSize;
+ while( iOffset<w.iSyncPoint ){
+ rc = walWriteOneFrame(&w, pLast, nTruncate, iOffset);
+ if( rc ) return rc;
+ iOffset += szFrame;
+ nExtra++;
}
- nLast++;
- iOffset += szPage;
+ }else{
+ rc = sqlite3OsSync(w.pFd, sync_flags & SQLITE_SYNC_MASK);
}
+ }
- rc = sqlite3OsSync(pWal->pWalFd, sync_flags);
+ /* If this frame set completes the first transaction in the WAL and
+ ** if PRAGMA journal_size_limit is set, then truncate the WAL to the
+ ** journal size limit, if possible.
+ */
+ if( isCommit && pWal->truncateOnCommit && pWal->mxWalSize>=0 ){
+ i64 sz = pWal->mxWalSize;
+ if( walFrameOffset(iFrame+nExtra+1, szPage)>pWal->mxWalSize ){
+ sz = walFrameOffset(iFrame+nExtra+1, szPage);
+ }
+ walLimitSize(pWal, sz);
+ pWal->truncateOnCommit = 0;
}
/* Append data to the wal-index. It is not necessary to lock the
@@ -2742,9 +2861,9 @@ int sqlite3WalFrames(
iFrame++;
rc = walIndexAppend(pWal, iFrame, p->pgno);
}
- while( nLast>0 && rc==SQLITE_OK ){
+ while( rc==SQLITE_OK && nExtra>0 ){
iFrame++;
- nLast--;
+ nExtra--;
rc = walIndexAppend(pWal, iFrame, pLast->pgno);
}
@@ -2949,4 +3068,16 @@ int sqlite3WalHeapMemory(Wal *pWal){
return (pWal && pWal->exclusiveMode==WAL_HEAPMEMORY_MODE );
}
+#ifdef SQLITE_ENABLE_ZIPVFS
+/*
+** If the argument is not NULL, it points to a Wal object that holds a
+** read-lock. This function returns the database page-size if it is known,
+** or zero if it is not (or if pWal is NULL).
+*/
+int sqlite3WalFramesize(Wal *pWal){
+ assert( pWal==0 || pWal->readLock>=0 );
+ return (pWal ? pWal->szPage : 0);
+}
+#endif
+
#endif /* #ifndef SQLITE_OMIT_WAL */