diff options
author | Hans-Christoph Steiner <hans@eds.org> | 2012-09-20 18:34:42 -0400 |
---|---|---|
committer | Hans-Christoph Steiner <hans@eds.org> | 2012-09-20 18:34:42 -0400 |
commit | 734b4f890763e4efafe865ba476c43cc8d1a2214 (patch) | |
tree | d561d2fad0788619f4b8e230073f6af1d416934e /src/wal.c | |
parent | 396b08286e7bb56e0e6440aaf1345c18e72ee22e (diff) | |
parent | 487e15dc239ccdb3344d1c99ce120e872bab4a74 (diff) |
Merge tag 'upstream/2.0.6'
Upstream version 2.0.6
Diffstat (limited to 'src/wal.c')
-rw-r--r-- | src/wal.c | 303 |
1 files changed, 217 insertions, 86 deletions
@@ -414,13 +414,18 @@ struct Wal { u32 iCallback; /* Value to pass to log callback (or 0) */ i64 mxWalSize; /* Truncate WAL to this size upon reset */ int nWiData; /* Size of array apWiData */ + int szFirstBlock; /* Size of first block written to WAL file */ volatile u32 **apWiData; /* Pointer to wal-index content in memory */ u32 szPage; /* Database page size */ i16 readLock; /* Which read lock is being held. -1 for none */ + u8 syncFlags; /* Flags to use to sync header writes */ u8 exclusiveMode; /* Non-zero if connection is in exclusive mode */ u8 writeLock; /* True if in a write transaction */ u8 ckptLock; /* True if holding a checkpoint lock */ u8 readOnly; /* WAL_RDWR, WAL_RDONLY, or WAL_SHM_RDONLY */ + u8 truncateOnCommit; /* True to truncate WAL file on commit */ + u8 syncHeader; /* Fsync the WAL header if true */ + u8 padToSectorBoundary; /* Pad transactions out to the next sector */ WalIndexHdr hdr; /* Wal-index header for current transaction */ const char *zWalName; /* Name of WAL file */ u32 nCkpt; /* Checkpoint sequence counter in the wal-header */ @@ -1093,6 +1098,7 @@ static int walIndexRecover(Wal *pWal){ int szPage; /* Page size according to the log */ u32 magic; /* Magic value read from WAL header */ u32 version; /* Magic value read from WAL header */ + int isValid; /* True if this frame is valid */ /* Read in the WAL header. */ rc = sqlite3OsRead(pWal->pWalFd, aBuf, WAL_HDRSIZE, 0); @@ -1151,14 +1157,14 @@ static int walIndexRecover(Wal *pWal){ for(iOffset=WAL_HDRSIZE; (iOffset+szFrame)<=nSize; iOffset+=szFrame){ u32 pgno; /* Database page number for frame */ u32 nTruncate; /* dbsize field from frame header */ - int isValid; /* True if this frame is valid */ /* Read and decode the next log frame. */ + iFrame++; rc = sqlite3OsRead(pWal->pWalFd, aFrame, szFrame, iOffset); if( rc!=SQLITE_OK ) break; isValid = walDecodeFrame(pWal, &pgno, &nTruncate, aData, aFrame); if( !isValid ) break; - rc = walIndexAppend(pWal, ++iFrame, pgno); + rc = walIndexAppend(pWal, iFrame, pgno); if( rc!=SQLITE_OK ) break; /* If nTruncate is non-zero, this is a commit record. */ @@ -1281,6 +1287,8 @@ int sqlite3WalOpen( pRet->readLock = -1; pRet->mxWalSize = mxWalSize; pRet->zWalName = zWalName; + pRet->syncHeader = 1; + pRet->padToSectorBoundary = 1; pRet->exclusiveMode = (bNoShm ? WAL_HEAPMEMORY_MODE: WAL_NORMAL_MODE); /* Open file handle on the write-ahead log file. */ @@ -1295,6 +1303,11 @@ int sqlite3WalOpen( sqlite3OsClose(pRet->pWalFd); sqlite3_free(pRet); }else{ + int iDC = sqlite3OsDeviceCharacteristics(pRet->pWalFd); + if( iDC & SQLITE_IOCAP_SEQUENTIAL ){ pRet->syncHeader = 0; } + if( iDC & SQLITE_IOCAP_POWERSAFE_OVERWRITE ){ + pRet->padToSectorBoundary = 0; + } *ppWal = pRet; WALTRACE(("WAL%d: opened\n", pRet)); } @@ -1714,7 +1727,7 @@ static int walCheckpoint( i64 nReq = ((i64)mxPage * szPage); rc = sqlite3OsFileSize(pWal->pDbFd, &nSize); if( rc==SQLITE_OK && nSize<nReq ){ - sqlite3OsFileControl(pWal->pDbFd, SQLITE_FCNTL_SIZE_HINT, &nReq); + sqlite3OsFileControlHint(pWal->pDbFd, SQLITE_FCNTL_SIZE_HINT, &nReq); } } @@ -1782,6 +1795,24 @@ static int walCheckpoint( } /* +** If the WAL file is currently larger than nMax bytes in size, truncate +** it to exactly nMax bytes. If an error occurs while doing so, ignore it. +*/ +static void walLimitSize(Wal *pWal, i64 nMax){ + i64 sz; + int rx; + sqlite3BeginBenignMalloc(); + rx = sqlite3OsFileSize(pWal->pWalFd, &sz); + if( rx==SQLITE_OK && (sz > nMax ) ){ + rx = sqlite3OsTruncate(pWal->pWalFd, nMax); + } + sqlite3EndBenignMalloc(); + if( rx ){ + sqlite3_log(rx, "cannot limit WAL size: %s", pWal->zWalName); + } +} + +/* ** Close a connection to a log file. */ int sqlite3WalClose( @@ -1804,23 +1835,40 @@ int sqlite3WalClose( */ rc = sqlite3OsLock(pWal->pDbFd, SQLITE_LOCK_EXCLUSIVE); if( rc==SQLITE_OK ){ - int bPersistWal = -1; if( pWal->exclusiveMode==WAL_NORMAL_MODE ){ pWal->exclusiveMode = WAL_EXCLUSIVE_MODE; } rc = sqlite3WalCheckpoint( pWal, SQLITE_CHECKPOINT_PASSIVE, 0, 0, sync_flags, nBuf, zBuf, 0, 0 ); - sqlite3OsFileControl(pWal->pDbFd, SQLITE_FCNTL_PERSIST_WAL, &bPersistWal); - if( rc==SQLITE_OK && bPersistWal!=1 ){ - isDelete = 1; + if( rc==SQLITE_OK ){ + int bPersist = -1; + sqlite3OsFileControlHint( + pWal->pDbFd, SQLITE_FCNTL_PERSIST_WAL, &bPersist + ); + if( bPersist!=1 ){ + /* Try to delete the WAL file if the checkpoint completed and + ** fsyned (rc==SQLITE_OK) and if we are not in persistent-wal + ** mode (!bPersist) */ + isDelete = 1; + }else if( pWal->mxWalSize>=0 ){ + /* Try to truncate the WAL file to zero bytes if the checkpoint + ** completed and fsynced (rc==SQLITE_OK) and we are in persistent + ** WAL mode (bPersist) and if the PRAGMA journal_size_limit is a + ** non-negative value (pWal->mxWalSize>=0). Note that we truncate + ** to zero bytes as truncating to the journal_size_limit might + ** leave a corrupt WAL file on disk. */ + walLimitSize(pWal, 0); + } } } walIndexClose(pWal, isDelete); sqlite3OsClose(pWal->pWalFd); if( isDelete ){ + sqlite3BeginBenignMalloc(); sqlite3OsDelete(pWal->pVfs, pWal->zWalName, 0); + sqlite3EndBenignMalloc(); } WALTRACE(("WAL%p: closed\n", pWal)); sqlite3_free((void *)pWal->apWiData); @@ -2310,7 +2358,7 @@ int sqlite3WalRead( for(iKey=walHash(pgno); aHash[iKey]; iKey=walNextHash(iKey)){ u32 iFrame = aHash[iKey] + iZero; if( iFrame<=iLast && aPgno[aHash[iKey]]==pgno ){ - assert( iFrame>iRead ); + /* assert( iFrame>iRead ); -- not true if there is corruption */ iRead = iFrame; } if( (nCollide--)==0 ){ @@ -2349,7 +2397,7 @@ int sqlite3WalRead( iOffset = walFrameOffset(iRead, sz) + WAL_FRAME_HDRSIZE; *pInWal = 1; /* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL */ - return sqlite3OsRead(pWal->pWalFd, pOut, nOut, iOffset); + return sqlite3OsRead(pWal->pWalFd, pOut, (nOut>sz ? sz : nOut), iOffset); } *pInWal = 0; @@ -2422,6 +2470,7 @@ int sqlite3WalEndWriteTransaction(Wal *pWal){ if( pWal->writeLock ){ walUnlockExclusive(pWal, WAL_WRITE_LOCK, 1); pWal->writeLock = 0; + pWal->truncateOnCommit = 0; } return SQLITE_OK; } @@ -2518,6 +2567,7 @@ int sqlite3WalSavepointUndo(Wal *pWal, u32 *aWalData){ return rc; } + /* ** This function is called just before writing a set of frames to the log ** file (see sqlite3WalFrames()). It checks to see if, instead of appending @@ -2555,23 +2605,6 @@ static int walRestartLog(Wal *pWal){ int i; /* Loop counter */ u32 *aSalt = pWal->hdr.aSalt; /* Big-endian salt values */ - /* Limit the size of WAL file if the journal_size_limit PRAGMA is - ** set to a non-negative value. Log errors encountered - ** during the truncation attempt. */ - if( pWal->mxWalSize>=0 ){ - i64 sz; - int rx; - sqlite3BeginBenignMalloc(); - rx = sqlite3OsFileSize(pWal->pWalFd, &sz); - if( rx==SQLITE_OK && (sz > pWal->mxWalSize) ){ - rx = sqlite3OsTruncate(pWal->pWalFd, pWal->mxWalSize); - } - sqlite3EndBenignMalloc(); - if( rx ){ - sqlite3_log(rx, "cannot limit WAL size: %s", pWal->zWalName); - } - } - pWal->nCkpt++; pWal->hdr.mxFrame = 0; sqlite3Put4byte((u8*)&aSalt[0], 1 + sqlite3Get4byte((u8*)&aSalt[0])); @@ -2600,6 +2633,74 @@ static int walRestartLog(Wal *pWal){ return rc; } +/* +** Information about the current state of the WAL file and where +** the next fsync should occur - passed from sqlite3WalFrames() into +** walWriteToLog(). +*/ +typedef struct WalWriter { + Wal *pWal; /* The complete WAL information */ + sqlite3_file *pFd; /* The WAL file to which we write */ + sqlite3_int64 iSyncPoint; /* Fsync at this offset */ + int syncFlags; /* Flags for the fsync */ + int szPage; /* Size of one page */ +} WalWriter; + +/* +** Write iAmt bytes of content into the WAL file beginning at iOffset. +** Do a sync when crossing the p->iSyncPoint boundary. +** +** In other words, if iSyncPoint is in between iOffset and iOffset+iAmt, +** first write the part before iSyncPoint, then sync, then write the +** rest. +*/ +static int walWriteToLog( + WalWriter *p, /* WAL to write to */ + void *pContent, /* Content to be written */ + int iAmt, /* Number of bytes to write */ + sqlite3_int64 iOffset /* Start writing at this offset */ +){ + int rc; + if( iOffset<p->iSyncPoint && iOffset+iAmt>=p->iSyncPoint ){ + int iFirstAmt = (int)(p->iSyncPoint - iOffset); + rc = sqlite3OsWrite(p->pFd, pContent, iFirstAmt, iOffset); + if( rc ) return rc; + iOffset += iFirstAmt; + iAmt -= iFirstAmt; + pContent = (void*)(iFirstAmt + (char*)pContent); + assert( p->syncFlags & (SQLITE_SYNC_NORMAL|SQLITE_SYNC_FULL) ); + rc = sqlite3OsSync(p->pFd, p->syncFlags); + if( iAmt==0 || rc ) return rc; + } + rc = sqlite3OsWrite(p->pFd, pContent, iAmt, iOffset); + return rc; +} + +/* +** Write out a single frame of the WAL +*/ +static int walWriteOneFrame( + WalWriter *p, /* Where to write the frame */ + PgHdr *pPage, /* The page of the frame to be written */ + int nTruncate, /* The commit flag. Usually 0. >0 for commit */ + sqlite3_int64 iOffset /* Byte offset at which to write */ +){ + int rc; /* Result code from subfunctions */ + void *pData; /* Data actually written */ + u8 aFrame[WAL_FRAME_HDRSIZE]; /* Buffer to assemble frame-header in */ +#if defined(SQLITE_HAS_CODEC) + if( (pData = sqlite3PagerCodec(pPage))==0 ) return SQLITE_NOMEM; +#else + pData = pPage->pData; +#endif + walEncodeFrame(p->pWal, pPage->pgno, nTruncate, pData, aFrame); + rc = walWriteToLog(p, aFrame, sizeof(aFrame), iOffset); + if( rc ) return rc; + /* Write the page data */ + rc = walWriteToLog(p, pData, p->szPage, iOffset+sizeof(aFrame)); + return rc; +} + /* ** Write a set of frames to the log. The caller must hold the write-lock ** on the log file (obtained using sqlite3WalBeginWriteTransaction()). @@ -2614,14 +2715,20 @@ int sqlite3WalFrames( ){ int rc; /* Used to catch return codes */ u32 iFrame; /* Next frame address */ - u8 aFrame[WAL_FRAME_HDRSIZE]; /* Buffer to assemble frame-header in */ PgHdr *p; /* Iterator to run through pList with. */ PgHdr *pLast = 0; /* Last frame in list */ - int nLast = 0; /* Number of extra copies of last page */ + int nExtra = 0; /* Number of extra copies of last page */ + int szFrame; /* The size of a single frame */ + i64 iOffset; /* Next byte to write in WAL file */ + WalWriter w; /* The writer */ assert( pList ); assert( pWal->writeLock ); + /* If this frame set completes a transaction, then nTruncate>0. If + ** nTruncate==0 then this frame set does not complete the transaction. */ + assert( (isCommit!=0)==(nTruncate!=0) ); + #if defined(SQLITE_TEST) && defined(SQLITE_DEBUG) { int cnt; for(cnt=0, p=pList; p; p=p->pDirty, cnt++){} WALTRACE(("WAL%p: frame write begin. %d frames. mxFrame=%d. %s\n", @@ -2649,7 +2756,7 @@ int sqlite3WalFrames( sqlite3Put4byte(&aWalHdr[4], WAL_MAX_VERSION); sqlite3Put4byte(&aWalHdr[8], szPage); sqlite3Put4byte(&aWalHdr[12], pWal->nCkpt); - sqlite3_randomness(8, pWal->hdr.aSalt); + if( pWal->nCkpt==0 ) sqlite3_randomness(8, pWal->hdr.aSalt); memcpy(&aWalHdr[16], pWal->hdr.aSalt, 8); walChecksumBytes(1, aWalHdr, WAL_HDRSIZE-2*4, 0, aCksum); sqlite3Put4byte(&aWalHdr[24], aCksum[0]); @@ -2659,77 +2766,89 @@ int sqlite3WalFrames( pWal->hdr.bigEndCksum = SQLITE_BIGENDIAN; pWal->hdr.aFrameCksum[0] = aCksum[0]; pWal->hdr.aFrameCksum[1] = aCksum[1]; + pWal->truncateOnCommit = 1; rc = sqlite3OsWrite(pWal->pWalFd, aWalHdr, sizeof(aWalHdr), 0); WALTRACE(("WAL%p: wal-header write %s\n", pWal, rc ? "failed" : "ok")); if( rc!=SQLITE_OK ){ return rc; } + + /* Sync the header (unless SQLITE_IOCAP_SEQUENTIAL is true or unless + ** all syncing is turned off by PRAGMA synchronous=OFF). Otherwise + ** an out-of-order write following a WAL restart could result in + ** database corruption. See the ticket: + ** + ** http://localhost:591/sqlite/info/ff5be73dee + */ + if( pWal->syncHeader && sync_flags ){ + rc = sqlite3OsSync(pWal->pWalFd, sync_flags & SQLITE_SYNC_MASK); + if( rc ) return rc; + } } assert( (int)pWal->szPage==szPage ); - /* Write the log file. */ - for(p=pList; p; p=p->pDirty){ - u32 nDbsize; /* Db-size field for frame header */ - i64 iOffset; /* Write offset in log file */ - void *pData; - - iOffset = walFrameOffset(++iFrame, szPage); - /* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL */ - - /* Populate and write the frame header */ - nDbsize = (isCommit && p->pDirty==0) ? nTruncate : 0; -#if defined(SQLITE_HAS_CODEC) - if( (pData = sqlite3PagerCodec(p))==0 ) return SQLITE_NOMEM; -#else - pData = p->pData; -#endif - walEncodeFrame(pWal, p->pgno, nDbsize, pData, aFrame); - rc = sqlite3OsWrite(pWal->pWalFd, aFrame, sizeof(aFrame), iOffset); - if( rc!=SQLITE_OK ){ - return rc; - } + /* Setup information needed to write frames into the WAL */ + w.pWal = pWal; + w.pFd = pWal->pWalFd; + w.iSyncPoint = 0; + w.syncFlags = sync_flags; + w.szPage = szPage; + iOffset = walFrameOffset(iFrame+1, szPage); + szFrame = szPage + WAL_FRAME_HDRSIZE; - /* Write the page data */ - rc = sqlite3OsWrite(pWal->pWalFd, pData, szPage, iOffset+sizeof(aFrame)); - if( rc!=SQLITE_OK ){ - return rc; - } + /* Write all frames into the log file exactly once */ + for(p=pList; p; p=p->pDirty){ + int nDbSize; /* 0 normally. Positive == commit flag */ + iFrame++; + assert( iOffset==walFrameOffset(iFrame, szPage) ); + nDbSize = (isCommit && p->pDirty==0) ? nTruncate : 0; + rc = walWriteOneFrame(&w, p, nDbSize, iOffset); + if( rc ) return rc; pLast = p; + iOffset += szFrame; } - /* Sync the log file if the 'isSync' flag was specified. */ - if( sync_flags ){ - i64 iSegment = sqlite3OsSectorSize(pWal->pWalFd); - i64 iOffset = walFrameOffset(iFrame+1, szPage); - - assert( isCommit ); - assert( iSegment>0 ); - - iSegment = (((iOffset+iSegment-1)/iSegment) * iSegment); - while( iOffset<iSegment ){ - void *pData; -#if defined(SQLITE_HAS_CODEC) - if( (pData = sqlite3PagerCodec(pLast))==0 ) return SQLITE_NOMEM; -#else - pData = pLast->pData; -#endif - walEncodeFrame(pWal, pLast->pgno, nTruncate, pData, aFrame); - /* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL */ - rc = sqlite3OsWrite(pWal->pWalFd, aFrame, sizeof(aFrame), iOffset); - if( rc!=SQLITE_OK ){ - return rc; - } - iOffset += WAL_FRAME_HDRSIZE; - rc = sqlite3OsWrite(pWal->pWalFd, pData, szPage, iOffset); - if( rc!=SQLITE_OK ){ - return rc; + /* If this is the end of a transaction, then we might need to pad + ** the transaction and/or sync the WAL file. + ** + ** Padding and syncing only occur if this set of frames complete a + ** transaction and if PRAGMA synchronous=FULL. If synchronous==NORMAL + ** or synchonous==OFF, then no padding or syncing are needed. + ** + ** If SQLITE_IOCAP_POWERSAFE_OVERWRITE is defined, then padding is not + ** needed and only the sync is done. If padding is needed, then the + ** final frame is repeated (with its commit mark) until the next sector + ** boundary is crossed. Only the part of the WAL prior to the last + ** sector boundary is synced; the part of the last frame that extends + ** past the sector boundary is written after the sync. + */ + if( isCommit && (sync_flags & WAL_SYNC_TRANSACTIONS)!=0 ){ + if( pWal->padToSectorBoundary ){ + int sectorSize = sqlite3OsSectorSize(pWal->pWalFd); + w.iSyncPoint = ((iOffset+sectorSize-1)/sectorSize)*sectorSize; + while( iOffset<w.iSyncPoint ){ + rc = walWriteOneFrame(&w, pLast, nTruncate, iOffset); + if( rc ) return rc; + iOffset += szFrame; + nExtra++; } - nLast++; - iOffset += szPage; + }else{ + rc = sqlite3OsSync(w.pFd, sync_flags & SQLITE_SYNC_MASK); } + } - rc = sqlite3OsSync(pWal->pWalFd, sync_flags); + /* If this frame set completes the first transaction in the WAL and + ** if PRAGMA journal_size_limit is set, then truncate the WAL to the + ** journal size limit, if possible. + */ + if( isCommit && pWal->truncateOnCommit && pWal->mxWalSize>=0 ){ + i64 sz = pWal->mxWalSize; + if( walFrameOffset(iFrame+nExtra+1, szPage)>pWal->mxWalSize ){ + sz = walFrameOffset(iFrame+nExtra+1, szPage); + } + walLimitSize(pWal, sz); + pWal->truncateOnCommit = 0; } /* Append data to the wal-index. It is not necessary to lock the @@ -2742,9 +2861,9 @@ int sqlite3WalFrames( iFrame++; rc = walIndexAppend(pWal, iFrame, p->pgno); } - while( nLast>0 && rc==SQLITE_OK ){ + while( rc==SQLITE_OK && nExtra>0 ){ iFrame++; - nLast--; + nExtra--; rc = walIndexAppend(pWal, iFrame, pLast->pgno); } @@ -2949,4 +3068,16 @@ int sqlite3WalHeapMemory(Wal *pWal){ return (pWal && pWal->exclusiveMode==WAL_HEAPMEMORY_MODE ); } +#ifdef SQLITE_ENABLE_ZIPVFS +/* +** If the argument is not NULL, it points to a Wal object that holds a +** read-lock. This function returns the database page-size if it is known, +** or zero if it is not (or if pWal is NULL). +*/ +int sqlite3WalFramesize(Wal *pWal){ + assert( pWal==0 || pWal->readLock>=0 ); + return (pWal ? pWal->szPage : 0); +} +#endif + #endif /* #ifndef SQLITE_OMIT_WAL */ |