diff --git a/contrib/pg_upgrade/controldata.c b/contrib/pg_upgrade/controldata.c index 0d9a64c..f192081 100644 --- a/contrib/pg_upgrade/controldata.c +++ b/contrib/pg_upgrade/controldata.c @@ -56,6 +56,7 @@ get_control_data(ClusterInfo *cluster, bool live_check) bool got_toast = false; bool got_date_is_int = false; bool got_float8_pass_by_value = false; + bool got_data_checksums = false; char *lc_collate = NULL; char *lc_ctype = NULL; char *lc_monetary = NULL; @@ -131,6 +132,13 @@ get_control_data(ClusterInfo *cluster, bool live_check) got_float8_pass_by_value = true; } + /* Only in <= 9.2 */ + if (GET_MAJOR_VERSION(cluster->major_version) <= 902) + { + cluster->controldata.data_checksums = false; + got_data_checksums = true; + } + /* we have the result of cmd in "output". so parse it line by line now */ while (fgets(bufin, sizeof(bufin), output)) { @@ -393,6 +401,18 @@ get_control_data(ClusterInfo *cluster, bool live_check) cluster->controldata.float8_pass_by_value = strstr(p, "by value") != NULL; got_float8_pass_by_value = true; } + else if ((p = strstr(bufin, "checksums")) != NULL) + { + p = strchr(p, ':'); + + if (p == NULL || strlen(p) <= 1) + pg_log(PG_FATAL, "%d: controldata retrieval problem\n", __LINE__); + + p++; /* removing ':' char */ + /* used later for contrib check */ + cluster->controldata.data_checksums = strstr(p, "enabled") != NULL; + got_data_checksums = true; + } /* In pre-8.4 only */ else if ((p = strstr(bufin, "LC_COLLATE:")) != NULL) { @@ -475,7 +495,7 @@ get_control_data(ClusterInfo *cluster, bool live_check) !got_tli || !got_align || !got_blocksz || !got_largesz || !got_walsz || !got_walseg || !got_ident || !got_index || !got_toast || - !got_date_is_int || !got_float8_pass_by_value) + !got_date_is_int || !got_float8_pass_by_value || !got_data_checksums) { pg_log(PG_REPORT, "The %s cluster lacks some required control information:\n", @@ -533,6 +553,10 @@ get_control_data(ClusterInfo *cluster, bool live_check) if (!got_float8_pass_by_value) pg_log(PG_REPORT, " float8 argument passing method\n"); + /* value added in Postgres 9.3 */ + if (!got_data_checksums) + pg_log(PG_REPORT, " data checksums\n"); + pg_log(PG_FATAL, "Cannot continue without required control information, terminating\n"); } @@ -594,6 +618,12 @@ check_control_data(ControlData *oldctrl, "--disable-integer-datetimes or get server binaries built with those\n" "options.\n"); } + + if (oldctrl->data_checksums != newctrl->data_checksums) + { + pg_log(PG_FATAL, + "old and new pg_controldata checksums settings are invalid or do not match\n"); + } } diff --git a/contrib/pg_upgrade/pg_upgrade.h b/contrib/pg_upgrade/pg_upgrade.h index c110e45..b544200 100644 --- a/contrib/pg_upgrade/pg_upgrade.h +++ b/contrib/pg_upgrade/pg_upgrade.h @@ -199,6 +199,7 @@ typedef struct uint32 toast; bool date_is_int; bool float8_pass_by_value; + bool data_checksums; char *lc_collate; char *lc_ctype; char *encoding; diff --git a/doc/src/sgml/ref/initdb.sgml b/doc/src/sgml/ref/initdb.sgml index a1e46eb..5912326 100644 --- a/doc/src/sgml/ref/initdb.sgml +++ b/doc/src/sgml/ref/initdb.sgml @@ -183,6 +183,19 @@ PostgreSQL documentation + + + + + Use checksums on data pages to help detect corruption by the + I/O system that would otherwise be silent. Enabling checksums + may incur a slight performance penalty. This option can only + be set during initialization, and cannot be changed later. See . 
+ + + + + diff --git a/doc/src/sgml/wal.sgml b/doc/src/sgml/wal.sgml index fc5c3b2..482adc3 100644 --- a/doc/src/sgml/wal.sgml +++ b/doc/src/sgml/wal.sgml @@ -177,6 +177,32 @@ (BBU) disk controllers do not prevent partial page writes unless they guarantee that data is written to the BBU as full (8kB) pages. + + + Checksums + + + checksums + + + + Even data recorded to disk may be lost due to media failure or + other corruption. While PostgreSQL cannot do anything to prevent + such loss, checksums allow early detection of those + problems. Detecting such corruption quickly is crucial before + taking a backup or rebuilding a replication slave; otherwise, + there is a chance that the corruption could make it to the backup + or replica. + + + + The WAL is always protected by a checksum, which prevents + corrupted WAL records from being replayed during recovery. To + protect data pages, so that corrupt data pages aren't read into + shared memory, checksums must be enabled + using . + + diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 2813dd1..b7ad845 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -287,9 +287,10 @@ hashgettuple(PG_FUNCTION_ARGS) /* * Since this can be redone later if needed, it's treated the same * as a commit-hint-bit status update for heap tuples: we mark the - * buffer dirty but don't make a WAL log entry. + * buffer dirty, but avoid writing WAL unless we require a + * full-page image (e.g. if checksums are enabled). */ - SetBufferCommitInfoNeedsSave(buf); + MarkBufferDirtyHint(buf); } /* diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c index e91419b..05bf6a9 100644 --- a/src/backend/access/hash/hashpage.c +++ b/src/backend/access/hash/hashpage.c @@ -712,6 +712,7 @@ _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks) MemSet(zerobuf, 0, sizeof(zerobuf)); RelationOpenSmgr(rel); + /* no need to set page checksum for all-zero pages */ smgrextend(rel->rd_smgr, MAIN_FORKNUM, lastblock, zerobuf, false); return true; diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index febf4d6..392c5ab 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -4880,7 +4880,6 @@ l4: recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_LOCK_UPDATED, rdata); PageSetLSN(page, recptr); - PageSetTLI(page, ThisTimeLineID); } END_CRIT_SECTION(); @@ -5752,17 +5751,23 @@ log_heap_freeze(Relation reln, Buffer buffer, * being marked all-visible, and vm_buffer is the buffer containing the * corresponding visibility map block. Both should have already been modified * and dirtied. + * + * If checksums are enabled, we also add the heap_buffer to the chain to + * protect it from being torn. 
*/ XLogRecPtr -log_heap_visible(RelFileNode rnode, BlockNumber block, Buffer vm_buffer, +log_heap_visible(RelFileNode rnode, Buffer heap_buffer, Buffer vm_buffer, TransactionId cutoff_xid) { xl_heap_visible xlrec; XLogRecPtr recptr; - XLogRecData rdata[2]; + XLogRecData rdata[3]; + + Assert(BufferIsValid(heap_buffer)); + Assert(BufferIsValid(vm_buffer)); xlrec.node = rnode; - xlrec.block = block; + xlrec.block = BufferGetBlockNumber(heap_buffer); xlrec.cutoff_xid = cutoff_xid; rdata[0].data = (char *) &xlrec; @@ -5776,6 +5781,17 @@ log_heap_visible(RelFileNode rnode, BlockNumber block, Buffer vm_buffer, rdata[1].buffer_std = false; rdata[1].next = NULL; + if (DataChecksumsEnabled()) + { + rdata[1].next = &(rdata[2]); + + rdata[2].data = NULL; + rdata[2].len = 0; + rdata[2].buffer = heap_buffer; + rdata[2].buffer_std = true; + rdata[2].next = NULL; + } + recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_VISIBLE, rdata); return recptr; @@ -6137,8 +6153,6 @@ static void heap_xlog_visible(XLogRecPtr lsn, XLogRecord *record) { xl_heap_visible *xlrec = (xl_heap_visible *) XLogRecGetData(record); - Buffer buffer; - Page page; /* * If there are any Hot Standby transactions running that have an xmin @@ -6153,39 +6167,56 @@ heap_xlog_visible(XLogRecPtr lsn, XLogRecord *record) ResolveRecoveryConflictWithSnapshot(xlrec->cutoff_xid, xlrec->node); /* - * Read the heap page, if it still exists. If the heap file has been - * dropped or truncated later in recovery, we don't need to update the - * page, but we'd better still update the visibility map. + * If heap block was backed up, restore it. This can only happen with + * checksums enabled. */ - buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, xlrec->block, - RBM_NORMAL); - if (BufferIsValid(buffer)) + if (record->xl_info & XLR_BKP_BLOCK(1)) { - LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); - - page = (Page) BufferGetPage(buffer); + Assert(DataChecksumsEnabled()); + (void) RestoreBackupBlock(lsn, record, 1, false, false); + } + else + { + Buffer buffer; + Page page; /* - * We don't bump the LSN of the heap page when setting the visibility - * map bit, because that would generate an unworkable volume of - * full-page writes. This exposes us to torn page hazards, but since - * we're not inspecting the existing page contents in any way, we - * don't care. - * - * However, all operations that clear the visibility map bit *do* bump - * the LSN, and those operations will only be replayed if the XLOG LSN - * follows the page LSN. Thus, if the page LSN has advanced past our - * XLOG record's LSN, we mustn't mark the page all-visible, because - * the subsequent update won't be replayed to clear the flag. + * Read the heap page, if it still exists. If the heap file has been + * dropped or truncated later in recovery, we don't need to update the + * page, but we'd better still update the visibility map. */ - if (lsn > PageGetLSN(page)) + buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, + xlrec->block, RBM_NORMAL); + if (BufferIsValid(buffer)) { - PageSetAllVisible(page); - MarkBufferDirty(buffer); - } + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); - /* Done with heap page. */ - UnlockReleaseBuffer(buffer); + page = (Page) BufferGetPage(buffer); + + /* + * We don't bump the LSN of the heap page when setting the + * visibility map bit (unless checksums are enabled, in which case + * we must), because that would generate an unworkable volume of + * full-page writes. 
This exposes us to torn page hazards, but + * since we're not inspecting the existing page contents in any + * way, we don't care. + * + * However, all operations that clear the visibility map bit *do* + * bump the LSN, and those operations will only be replayed if the + * XLOG LSN follows the page LSN. Thus, if the page LSN has + * advanced past our XLOG record's LSN, we mustn't mark the page + * all-visible, because the subsequent update won't be replayed to + * clear the flag. + */ + if (lsn > PageGetLSN(page)) + { + PageSetAllVisible(page); + MarkBufferDirty(buffer); + } + + /* Done with heap page. */ + UnlockReleaseBuffer(buffer); + } } /* @@ -6216,7 +6247,7 @@ heap_xlog_visible(XLogRecPtr lsn, XLogRecord *record) * real harm is done; and the next VACUUM will fix it. */ if (lsn > PageGetLSN(BufferGetPage(vmbuffer))) - visibilitymap_set(reln, xlrec->block, lsn, vmbuffer, + visibilitymap_set(reln, xlrec->block, InvalidBuffer, lsn, vmbuffer, xlrec->cutoff_xid); ReleaseBuffer(vmbuffer); @@ -6965,7 +6996,6 @@ heap_xlog_lock_updated(XLogRecPtr lsn, XLogRecord *record) HeapTupleHeaderSetXmax(htup, xlrec->xmax); PageSetLSN(page, lsn); - PageSetTLI(page, ThisTimeLineID); MarkBufferDirty(buffer); UnlockReleaseBuffer(buffer); } diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index 0fc032e..2ab723d 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -262,7 +262,7 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, { ((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid; PageClearFull(page); - SetBufferCommitInfoNeedsSave(buffer); + MarkBufferDirtyHint(buffer); } } diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index 84472f8..8a22ecc 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -273,6 +273,8 @@ end_heap_rewrite(RewriteState state) /* Write the last page, if any */ if (state->rs_buffer_valid) { + PageSetChecksumInplace(state->rs_buffer, state->rs_blockno); + if (state->rs_use_wal) log_newpage(&state->rs_new_rel->rd_node, MAIN_FORKNUM, @@ -614,6 +616,8 @@ raw_heap_insert(RewriteState state, HeapTuple tup) { /* Doesn't fit, so write out the existing page */ + PageSetChecksumInplace(page, state->rs_blockno); + /* XLOG stuff */ if (state->rs_use_wal) log_newpage(&state->rs_new_rel->rd_node, diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c index 3209c87..af64fe9 100644 --- a/src/backend/access/heap/visibilitymap.c +++ b/src/backend/access/heap/visibilitymap.c @@ -233,13 +233,18 @@ visibilitymap_pin_ok(BlockNumber heapBlk, Buffer buf) * marked all-visible; it is needed for Hot Standby, and can be * InvalidTransactionId if the page contains no tuples. * + * Caller is expected to set the heap page's PD_ALL_VISIBLE bit before calling + * this function. Except in recovery, caller should also pass the heap + * buffer. When checksums are enabled and we're not in recovery, we must add + * the heap buffer to the WAL chain to protect it from being torn. + * * You must pass a buffer containing the correct map page to this function. * Call visibilitymap_pin first to pin the right one. This function doesn't do * any I/O. 
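 *
 * A hedged caller-side sketch of the contract above (adapted from the
 * lazy_scan_heap() call sites later in this patch; outside recovery the
 * heap buffer must be supplied and recptr is invalid):
 *
 *		PageSetAllVisible(page);
 *		MarkBufferDirty(buf);
 *		visibilitymap_set(rel, blkno, buf, InvalidXLogRecPtr,
 *						  vmbuffer, cutoff_xid);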
*/ void -visibilitymap_set(Relation rel, BlockNumber heapBlk, XLogRecPtr recptr, - Buffer buf, TransactionId cutoff_xid) +visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, + XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid) { BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); uint32 mapByte = HEAPBLK_TO_MAPBYTE(heapBlk); @@ -252,34 +257,55 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, XLogRecPtr recptr, #endif Assert(InRecovery || XLogRecPtrIsInvalid(recptr)); + Assert(InRecovery || BufferIsValid(heapBuf)); - /* Check that we have the right page pinned */ - if (!BufferIsValid(buf) || BufferGetBlockNumber(buf) != mapBlock) - elog(ERROR, "wrong buffer passed to visibilitymap_set"); + /* Check that we have the right heap page pinned, if present */ + if (BufferIsValid(heapBuf) && BufferGetBlockNumber(heapBuf) != heapBlk) + elog(ERROR, "wrong heap buffer passed to visibilitymap_set"); - page = BufferGetPage(buf); + /* Check that we have the right VM page pinned */ + if (!BufferIsValid(vmBuf) || BufferGetBlockNumber(vmBuf) != mapBlock) + elog(ERROR, "wrong VM buffer passed to visibilitymap_set"); + + page = BufferGetPage(vmBuf); map = PageGetContents(page); - LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(vmBuf, BUFFER_LOCK_EXCLUSIVE); if (!(map[mapByte] & (1 << mapBit))) { START_CRIT_SECTION(); map[mapByte] |= (1 << mapBit); - MarkBufferDirty(buf); + MarkBufferDirty(vmBuf); if (RelationNeedsWAL(rel)) { if (XLogRecPtrIsInvalid(recptr)) - recptr = log_heap_visible(rel->rd_node, heapBlk, buf, + { + Assert(!InRecovery); + recptr = log_heap_visible(rel->rd_node, heapBuf, vmBuf, cutoff_xid); + + /* + * If data checksums are enabled, we need to protect the heap + * page from being torn. + */ + if (DataChecksumsEnabled()) + { + Page heapPage = BufferGetPage(heapBuf); + + /* caller is expected to set PD_ALL_VISIBLE first */ + Assert(PageIsAllVisible(heapPage)); + PageSetLSN(heapPage, recptr); + } + } PageSetLSN(page, recptr); } END_CRIT_SECTION(); } - LockBuffer(buf, BUFFER_LOCK_UNLOCK); + LockBuffer(vmBuf, BUFFER_LOCK_UNLOCK); } /* @@ -579,6 +605,8 @@ vm_extend(Relation rel, BlockNumber vm_nblocks) /* Now extend the file */ while (vm_nblocks_now < vm_nblocks) { + PageSetChecksumInplace(pg, vm_nblocks_now); + smgrextend(rel->rd_smgr, VISIBILITYMAP_FORKNUM, vm_nblocks_now, (char *) pg, false); vm_nblocks_now++; diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 63e099b..8557ef0 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -407,11 +407,16 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, */ ItemIdMarkDead(curitemid); opaque->btpo_flags |= BTP_HAS_GARBAGE; - /* be sure to mark the proper buffer dirty... */ + + /* + * Be sure to mark the proper buffer dirty. If checksums + * are enabled, this may also require a full-page image + * (see comments in MarkBufferDirtyHint). + */ if (nbuf != InvalidBuffer) - SetBufferCommitInfoNeedsSave(nbuf); + MarkBufferDirtyHint(nbuf); else - SetBufferCommitInfoNeedsSave(buf); + MarkBufferDirtyHint(buf); } } } diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 0e04168..621b055 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -217,6 +217,7 @@ btbuildempty(PG_FUNCTION_ARGS) _bt_initmetapage(metapage, P_NONE, 0); /* Write the page. If archiving/streaming, XLOG it. 
*/ + PageSetChecksumInplace(metapage, BTREE_METAPAGE); smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE, (char *) metapage, true); if (XLogIsNeeded()) @@ -1051,7 +1052,7 @@ restart: opaque->btpo_cycleid == vstate->cycleid) { opaque->btpo_cycleid = 0; - SetBufferCommitInfoNeedsSave(buf); + MarkBufferDirtyHint(buf); } } diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index abd9995..ce444cf 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -288,12 +288,15 @@ _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno) { if (!wstate->btws_zeropage) wstate->btws_zeropage = (Page) palloc0(BLCKSZ); + /* no need to set checksum for all-zero pages */ smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM, wstate->btws_pages_written++, (char *) wstate->btws_zeropage, true); } + PageSetChecksumInplace(page, blkno); + /* * Now write the page. There's no need for smgr to schedule an fsync for * this write; we'll do it ourselves before ending the build. diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 04e1ac4..ea02aa0 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -1783,7 +1783,8 @@ _bt_killitems(IndexScanDesc scan, bool haveLock) /* * Since this can be redone later if needed, it's treated the same as a * commit-hint-bit status update for heap tuples: we mark the buffer dirty - * but don't make a WAL log entry. + * but avoid writing WAL unless we require a full-page image (e.g. if + * checksums are enabled). * * Whenever we mark anything LP_DEAD, we also set the page's * BTP_HAS_GARBAGE flag, which is likewise just a hint. @@ -1791,7 +1792,7 @@ _bt_killitems(IndexScanDesc scan, bool haveLock) if (killedsomething) { opaque->btpo_flags |= BTP_HAS_GARBAGE; - SetBufferCommitInfoNeedsSave(so->currPos.buf); + MarkBufferDirtyHint(so->currPos.buf); } if (!haveLock) diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c index b22e66e..52cf759 100644 --- a/src/backend/access/rmgrdesc/xlogdesc.c +++ b/src/backend/access/rmgrdesc/xlogdesc.c @@ -81,6 +81,10 @@ xlog_desc(StringInfo buf, uint8 xl_info, char *rec) appendStringInfo(buf, "restore point: %s", xlrec->rp_name); } + else if (info == XLOG_HINT) + { + appendStringInfo(buf, "page hint"); + } else if (info == XLOG_BACKUP_END) { XLogRecPtr startpoint; diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index ac01fd2..94384ac 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -154,6 +154,7 @@ spgbuildempty(PG_FUNCTION_ARGS) SpGistInitMetapage(page); /* Write the page. If archiving/streaming, XLOG it. */ + PageSetChecksumInplace(page, SPGIST_METAPAGE_BLKNO); smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_METAPAGE_BLKNO, (char *) page, true); if (XLogIsNeeded()) @@ -163,6 +164,7 @@ spgbuildempty(PG_FUNCTION_ARGS) /* Likewise for the root page. */ SpGistInitPage(page, SPGIST_LEAF); + PageSetChecksumInplace(page, SPGIST_ROOT_BLKNO); smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_ROOT_BLKNO, (char *) page, true); if (XLogIsNeeded()) @@ -172,6 +174,7 @@ spgbuildempty(PG_FUNCTION_ARGS) /* Likewise for the null-tuples root page. 
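	 * (The same stamp-then-write pattern recurs throughout this patch; a
	 * condensed, hypothetical sketch of the write path:
	 *
	 *		PageSetChecksumInplace(page, blkno);
	 *		smgrwrite(reln, forknum, blkno, (char *) page, skipFsync);
	 *
	 * The checksum is computed at the last moment before the page leaves
	 * shared memory, since any later modification would invalidate it.)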
*/ SpGistInitPage(page, SPGIST_LEAF | SPGIST_NULLS); + PageSetChecksumInplace(page, SPGIST_NULL_BLKNO); smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_NULL_BLKNO, (char *) page, true); if (XLogIsNeeded()) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 208b6de..124b2ce 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -61,6 +61,7 @@ #include "utils/timestamp.h" #include "pg_trace.h" +extern bool bootstrap_data_checksums; /* File path names (all relative to $PGDATA) */ #define RECOVERY_COMMAND_FILE "recovery.conf" @@ -731,6 +732,7 @@ XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata) bool updrqst; bool doPageWrites; bool isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH); + bool isHint = (rmid == RM_XLOG_ID && info == XLOG_HINT); uint8 info_orig = info; static XLogRecord *rechdr; @@ -1001,6 +1003,18 @@ begin:; } /* + * If this is a hint record and we don't need a backup block then + * we have no more work to do and can exit quickly without inserting + * a WAL record at all. In that case return InvalidXLogRecPtr. + */ + if (isHint && !(info & XLR_BKP_BLOCK_MASK)) + { + LWLockRelease(WALInsertLock); + END_CRIT_SECTION(); + return InvalidXLogRecPtr; + } + + /* * If the current page is completely full, the record goes to the next * page, right after the page header. */ @@ -3188,6 +3202,11 @@ RestoreBackupBlock(XLogRecPtr lsn, XLogRecord *record, int block_index, BLCKSZ - (bkpb.hole_offset + bkpb.hole_length)); } + /* + * Any checksum set on this page will be invalid. We don't need + * to reset it here since it will be set before being written. + */ + PageSetLSN(page, lsn); MarkBufferDirty(buffer); @@ -3793,6 +3812,16 @@ GetFakeLSNForUnloggedRel(void) } /* + * Are checksums enabled for data pages? + */ +bool +DataChecksumsEnabled(void) +{ + Assert(ControlFile != NULL); + return ControlFile->data_checksums; +} + +/* * Auto-tune the number of XLOG buffers. * * The preferred setting for wal_buffers is about 3% of shared_buffers, with @@ -4093,6 +4122,7 @@ BootStrapXLOG(void) ControlFile->max_prepared_xacts = max_prepared_xacts; ControlFile->max_locks_per_xact = max_locks_per_xact; ControlFile->wal_level = wal_level; + ControlFile->data_checksums = bootstrap_data_checksums; /* some additional ControlFile fields are set in WriteControlFile() */ @@ -7609,6 +7639,51 @@ XLogRestorePoint(const char *rpName) } /* + * Write a backup block if needed when we are setting a hint. Note that + * this may be called for a variety of page types, not just heaps. + * + * Deciding the "if needed" part is delicate and requires us to either + * grab WALInsertLock or check the info_lck spinlock. If we check the + * spinlock and it says Yes then we will need to get WALInsertLock as well, + * so the design choice here is to just go straight for the WALInsertLock + * and trust that calls to this function are minimised elsewhere. + * + * Callable while holding just share lock on the buffer content. + * + * Possible that multiple concurrent backends could attempt to write + * WAL records. In that case, more than one backup block may be recorded + * though that isn't important to the outcome and the backup blocks are + * likely to be identical anyway. 
+ */ +#define XLOG_HINT_WATERMARK 13579 +XLogRecPtr +XLogSaveBufferForHint(Buffer buffer) +{ + /* + * Make an XLOG entry reporting the hint + */ + XLogRecData rdata[2]; + int watermark = XLOG_HINT_WATERMARK; + + /* + * Not allowed to have zero-length records, so use a small watermark + */ + rdata[0].data = (char *) (&watermark); + rdata[0].len = sizeof(int); + rdata[0].buffer = InvalidBuffer; + rdata[0].buffer_std = false; + rdata[0].next = &(rdata[1]); + + rdata[1].data = NULL; + rdata[1].len = 0; + rdata[1].buffer = buffer; + rdata[1].buffer_std = true; + rdata[1].next = NULL; + + return XLogInsert(RM_XLOG_ID, XLOG_HINT, rdata); +} + +/* * Check if any of the GUC parameters that are critical for hot standby * have changed, and update the value in pg_control file if necessary. */ @@ -7774,8 +7849,8 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record) { uint8 info = record->xl_info & ~XLR_INFO_MASK; - /* Backup blocks are not used in xlog records */ - Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK)); + /* Backup blocks are not used in most xlog records */ + Assert(info == XLOG_HINT || !(record->xl_info & XLR_BKP_BLOCK_MASK)); if (info == XLOG_NEXTOID) { @@ -7968,6 +8043,34 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record) { /* nothing to do here */ } + else if (info == XLOG_HINT) + { +#ifdef USE_ASSERT_CHECKING + int *watermark = (int *) XLogRecGetData(record); +#endif + + /* Check the watermark is correct for the hint record */ + Assert(*watermark == XLOG_HINT_WATERMARK); + + /* Backup blocks must be present for smgr hint records */ + Assert(record->xl_info & XLR_BKP_BLOCK_MASK); + + /* + * Hint records have no information that needs to be replayed. + * The sole purpose of them is to ensure that a hint bit does + * not cause a checksum invalidation if a hint bit write should + * cause a torn page. So the body of the record is empty but + * there must be one backup block. + * + * Since the only change in the backup block is a hint bit, + * there is no confict with Hot Standby. + * + * This also means there is no corresponding API call for this, + * so an smgr implementation has no need to implement anything. 
+ * Which means nothing is needed in md.c etc + */ + RestoreBackupBlock(lsn, record, 0, false, false); + } else if (info == XLOG_BACKUP_END) { XLogRecPtr startpoint; diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index 82ef726..287f19b 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -48,6 +48,8 @@ extern int optind; extern char *optarg; +bool bootstrap_data_checksums = false; + #define ALLOC(t, c) ((t *) calloc((unsigned)(c), sizeof(t))) @@ -233,7 +235,7 @@ AuxiliaryProcessMain(int argc, char *argv[]) /* If no -x argument, we are a CheckerProcess */ MyAuxProcType = CheckerProcess; - while ((flag = getopt(argc, argv, "B:c:d:D:Fr:x:-:")) != -1) + while ((flag = getopt(argc, argv, "B:c:d:D:Fkr:x:-:")) != -1) { switch (flag) { @@ -259,6 +261,9 @@ AuxiliaryProcessMain(int argc, char *argv[]) case 'F': SetConfigOption("fsync", "false", PGC_POSTMASTER, PGC_S_ARGV); break; + case 'k': + bootstrap_data_checksums = true; + break; case 'r': strlcpy(OutputFileName, optarg, MAXPGPATH); break; diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index ac60ccb..7833c1f 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -1115,7 +1115,7 @@ read_seq_tuple(SeqTable elm, Relation rel, Buffer *buf, HeapTuple seqtuple) HeapTupleHeaderSetXmax(seqtuple->t_data, InvalidTransactionId); seqtuple->t_data->t_infomask &= ~HEAP_XMAX_COMMITTED; seqtuple->t_data->t_infomask |= HEAP_XMAX_INVALID; - SetBufferCommitInfoNeedsSave(*buf); + MarkBufferDirtyHint(*buf); } seq = (Form_pg_sequence) GETSTRUCT(seqtuple); diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index eeddd9a..d82481b 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -8816,6 +8816,8 @@ copy_relation_data(SMgrRelation src, SMgrRelation dst, smgrread(src, forkNum, blkno, buf); + PageSetChecksumInplace(page, blkno); + /* XLOG stuff */ if (use_wal) log_newpage(&dst->smgr_rnode.node, forkNum, blkno, page); diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index 4d3364a..d392698 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -672,8 +672,8 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, { PageSetAllVisible(page); MarkBufferDirty(buf); - visibilitymap_set(onerel, blkno, InvalidXLogRecPtr, vmbuffer, - InvalidTransactionId); + visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr, + vmbuffer, InvalidTransactionId); } UnlockReleaseBuffer(buf); @@ -907,8 +907,8 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, { PageSetAllVisible(page); MarkBufferDirty(buf); - visibilitymap_set(onerel, blkno, InvalidXLogRecPtr, vmbuffer, - visibility_cutoff_xid); + visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr, + vmbuffer, visibility_cutoff_xid); } else if (!all_visible_according_to_vm) { @@ -918,8 +918,8 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, * allowed. Set the visibility map bit as well so that we get * back in sync. 
*/ - visibilitymap_set(onerel, blkno, InvalidXLogRecPtr, vmbuffer, - visibility_cutoff_xid); + visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr, + vmbuffer, visibility_cutoff_xid); } } @@ -1154,7 +1154,7 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer, { Assert(BufferIsValid(*vmbuffer)); PageSetAllVisible(page); - visibilitymap_set(onerel, blkno, InvalidXLogRecPtr, *vmbuffer, + visibilitymap_set(onerel, blkno, buffer, InvalidXLogRecPtr, *vmbuffer, visibility_cutoff_xid); } diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index ea7d469..32eb49c 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -460,7 +460,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, } /* check for garbage data */ - if (!PageHeaderIsValid((PageHeader) bufBlock)) + if (!PageIsVerified((Page) bufBlock, blockNum)) { if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages) { @@ -655,14 +655,23 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, * victim. We need lock to inspect the page LSN, so this * can't be done inside StrategyGetBuffer. */ - if (strategy != NULL && - XLogNeedsFlush(BufferGetLSN(buf)) && - StrategyRejectBuffer(strategy, buf)) + if (strategy != NULL) { - /* Drop lock/pin and loop around for another buffer */ - LWLockRelease(buf->content_lock); - UnpinBuffer(buf, true); - continue; + XLogRecPtr lsn; + + /* Read the LSN while holding buffer header lock */ + LockBufHdr(buf); + lsn = BufferGetLSN(buf); + UnlockBufHdr(buf); + + if (XLogNeedsFlush(lsn) && + StrategyRejectBuffer(strategy, buf)) + { + /* Drop lock/pin and loop around for another buffer */ + LWLockRelease(buf->content_lock); + UnpinBuffer(buf, true); + continue; + } } /* OK, do the I/O */ @@ -1894,6 +1903,8 @@ FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln) ErrorContextCallback errcallback; instr_time io_start, io_time; + Block bufBlock; + char *bufToWrite; /* * Acquire the buffer's io_in_progress lock. If StartBufferIO returns @@ -1919,6 +1930,18 @@ FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln) reln->smgr_rnode.node.dbNode, reln->smgr_rnode.node.relNode); + LockBufHdr(buf); + + /* + * Run PageGetLSN while holding header lock, since we don't have the + * buffer locked exclusively in all cases. + */ + recptr = BufferGetLSN(buf); + + /* To check if block content changes while flushing. - vadim 01/17/97 */ + buf->flags &= ~BM_JUST_DIRTIED; + UnlockBufHdr(buf); + /* * Force XLOG flush up to buffer's LSN. This implements the basic WAL * rule that log updates must hit disk before any of the data-file changes @@ -1938,7 +1961,6 @@ FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln) */ if (buf->flags & BM_PERMANENT) { - recptr = BufferGetLSN(buf); XLogFlush(recptr); } @@ -1948,18 +1970,20 @@ FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln) * we have the io_in_progress lock. */ - /* To check if block content changes while flushing. - vadim 01/17/97 */ - LockBufHdr(buf); - buf->flags &= ~BM_JUST_DIRTIED; - UnlockBufHdr(buf); + bufBlock = BufHdrGetBlock(buf); + + bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum); if (track_io_timing) INSTR_TIME_SET_CURRENT(io_start); + /* + * bufToWrite is either the shared buffer or a copy, as appropriate. 
+ */ smgrwrite(reln, buf->tag.forkNum, buf->tag.blockNum, - (char *) BufHdrGetBlock(buf), + bufToWrite, false); if (track_io_timing) @@ -2497,22 +2521,24 @@ IncrBufferRefCount(Buffer buffer) } /* - * SetBufferCommitInfoNeedsSave + * MarkBufferDirtyHint * - * Mark a buffer dirty when we have updated tuple commit-status bits in it. + * Mark a buffer dirty for non-critical changes. * - * This is essentially the same as MarkBufferDirty, except that the caller - * might have only share-lock instead of exclusive-lock on the buffer's - * content lock. We preserve the distinction mainly as a way of documenting - * that the caller has not made a critical data change --- the status-bit - * update could be redone by someone else just as easily. Therefore, no WAL - * log record need be generated, whereas calls to MarkBufferDirty really ought - * to be associated with a WAL-entry-creating action. + * This is essentially the same as MarkBufferDirty, except: + * + * 1. The caller does not write WAL; so if checksums are enabled, we may need + * to write an XLOG_HINT WAL record to protect against torn pages. + * 2. The caller might have only share-lock instead of exclusive-lock on the + * buffer's content lock. + * 3. This function does not guarantee that the buffer is always marked dirty + * (due to a race condition), so it cannot be used for important changes. */ void -SetBufferCommitInfoNeedsSave(Buffer buffer) +MarkBufferDirtyHint(Buffer buffer) { volatile BufferDesc *bufHdr; + Page page = BufferGetPage(buffer); if (!BufferIsValid(buffer)) elog(ERROR, "bad buffer ID: %d", buffer); @@ -2532,28 +2558,105 @@ SetBufferCommitInfoNeedsSave(Buffer buffer) /* * This routine might get called many times on the same page, if we are * making the first scan after commit of an xact that added/deleted many - * tuples. So, be as quick as we can if the buffer is already dirty. We - * do this by not acquiring spinlock if it looks like the status bits are - * already. Since we make this test unlocked, there's a chance we might - * fail to notice that the flags have just been cleared, and failed to - * reset them, due to memory-ordering issues. But since this function is - * only intended to be used in cases where failing to write out the data + * tuples. So, be as quick as we can if the buffer is already dirty. We do + * this by not acquiring spinlock if it looks like the status bits are + * already set. Since we make this test unlocked, there's a chance we + * might fail to notice that the flags have just been cleared, and failed + * to reset them, due to memory-ordering issues. But since this function + * is only intended to be used in cases where failing to write out the data * would be harmless anyway, it doesn't really matter. */ if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) != (BM_DIRTY | BM_JUST_DIRTIED)) { + XLogRecPtr lsn = InvalidXLogRecPtr; + bool dirtied = false; + bool delayChkpt = false; + + /* + * If checksums are enabled, then a full page image may be required + * even for some hint bit updates to protect against torn pages. This + * full page image is only necessary if the hint bit update is the + * first change to the page since the last checkpoint. + * + * We don't check full_page_writes here because that logic is in + * xlog.c. + */ + if (DataChecksumsEnabled()) + { + /* + * If we're in recovery we cannot dirty a page because of a hint. + * We can set the hint, just not dirty the page as a result so + * the hint is lost when we evict the page or shutdown. 
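		 * Condensed, the ordering protocol implemented below is (a summary
		 * of the code that follows, not additional patch logic):
		 *
		 *		MyPgXact->delayChkpt = true;			hold off checkpoint start
		 *		lsn = XLogSaveBufferForHint(buffer);	WAL before data
		 *		mark the buffer dirty; set the page LSN if lsn is valid
		 *		MyPgXact->delayChkpt = false;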
+		 *
+		 * See long discussion in bufpage.c
+		 */
+		if (RecoveryInProgress())
+			return;
+
+		/*
+		 * If the block is already dirty because we either made a change
+		 * or set a hint already, then we don't need to write a full page
+		 * image. Note that aggressive cleaning of blocks dirtied by hint
+		 * bit setting would increase the call rate; bulk setting of hint
+		 * bits would reduce the call rate...
+		 *
+		 * We must issue the WAL record before we mark the buffer dirty.
+		 * Otherwise we might write the page before we write the WAL.
+		 * That causes a race condition, since a checkpoint might occur
+		 * between writing the WAL record and marking the buffer dirty.
+		 * We solve that with a kluge, but one that is already in use
+		 * during transaction commit to prevent race conditions.
+		 * Basically, we simply prevent the checkpoint WAL record from
+		 * being written until we have marked the buffer dirty. We don't
+		 * start the checkpoint flush until we have marked dirty, so our
+		 * checkpoint must flush the change to disk successfully or the
+		 * checkpoint never gets written, so crash recovery will fix it.
+		 *
+		 * It's possible we may enter here without an xid, so it is
+		 * essential that CreateCheckpoint waits for virtual transactions
+		 * rather than full transaction ids.
+		 */
+		MyPgXact->delayChkpt = delayChkpt = true;
+		lsn = XLogSaveBufferForHint(buffer);
+	}
+
 	LockBufHdr(bufHdr);
 	Assert(bufHdr->refcount > 0);
 	if (!(bufHdr->flags & BM_DIRTY))
 	{
-		/* Do vacuum cost accounting */
+		dirtied = true;			/* Means "will be dirtied by this action" */
+
+		/*
+		 * Set the page LSN if we wrote a backup block. We aren't supposed
+		 * to set this when only holding a share lock, but as long as we
+		 * serialise it somehow we're OK. We choose to set the LSN while
+		 * holding the buffer header lock, which causes any reader of an
+		 * LSN who holds only a share lock to also obtain a buffer header
+		 * lock before using PageGetLSN(). Fortunately, that's not too
+		 * many places.
+		 *
+		 * If checksums are enabled, you might think we should reset the
+		 * checksum here. That will happen when the page is written
+		 * sometime later in this checkpoint cycle.
+		 */
+		if (!XLogRecPtrIsInvalid(lsn))
+		{
+			PageSetLSN(page, lsn);
+		}
+	}
+
+	bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
+	UnlockBufHdr(bufHdr);
+
+	if (delayChkpt)
+		MyPgXact->delayChkpt = false;
+
+	if (dirtied)
+	{
 		VacuumPageDirty++;
 		if (VacuumCostActive)
 			VacuumCostBalance += VacuumCostPageDirty;
 	}
-	bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
-	UnlockBufHdr(bufHdr);
 	}
 }
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
index 30dc809..2ec8058 100644
--- a/src/backend/storage/buffer/localbuf.c
+++ b/src/backend/storage/buffer/localbuf.c
@@ -201,6 +201,8 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 		/* Find smgr relation for buffer */
 		oreln = smgropen(bufHdr->tag.rnode, MyBackendId);
 
+		/* XXX do we want to write checksums for local buffers? An option? */
+
 		/* And write...
*/ smgrwrite(oreln, bufHdr->tag.forkNum, diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c index 9c2afc5..b76bf9b 100644 --- a/src/backend/storage/freespace/freespace.c +++ b/src/backend/storage/freespace/freespace.c @@ -216,7 +216,7 @@ XLogRecordPageWithFreeSpace(RelFileNode rnode, BlockNumber heapBlk, PageInit(page, BLCKSZ, 0); if (fsm_set_avail(page, slot, new_cat)) - MarkBufferDirty(buf); + MarkBufferDirtyHint(buf); UnlockReleaseBuffer(buf); } @@ -286,7 +286,7 @@ FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks) return; /* nothing to do; the FSM was already smaller */ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); fsm_truncate_avail(BufferGetPage(buf), first_removed_slot); - MarkBufferDirty(buf); + MarkBufferDirtyHint(buf); UnlockReleaseBuffer(buf); new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1; @@ -583,6 +583,8 @@ fsm_extend(Relation rel, BlockNumber fsm_nblocks) while (fsm_nblocks_now < fsm_nblocks) { + PageSetChecksumInplace(pg, fsm_nblocks_now); + smgrextend(rel->rd_smgr, FSM_FORKNUM, fsm_nblocks_now, (char *) pg, false); fsm_nblocks_now++; @@ -617,7 +619,7 @@ fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot, page = BufferGetPage(buf); if (fsm_set_avail(page, slot, newValue)) - MarkBufferDirty(buf); + MarkBufferDirtyHint(buf); if (minValue != 0) { @@ -768,7 +770,7 @@ fsm_vacuum_page(Relation rel, FSMAddress addr, bool *eof_p) { LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); fsm_set_avail(BufferGetPage(buf), slot, child_avail); - MarkBufferDirty(buf); + MarkBufferDirtyHint(buf); LockBuffer(buf, BUFFER_LOCK_UNLOCK); } } diff --git a/src/backend/storage/freespace/fsmpage.c b/src/backend/storage/freespace/fsmpage.c index acb8038..19c8e09 100644 --- a/src/backend/storage/freespace/fsmpage.c +++ b/src/backend/storage/freespace/fsmpage.c @@ -284,7 +284,7 @@ restart: exclusive_lock_held = true; } fsm_rebuild_page(page); - MarkBufferDirty(buf); + MarkBufferDirtyHint(buf); goto restart; } } diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index 95f3e16..b4cf68e 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -15,7 +15,13 @@ #include "postgres.h" #include "access/htup_details.h" +#include "access/xlog.h" +static char pageCopyData[BLCKSZ]; /* for checksum calculation */ +static Page pageCopy = pageCopyData; + +static bool PageChecksumOK(Page page, BlockNumber blkno); +static uint16 PageCalcChecksum16(Page page, BlockNumber blkno); /* ---------------------------------------------------------------- * Page support functions @@ -25,6 +31,8 @@ /* * PageInit * Initializes the contents of a page. + * Note that we don't calculate an initial checksum here; that's not done + * until it's time to write. */ void PageInit(Page page, Size pageSize, Size specialSize) @@ -39,7 +47,10 @@ PageInit(Page page, Size pageSize, Size specialSize) /* Make sure all fields of page are zero, as well as unused space */ MemSet(p, 0, pageSize); - /* p->pd_flags = 0; done by above MemSet */ + if (DataChecksumsEnabled()) + p->pd_flags = PD_CHECKSUMS1 | PD_CHECKSUMS2; + else + p->pd_flags = 0; p->pd_lower = SizeOfPageHeaderData; p->pd_upper = pageSize - specialSize; p->pd_special = pageSize - specialSize; @@ -49,7 +60,7 @@ PageInit(Page page, Size pageSize, Size specialSize) /* - * PageHeaderIsValid + * PageIsVerified * Check that the header fields of a page appear valid. * * This is called when a page has just been read in from disk. 
The idea is @@ -67,21 +78,27 @@ PageInit(Page page, Size pageSize, Size specialSize) * will clean up such a page and make it usable. */ bool -PageHeaderIsValid(PageHeader page) +PageIsVerified(Page page, BlockNumber blkno) { + PageHeader p = (PageHeader) page; char *pagebytes; int i; - /* Check normal case */ - if (PageGetPageSize(page) == BLCKSZ && - PageGetPageLayoutVersion(page) == PG_PAGE_LAYOUT_VERSION && - (page->pd_flags & ~PD_VALID_FLAG_BITS) == 0 && - page->pd_lower >= SizeOfPageHeaderData && - page->pd_lower <= page->pd_upper && - page->pd_upper <= page->pd_special && - page->pd_special <= BLCKSZ && - page->pd_special == MAXALIGN(page->pd_special)) - return true; + /* + * Don't verify page data unless the page passes basic non-zero test + */ + if (!PageIsNew(page)) + { + /* Check normal case */ + if (PageChecksumOK(page, blkno) && + (p->pd_flags & ~PD_VALID_FLAG_BITS) == 0 && + (p->pd_flags & PD_HEADERCHECK) == 0 && + p->pd_lower <= p->pd_upper && + p->pd_upper <= p->pd_special && + p->pd_special <= BLCKSZ && + p->pd_special == MAXALIGN(p->pd_special)) + return true; + } /* Check all-zeroes case */ pagebytes = (char *) page; @@ -827,3 +844,231 @@ PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems) pfree(itemidbase); } + +/* + * Test whether the page checksum is correct or not. + * + * IMPORTANT NOTE - + * The checksum is not valid at all times on a data page. We set it before we + * flush page/buffer, and implicitly invalidate the checksum when we modify the + * page. A heavily accessed buffer might then spend most of its life with an + * invalid page checksum, so testing random pages in the buffer pool will tell + * you nothing. The reason for this is that the checksum detects otherwise + * silent errors caused by the filesystems on which we rely. We do not protect + * buffers against uncorrectable memory errors, since these have a very low + * measured incidence according to research on large server farms, + * http://www.cs.toronto.edu/~bianca/papers/sigmetrics09.pdf, discussed 2010/12/22. + * + * That means that WAL-logged changes to a page do NOT update the page + * checksum, so full page images may not have a valid checksum. But those page + * images have the WAL CRC covering them and so are verified separately from + * this mechanism. + * + * Any write of a data block can cause a torn page if the write is unsuccessful. + * Full page writes protect us from that, which are stored in WAL. Setting + * hint bits when a page is already dirty is OK because a full page write + * must already have been written for that since the last checkpoint. + * Setting hint bits on an otherwise clean page can allow torn pages; this + * doesn't normally matter since they are just hints. When the page has + * checksums, losing a few bits would cause the checksum to be invalid. + * So if we have full_page_writes = on and checksums enabled then we must + * write a WAL record specifically so that we record a full page image in WAL. + * New WAL records cannot be written during recovery, so hint bits set + * during recovery must not dirty the page if the buffer is not already dirty, + * when checksums are enabled. + * + * WAL replay ignores page checksums unless it writes out or reads in blocks + * from disk; restoring full page images does not verify checksums via this + * function. + * + * The best way to understand this is that WAL CRCs protect records entering + * the WAL stream, and page verification protects blocks entering the shared + * buffer pool. 
They are similar in purpose, yet completely separate.
+ * Together they ensure we are able to detect errors in data re-entering
+ * PostgreSQL-controlled memory. Note also that the WAL checksum is a
+ * 32-bit CRC, whereas the page checksum is a Fletcher checksum, not a CRC.
+ *
+ * This function returns a boolean, not a full damage assessment.
+ */
+static bool
+PageChecksumOK(Page page, BlockNumber blkno)
+{
+	PageHeader	p = (PageHeader) page;
+	uint16		checksum;
+	uint16		checksum_mask = PD_CHECKSUMS1 | PD_CHECKSUMS2;
+
+	/* Quick exit if nobody cares about checksumming */
+	if (!DataChecksumsEnabled())
+	{
+		/*
+		 * We don't verify that the checksum itself is zero here, because
+		 * pages upgraded from previous versions may still hold the TLI in
+		 * the checksum field.
+		 */
+		if ((p->pd_flags & checksum_mask) != 0)
+		{
+			ereport(WARNING,
+					(errcode(ERRCODE_DATA_CORRUPTED),
+					 errmsg("unexpected checksum flags on page")));
+			return false;
+		}
+		return true;
+	}
+
+	if ((p->pd_flags & checksum_mask) != checksum_mask)
+	{
+		ereport(WARNING,
+				(errcode(ERRCODE_DATA_CORRUPTED),
+				 errmsg("checksum flags missing on page")));
+		return false;
+	}
+
+	checksum = PageCalcChecksum16(page, blkno);
+
+	if (checksum != p->pd_checksum)
+	{
+		ereport(WARNING,
+				(errcode(ERRCODE_DATA_CORRUPTED),
+				 errmsg("page verification failed, calculated checksum %u but expected %u",
+						checksum, p->pd_checksum)));
+		return false;
+	}
+
+	return true;
+}
+
+/*
+ * Set checksum for page in shared buffers.
+ *
+ * If checksums are disabled, or if the page is not initialized, just return
+ * the input. Otherwise, we must make a copy of the page before calculating
+ * the checksum, to prevent concurrent modifications (e.g. setting hint bits)
+ * from making the final checksum invalid.
+ *
+ * Returns a pointer to the block-sized data that needs to be written. Uses
+ * statically-allocated memory, so the caller must immediately write the
+ * returned page and not refer to it again.
+ */
+char *
+PageSetChecksumCopy(Page page, BlockNumber blkno)
+{
+	if (PageIsNew(page) || !DataChecksumsEnabled())
+		return (char *) page;
+
+	/*
+	 * We make a copy iff we need to calculate a checksum, because other
+	 * backends may set hint bits on this page while we write, which would
+	 * mean the checksum differs from the page contents. It doesn't matter
+	 * if we include or exclude hints during the copy, as long as we write
+	 * a valid page and associated checksum.
+	 */
+	memcpy((char *) pageCopy, (char *) page, BLCKSZ);
+	PageSetChecksumInplace(pageCopy, blkno);
+	return (char *) pageCopy;
+}
+
+/*
+ * Set checksum for page in private memory.
+ *
+ * This is a simpler version of PageSetChecksumCopy(). The more explicit API
+ * allows us to more easily see if we're making the correct call and reduces
+ * the amount of additional code specific to page verification.
+ */
+void
+PageSetChecksumInplace(Page page, BlockNumber blkno)
+{
+	if (PageIsNew(page))
+		return;
+
+	if (DataChecksumsEnabled())
+	{
+		PageHeader	p = (PageHeader) page;
+
+		p->pd_checksum = PageCalcChecksum16(page, blkno);
+	}
+
+	return;
+}
+
+/*
+ * Calculate checksum for a PostgreSQL Page. This includes the page number
+ * (to detect the case when a page is somehow moved to a different location),
+ * the page header (excluding the checksum itself), and the page data.
+ *
+ * The checksum algorithm is a modified Fletcher 64-bit (which is
+ * order-sensitive). The modification is because, at the end, we have two
+ * 64-bit sums, but we only have room for a 16-bit checksum. So, instead of
+ * using a modulus of 2^32 - 1, we use 2^8 - 1, making it also resemble a
+ * Fletcher 16-bit. We don't use Fletcher 16-bit directly, because processing
+ * single bytes at a time is slower.
+ */
+static uint16
+PageCalcChecksum16(Page page, BlockNumber blkno)
+{
+	PageHeaderData header_copy;
+	uint32	   *ptr32Header = (uint32 *) &header_copy;
+	uint32	   *ptr32Page = (uint32 *) page;
+	int64		sum1 = 0;
+	int64		sum2 = 0;
+	uint16		checksum = 0;
+	uint8	   *p8Checksum = (uint8 *) &checksum;
+	int			i;
+
+	/* only calculate the checksum for properly-initialized pages */
+	Assert(!PageIsNew(page));
+
+	/*
+	 * Initialize the checksum calculation with the page number. This helps
+	 * catch corruption from whole pages being transposed with other whole
+	 * pages.
+	 */
+	sum1 = sum2 = (uint64) blkno;
+
+	/*
+	 * Make a copy of the page header and set the checksum to zero in the
+	 * copy. That allows us to calculate the checksum 32 bits at a time
+	 * while ignoring only the checksum field during calculation.
+	 */
+	memcpy(&header_copy, page, SizeOfPageHeaderData);
+	header_copy.pd_checksum = 0;
+
+	/* compute the checksum of the header */
+	for (i = 0; i < SizeOfPageHeaderData / sizeof(uint32); i++)
+	{
+		sum1 += ptr32Header[i];
+		sum2 += sum1;
+	}
+
+	/* now checksum the rest of the page, starting just past the header */
+	for (i = SizeOfPageHeaderData / sizeof(uint32);
+		 i < BLCKSZ / sizeof(uint32); i++)
+	{
+		sum1 += ptr32Page[i];
+		sum2 += sum1;
+
+		/*
+		 * Testing for overflow makes the algorithm slower, but we know that
+		 * overflow won't happen, so only use an Assert. The overflow won't
+		 * happen because sum2 (the larger sum) can grow to a maximum of:
+		 *
+		 * 2^32 * (N^2 - N)/2
+		 *
+		 * where N is the number of iterations of this loop. The largest
+		 * block size is 32KB, which is 8192 iterations, which yields a
+		 * number less than 2^61, still within the range of a signed int64.
+		 */
+		Assert(BLCKSZ <= 32768 && sum1 >= 0 && sum2 >= 0);
+	}
+
+	/*
+	 * Store the sums as bytes in the checksum. We add one to shift the
+	 * range from 0..254 to 1..255, to make zero invalid for checksum bytes
+	 * (which seems wise).
+	 */
+	p8Checksum[0] = (sum1 % 255) + 1;
+	p8Checksum[1] = (sum2 % 255) + 1;
+
+#ifdef DEBUG_CHECKSUM
+	elog(LOG, "checksum %u", checksum);
+#endif
+
+	return checksum;
+}
diff --git a/src/backend/utils/time/tqual.c b/src/backend/utils/time/tqual.c
index f2c9ff2..24384b4 100644
--- a/src/backend/utils/time/tqual.c
+++ b/src/backend/utils/time/tqual.c
@@ -6,7 +6,7 @@
  * NOTE: all the HeapTupleSatisfies routines will update the tuple's
  * "hint" status bits if we see that the inserting or deleting transaction
  * has now committed or aborted (and it is safe to set the hint bits).
- * If the hint bits are changed, SetBufferCommitInfoNeedsSave is called on
+ * If the hint bits are changed, MarkBufferDirtyHint is called on
  * the passed-in buffer. The caller must hold not only a pin, but at least
  * shared buffer content lock on the buffer containing the tuple.
* @@ -121,7 +121,7 @@ SetHintBits(HeapTupleHeader tuple, Buffer buffer, } tuple->t_infomask |= infomask; - SetBufferCommitInfoNeedsSave(buffer); + MarkBufferDirtyHint(buffer); } /* diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index b501132..27f1d6b 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -120,6 +120,7 @@ static bool noclean = false; static bool do_sync = true; static bool sync_only = false; static bool show_setting = false; +static bool data_checksums = false; static char *xlog_dir = ""; @@ -1442,8 +1443,10 @@ bootstrap_template1(void) unsetenv("PGCLIENTENCODING"); snprintf(cmd, sizeof(cmd), - "\"%s\" --boot -x1 %s %s", - backend_exec, boot_options, talkargs); + "\"%s\" --boot -x1 %s %s %s", + backend_exec, + data_checksums ? "-k" : "", + boot_options, talkargs); PG_CMD_OPEN; @@ -2749,6 +2752,7 @@ usage(const char *progname) printf(_(" -X, --xlogdir=XLOGDIR location for the transaction log directory\n")); printf(_("\nLess commonly used options:\n")); printf(_(" -d, --debug generate lots of debugging output\n")); + printf(_(" -k, --data-checksums data page checksums\n")); printf(_(" -L DIRECTORY where to find the input files\n")); printf(_(" -n, --noclean do not clean up after errors\n")); printf(_(" -N, --nosync do not wait for changes to be written safely to disk\n")); @@ -3425,6 +3429,7 @@ main(int argc, char *argv[]) {"nosync", no_argument, NULL, 'N'}, {"sync-only", no_argument, NULL, 'S'}, {"xlogdir", required_argument, NULL, 'X'}, + {"data-checksums", no_argument, NULL, 'k'}, {NULL, 0, NULL, 0} }; @@ -3456,7 +3461,7 @@ main(int argc, char *argv[]) /* process command-line options */ - while ((c = getopt_long(argc, argv, "dD:E:L:nNU:WA:sST:X:", long_options, &option_index)) != -1) + while ((c = getopt_long(argc, argv, "dD:E:kL:nNU:WA:sST:X:", long_options, &option_index)) != -1) { switch (c) { @@ -3505,6 +3510,9 @@ main(int argc, char *argv[]) case 'S': sync_only = true; break; + case 'k': + data_checksums = true; + break; case 'L': share_path = pg_strdup(optarg); break; diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index cab2568..f959f7e 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -287,5 +287,7 @@ main(int argc, char *argv[]) (ControlFile.float4ByVal ? _("by value") : _("by reference"))); printf(_("Float8 argument passing: %s\n"), (ControlFile.float8ByVal ? _("by value") : _("by reference"))); + printf(_("Data page checksums: %s\n"), + (ControlFile.data_checksums ? _("enabled") : _("disabled"))); return 0; } diff --git a/src/bin/pg_resetxlog/pg_resetxlog.c b/src/bin/pg_resetxlog/pg_resetxlog.c index f075b6e..a912fef 100644 --- a/src/bin/pg_resetxlog/pg_resetxlog.c +++ b/src/bin/pg_resetxlog/pg_resetxlog.c @@ -624,6 +624,8 @@ PrintControlValues(bool guessed) (ControlFile.float4ByVal ? _("by value") : _("by reference"))); printf(_("Float8 argument passing: %s\n"), (ControlFile.float8ByVal ? _("by value") : _("by reference"))); + printf(_("Data page checksums: %s\n"), + (ControlFile.data_checksums ? 
_("enabled") : _("disabled"))); } diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h index 270924a..e58eae5 100644 --- a/src/include/access/heapam_xlog.h +++ b/src/include/access/heapam_xlog.h @@ -279,7 +279,7 @@ extern XLogRecPtr log_heap_clean(Relation reln, Buffer buffer, extern XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer, TransactionId cutoff_xid, MultiXactId cutoff_multi, OffsetNumber *offsets, int offcnt); -extern XLogRecPtr log_heap_visible(RelFileNode rnode, BlockNumber block, +extern XLogRecPtr log_heap_visible(RelFileNode rnode, Buffer heap_buffer, Buffer vm_buffer, TransactionId cutoff_xid); extern XLogRecPtr log_newpage(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blk, Page page); diff --git a/src/include/access/visibilitymap.h b/src/include/access/visibilitymap.h index 99b2dc5..43789c2 100644 --- a/src/include/access/visibilitymap.h +++ b/src/include/access/visibilitymap.h @@ -24,8 +24,8 @@ extern void visibilitymap_clear(Relation rel, BlockNumber heapBlk, extern void visibilitymap_pin(Relation rel, BlockNumber heapBlk, Buffer *vmbuf); extern bool visibilitymap_pin_ok(BlockNumber heapBlk, Buffer vmbuf); -extern void visibilitymap_set(Relation rel, BlockNumber heapBlk, - XLogRecPtr recptr, Buffer vmbuf, TransactionId cutoff_xid); +extern void visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, + XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid); extern bool visibilitymap_test(Relation rel, BlockNumber heapBlk, Buffer *vmbuf); extern BlockNumber visibilitymap_count(Relation rel); extern void visibilitymap_truncate(Relation rel, BlockNumber nheapblocks); diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 8a65492..a24f661 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -267,6 +267,8 @@ extern bool XLogNeedsFlush(XLogRecPtr RecPtr); extern int XLogFileInit(XLogSegNo segno, bool *use_existent, bool use_lock); extern int XLogFileOpen(XLogSegNo segno); +extern XLogRecPtr XLogSaveBufferForHint(Buffer buffer); + extern void CheckXLogRemoved(XLogSegNo segno, TimeLineID tli); extern void XLogSetAsyncXactLSN(XLogRecPtr record); @@ -295,6 +297,7 @@ extern char *XLogFileNameP(TimeLineID tli, XLogSegNo segno); extern void UpdateControlFile(void); extern uint64 GetSystemIdentifier(void); extern XLogRecPtr GetFakeLSNForUnloggedRel(void); +extern bool DataChecksumsEnabled(void); extern Size XLOGShmemSize(void); extern void XLOGShmemInit(void); extern void BootStrapXLOG(void); diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index 306d188..52abe63 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -67,6 +67,7 @@ typedef struct CheckPoint #define XLOG_RESTORE_POINT 0x70 #define XLOG_FPW_CHANGE 0x80 #define XLOG_END_OF_RECOVERY 0x90 +#define XLOG_HINT 0xA0 /* @@ -212,6 +213,9 @@ typedef struct ControlFileData bool float4ByVal; /* float4 pass-by-value? */ bool float8ByVal; /* float8, int8, etc pass-by-value? */ + /* Are data pages protected by checksums? */ + bool data_checksums; + /* CRC of all above ... MUST BE LAST! 
*/ pg_crc32 crc; } ControlFileData; diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 2ad536b..273e6d1 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -203,7 +203,7 @@ extern Size BufferShmemSize(void); extern void BufferGetTag(Buffer buffer, RelFileNode *rnode, ForkNumber *forknum, BlockNumber *blknum); -extern void SetBufferCommitInfoNeedsSave(Buffer buffer); +extern void MarkBufferDirtyHint(Buffer buffer); extern void UnlockBuffers(void); extern void LockBuffer(Buffer buffer, int mode); diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h index 6237fcc..5223b8e 100644 --- a/src/include/storage/bufpage.h +++ b/src/include/storage/bufpage.h @@ -15,6 +15,7 @@ #define BUFPAGE_H #include "access/xlogdefs.h" +#include "storage/block.h" #include "storage/item.h" #include "storage/off.h" @@ -163,14 +164,33 @@ typedef PageHeaderData *PageHeader; * PD_PAGE_FULL is set if an UPDATE doesn't find enough free space in the * page for its new tuple version; this suggests that a prune is needed. * Again, this is just a hint. + * + * PD_CHECKSUMS1 and PD_CHECKSUMS2 indicate the presence of checksums. This + * allows future support for enabling/disabling the use of checksums while the + * system is online. There is some concern that trusting page data to say how + * to check page data is dangerously self-referential. To avoid falsely + * determining that the page has no checksum, we set two non-adjacent bits to + * signify that the page has a checksum and should be verified when that block + * is read back into a buffer. We use two bits in case a multiple bit error + * removes one of the checksum flags *and* destroys data, which would lead to + * skipping the checksum check and silently accepting bad data. We also require + * that a third bit (PD_HEADERCHECK) is zeroed regardless of the presence of a + * checksum. */ #define PD_HAS_FREE_LINES 0x0001 /* are there any unused line pointers? */ #define PD_PAGE_FULL 0x0002 /* not enough free space for new * tuple? */ #define PD_ALL_VISIBLE 0x0004 /* all tuples on page are visible to * everyone */ +#define PD_CHECKSUMS1 0x0008 /* bit indicating the presence of + * checksums */ +#define PD_HEADERCHECK 0x0010 /* always zero -- if set, indicates + * corruption */ + +#define PD_CHECKSUMS2 0x8000 /* bit indicating the presence of + * checksums */ -#define PD_VALID_FLAG_BITS 0x0007 /* OR of all valid pd_flags bits */ +#define PD_VALID_FLAG_BITS 0x801F /* OR of all valid pd_flags bits */ /* * Page layout version number 0 is for pre-7.3 Postgres releases. @@ -378,7 +398,7 @@ do { \ */ extern void PageInit(Page page, Size pageSize, Size specialSize); -extern bool PageHeaderIsValid(PageHeader page); +extern bool PageIsVerified(Page page, BlockNumber blkno); extern OffsetNumber PageAddItem(Page page, Item item, Size size, OffsetNumber offsetNumber, bool overwrite, bool is_heap); extern Page PageGetTempPage(Page page); @@ -391,5 +411,7 @@ extern Size PageGetExactFreeSpace(Page page); extern Size PageGetHeapFreeSpace(Page page); extern void PageIndexTupleDelete(Page page, OffsetNumber offset); extern void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems); +extern char *PageSetChecksumCopy(Page page, BlockNumber blkno); +extern void PageSetChecksumInplace(Page page, BlockNumber blkno); #endif /* BUFPAGE_H */
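As a closing illustration, here is a minimal standalone model of the verification scheme this patch implements. It is a hedged sketch, not patch code: all sketch_* names are hypothetical, and the real logic lives in PageIsVerified(), PageChecksumOK() and PageCalcChecksum16() in bufpage.c. It shows the two non-adjacent checksum flag bits plus the always-zero PD_HEADERCHECK bit, and the modified Fletcher sum folded over 32-bit words, seeded with the block number and reduced modulo 255 with a +1 shift so that neither checksum byte can be zero.

#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>

#define SKETCH_PD_CHECKSUMS1	0x0008	/* mirrors PD_CHECKSUMS1 */
#define SKETCH_PD_HEADERCHECK	0x0010	/* mirrors PD_HEADERCHECK */
#define SKETCH_PD_CHECKSUMS2	0x8000	/* mirrors PD_CHECKSUMS2 */

/* Does pd_flags claim a checksum under the two-bit scheme above? */
static bool
sketch_flags_claim_checksum(uint16_t pd_flags)
{
	uint16_t	mask = SKETCH_PD_CHECKSUMS1 | SKETCH_PD_CHECKSUMS2;

	if (pd_flags & SKETCH_PD_HEADERCHECK)
		return false;			/* this bit must always read as zero */
	return (pd_flags & mask) == mask;
}

/* Modified Fletcher sum over nwords 32-bit words, seeded with blkno */
static uint16_t
sketch_checksum16(const uint32_t *words, size_t nwords, uint32_t blkno)
{
	int64_t		sum1 = blkno;
	int64_t		sum2 = blkno;
	size_t		i;

	for (i = 0; i < nwords; i++)
	{
		sum1 += words[i];
		sum2 += sum1;			/* stays below 2^61 for pages up to 32kB */
	}

	/* fold each sum into 1..255 so a zero checksum byte never occurs */
	return (uint16_t) ((((sum2 % 255) + 1) << 8) | ((sum1 % 255) + 1));
}

The byte placement in the return value is illustrative only; the patch stores the two bytes through a uint8 pointer into a uint16, so its on-page layout depends on endianness. The patch also checksums the page header through a copy with pd_checksum zeroed, which this sketch leaves out.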