*** a/contrib/pg_upgrade/controldata.c
--- b/contrib/pg_upgrade/controldata.c
***************
*** 56,61 **** get_control_data(ClusterInfo *cluster, bool live_check)
--- 56,62 ----
  	bool		got_toast = false;
  	bool		got_date_is_int = false;
  	bool		got_float8_pass_by_value = false;
+ 	bool		got_data_checksums = false;
  	char	   *lc_collate = NULL;
  	char	   *lc_ctype = NULL;
  	char	   *lc_monetary = NULL;
***************
*** 131,136 **** get_control_data(ClusterInfo *cluster, bool live_check)
--- 132,144 ----
  		got_float8_pass_by_value = true;
  	}
  
+ 	/* Only in <= 9.2 */
+ 	if (GET_MAJOR_VERSION(cluster->major_version) <= 902)
+ 	{
+ 		cluster->controldata.data_checksums = false;
+ 		got_data_checksums = true;
+ 	}
+ 
  	/* we have the result of cmd in "output". so parse it line by line now */
  	while (fgets(bufin, sizeof(bufin), output))
  	{
***************
*** 393,398 **** get_control_data(ClusterInfo *cluster, bool live_check)
--- 401,418 ----
  			cluster->controldata.float8_pass_by_value = strstr(p, "by value") != NULL;
  			got_float8_pass_by_value = true;
  		}
+ 		else if ((p = strstr(bufin, "checksums")) != NULL)
+ 		{
+ 			p = strchr(p, ':');
+ 
+ 			if (p == NULL || strlen(p) <= 1)
+ 				pg_log(PG_FATAL, "%d: controldata retrieval problem\n", __LINE__);
+ 
+ 			p++;				/* skip past the ':' */
+ 			/* used later for contrib check */
+ 			cluster->controldata.data_checksums = strstr(p, "enabled") != NULL;
+ 			got_data_checksums = true;
+ 		}
  		/* In pre-8.4 only */
  		else if ((p = strstr(bufin, "LC_COLLATE:")) != NULL)
  		{
***************
*** 475,481 **** get_control_data(ClusterInfo *cluster, bool live_check)
  		!got_tli ||
  		!got_align || !got_blocksz || !got_largesz || !got_walsz ||
  		!got_walseg || !got_ident || !got_index || !got_toast ||
! 		!got_date_is_int || !got_float8_pass_by_value)
  	{
  		pg_log(PG_REPORT,
  			   "The %s cluster lacks some required control information:\n",
--- 495,501 ----
  		!got_tli ||
  		!got_align || !got_blocksz || !got_largesz || !got_walsz ||
  		!got_walseg || !got_ident || !got_index || !got_toast ||
! 		!got_date_is_int || !got_float8_pass_by_value || !got_data_checksums)
  	{
  		pg_log(PG_REPORT,
  			   "The %s cluster lacks some required control information:\n",
***************
*** 533,538 **** get_control_data(ClusterInfo *cluster, bool live_check)
--- 553,562 ----
  		if (!got_float8_pass_by_value)
  			pg_log(PG_REPORT, "  float8 argument passing method\n");
  
+ 		/* value added in Postgres 9.3 */
+ 		if (!got_data_checksums)
+ 			pg_log(PG_REPORT, "  data checksums\n");
+ 
  		pg_log(PG_FATAL,
  			   "Cannot continue without required control information, terminating\n");
  	}
***************
*** 594,599 **** check_control_data(ControlData *oldctrl,
--- 618,629 ----
  				 "--disable-integer-datetimes or get server binaries built with those\n"
  				 "options.\n");
  	}
+ 
+ 	if (oldctrl->data_checksums != newctrl->data_checksums)
+ 	{
+ 		pg_log(PG_FATAL,
+ 			   "old and new pg_controldata data checksum settings are invalid or do not match\n");
+ 	}
  }
*** a/contrib/pg_upgrade/pg_upgrade.h
--- b/contrib/pg_upgrade/pg_upgrade.h
***************
*** 199,204 **** typedef struct
--- 199,205 ----
  	uint32		toast;
  	bool		date_is_int;
  	bool		float8_pass_by_value;
+ 	bool		data_checksums;
  	char	   *lc_collate;
  	char	   *lc_ctype;
  	char	   *encoding;
*** a/doc/src/sgml/ref/initdb.sgml
--- b/doc/src/sgml/ref/initdb.sgml
***************
*** 183,188 **** PostgreSQL documentation
--- 183,201 ----
+      <varlistentry>
+       <term><option>-k</option></term>
+       <term><option>--data-checksums</option></term>
+       <listitem>
+        <para>
+         Use checksums on data pages to help detect corruption by the
+         I/O system that would otherwise be silent.  Enabling checksums
+         may incur a slight performance penalty.  This option can only
+         be set during initialization, and cannot be changed later.  See
+         <xref linkend="checksums">.
+        </para>
+       </listitem>
+      </varlistentry>
+ 
*** a/doc/src/sgml/wal.sgml
--- b/doc/src/sgml/wal.sgml
***************
*** 177,182 ****
--- 177,208 ----
     (BBU) disk controllers do not prevent partial page writes unless
     they guarantee that data is written to the BBU as full (8kB) pages.
    </para>
+ 
+  <sect1 id="checksums">
+   <title>Checksums</title>
+ 
+   <indexterm>
+    <primary>checksums</primary>
+   </indexterm>
+ 
+   <para>
+    Even data recorded to disk may be lost due to media failure or
+    other corruption.  While PostgreSQL cannot do anything to prevent
+    such loss, checksums allow early detection of those
+    problems.  Detecting such corruption quickly is crucial before
+    taking a backup or rebuilding a replication slave; otherwise,
+    there is a chance that the corruption could make it to the backup
+    or replica.
+   </para>
+ 
+   <para>
+    The WAL is always protected by a checksum, which prevents
+    corrupted WAL records from being replayed during recovery.  To
+    protect data pages, so that corrupt data pages aren't read into
+    shared memory, checksums must be enabled using
+    <application>initdb</application>'s <option>--data-checksums</option>
+    option.
+   </para>
+  </sect1>
+ 
*** a/src/backend/access/hash/hash.c
--- b/src/backend/access/hash/hash.c
***************
*** 287,295 **** hashgettuple(PG_FUNCTION_ARGS)
  			/*
  			 * Since this can be redone later if needed, it's treated the same
  			 * as a commit-hint-bit status update for heap tuples: we mark the
! 			 * buffer dirty but don't make a WAL log entry.
  			 */
! 			SetBufferCommitInfoNeedsSave(buf);
  		}
  
  		/*
--- 287,296 ----
  			/*
  			 * Since this can be redone later if needed, it's treated the same
  			 * as a commit-hint-bit status update for heap tuples: we mark the
! 			 * buffer dirty, but avoid writing WAL unless we require a
! 			 * full-page image (e.g. if checksums are enabled).
  			 */
! 			MarkBufferDirtyHint(buf);
  		}
  
  		/*
*** a/src/backend/access/hash/hashpage.c
--- b/src/backend/access/hash/hashpage.c
***************
*** 712,717 **** _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
--- 712,718 ----
  	MemSet(zerobuf, 0, sizeof(zerobuf));
  
  	RelationOpenSmgr(rel);
+ 	/* no need to set page checksum for all-zero pages */
  	smgrextend(rel->rd_smgr, MAIN_FORKNUM, lastblock, zerobuf, false);
  
  	return true;
*** a/src/backend/access/heap/heapam.c
--- b/src/backend/access/heap/heapam.c
***************
*** 4859,4865 **** l4:
  		recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_LOCK_UPDATED, rdata);
  
  		PageSetLSN(page, recptr);
- 		PageSetTLI(page, ThisTimeLineID);
  	}
  
  	END_CRIT_SECTION();
--- 4859,4864 ----
***************
*** 5714,5730 **** log_heap_freeze(Relation reln, Buffer buffer,
   * being marked all-visible, and vm_buffer is the buffer containing the
   * corresponding visibility map block.  Both should have already been modified
   * and dirtied.
   */
  XLogRecPtr
! log_heap_visible(RelFileNode rnode, BlockNumber block, Buffer vm_buffer,
  				 TransactionId cutoff_xid)
  {
  	xl_heap_visible xlrec;
  	XLogRecPtr	recptr;
! 	XLogRecData rdata[2];
  
  	xlrec.node = rnode;
! 	xlrec.block = block;
  	xlrec.cutoff_xid = cutoff_xid;
  
  	rdata[0].data = (char *) &xlrec;
--- 5713,5735 ----
   * being marked all-visible, and vm_buffer is the buffer containing the
   * corresponding visibility map block.  Both should have already been modified
   * and dirtied.
+  *
+  * If checksums are enabled, we also add the heap_buffer to the chain to
+  * protect it from being torn.
   */
  XLogRecPtr
! log_heap_visible(RelFileNode rnode, Buffer heap_buffer, Buffer vm_buffer,
  				 TransactionId cutoff_xid)
  {
  	xl_heap_visible xlrec;
  	XLogRecPtr	recptr;
! 	XLogRecData rdata[3];
! 
! 	Assert(BufferIsValid(heap_buffer));
! 	Assert(BufferIsValid(vm_buffer));
  
  	xlrec.node = rnode;
! 	xlrec.block = BufferGetBlockNumber(heap_buffer);
  	xlrec.cutoff_xid = cutoff_xid;
  
  	rdata[0].data = (char *) &xlrec;
***************
*** 5738,5743 **** log_heap_visible(RelFileNode rnode, BlockNumber block, Buffer vm_buffer,
--- 5743,5759 ----
  	rdata[1].buffer_std = false;
  	rdata[1].next = NULL;
  
+ 	if (DataChecksumsEnabled())
+ 	{
+ 		rdata[1].next = &(rdata[2]);
+ 
+ 		rdata[2].data = NULL;
+ 		rdata[2].len = 0;
+ 		rdata[2].buffer = heap_buffer;
+ 		rdata[2].buffer_std = true;
+ 		rdata[2].next = NULL;
+ 	}
+ 
  	recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_VISIBLE, rdata);
  
  	return recptr;
***************
*** 6099,6106 ****
  static void
  heap_xlog_visible(XLogRecPtr lsn, XLogRecord *record)
  {
  	xl_heap_visible *xlrec = (xl_heap_visible *) XLogRecGetData(record);
- 	Buffer		buffer;
- 	Page		page;
  
  	/*
  	 * If there are any Hot Standby transactions running that have an xmin
--- 6115,6120 ----
***************
*** 6115,6153 **** heap_xlog_visible(XLogRecPtr lsn, XLogRecord *record)
  		ResolveRecoveryConflictWithSnapshot(xlrec->cutoff_xid, xlrec->node);
  
  	/*
! 	 * Read the heap page, if it still exists.  If the heap file has been
! 	 * dropped or truncated later in recovery, we don't need to update the
! 	 * page, but we'd better still update the visibility map.
  	 */
! 	buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, xlrec->block,
! 									RBM_NORMAL);
! 	if (BufferIsValid(buffer))
  	{
! 		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
! 
! 		page = (Page) BufferGetPage(buffer);
  
  		/*
! 		 * We don't bump the LSN of the heap page when setting the visibility
! 		 * map bit, because that would generate an unworkable volume of
! 		 * full-page writes.  This exposes us to torn page hazards, but since
! 		 * we're not inspecting the existing page contents in any way, we
! 		 * don't care.
! 		 *
! 		 * However, all operations that clear the visibility map bit *do* bump
! 		 * the LSN, and those operations will only be replayed if the XLOG LSN
! 		 * follows the page LSN.  Thus, if the page LSN has advanced past our
! 		 * XLOG record's LSN, we mustn't mark the page all-visible, because
! 		 * the subsequent update won't be replayed to clear the flag.
  		 */
! 		if (lsn > PageGetLSN(page))
  		{
! 			PageSetAllVisible(page);
! 			MarkBufferDirty(buffer);
! 		}
! 
! 		/* Done with heap page. */
! 		UnlockReleaseBuffer(buffer);
  	}
  
  	/*
--- 6129,6184 ----
  		ResolveRecoveryConflictWithSnapshot(xlrec->cutoff_xid, xlrec->node);
  
  	/*
! 	 * If heap block was backed up, restore it.  This can only happen with
! 	 * checksums enabled.
  	 */
! 	if (record->xl_info & XLR_BKP_BLOCK(1))
  	{
! 		Assert(DataChecksumsEnabled());
! 		(void) RestoreBackupBlock(lsn, record, 1, false, false);
! 	}
! 	else
! 	{
! 		Buffer		buffer;
! 		Page		page;
  
  		/*
! 		 * Read the heap page, if it still exists.  If the heap file has been
! 		 * dropped or truncated later in recovery, we don't need to update the
! 		 * page, but we'd better still update the visibility map.
  		 */
! 		buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM,
! 										xlrec->block, RBM_NORMAL);
! 		if (BufferIsValid(buffer))
  		{
! 			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
! 			page = (Page) BufferGetPage(buffer);
! 
! 			/*
! 			 * We don't bump the LSN of the heap page when setting the
! 			 * visibility map bit (unless checksums are enabled, in which case
! 			 * we must), because that would generate an unworkable volume of
! 			 * full-page writes.  This exposes us to torn page hazards, but
! 			 * since we're not inspecting the existing page contents in any
! 			 * way, we don't care.
! 			 *
! 			 * However, all operations that clear the visibility map bit *do*
! 			 * bump the LSN, and those operations will only be replayed if the
! 			 * XLOG LSN follows the page LSN.  Thus, if the page LSN has
! 			 * advanced past our XLOG record's LSN, we mustn't mark the page
! 			 * all-visible, because the subsequent update won't be replayed to
! 			 * clear the flag.
! 			 */
! 			if (lsn > PageGetLSN(page))
! 			{
! 				PageSetAllVisible(page);
! 				MarkBufferDirty(buffer);
! 			}
! 
! 			/* Done with heap page. */
! 			UnlockReleaseBuffer(buffer);
! 		}
  	}
  
  	/*
***************
*** 6178,6184 **** heap_xlog_visible(XLogRecPtr lsn, XLogRecord *record)
  		 * real harm is done; and the next VACUUM will fix it.
  		 */
  		if (lsn > PageGetLSN(BufferGetPage(vmbuffer)))
! 			visibilitymap_set(reln, xlrec->block, lsn, vmbuffer,
  							  xlrec->cutoff_xid);
  
  		ReleaseBuffer(vmbuffer);
--- 6209,6215 ----
  		 * real harm is done; and the next VACUUM will fix it.
  		 */
  		if (lsn > PageGetLSN(BufferGetPage(vmbuffer)))
! 			visibilitymap_set(reln, xlrec->block, InvalidBuffer, lsn,
  							  vmbuffer, xlrec->cutoff_xid);
  
  		ReleaseBuffer(vmbuffer);
***************
*** 6927,6933 **** heap_xlog_lock_updated(XLogRecPtr lsn, XLogRecord *record)
  	HeapTupleHeaderSetXmax(htup, xlrec->xmax);
  
  	PageSetLSN(page, lsn);
- 	PageSetTLI(page, ThisTimeLineID);
  	MarkBufferDirty(buffer);
  	UnlockReleaseBuffer(buffer);
  }
--- 6958,6963 ----
*** a/src/backend/access/heap/pruneheap.c
--- b/src/backend/access/heap/pruneheap.c
***************
*** 262,268 **** heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
  		{
  			((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;
  			PageClearFull(page);
! 			SetBufferCommitInfoNeedsSave(buffer);
  		}
  	}
--- 262,268 ----
  		{
  			((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;
  			PageClearFull(page);
! 			MarkBufferDirtyHint(buffer);
  		}
  	}
*** a/src/backend/access/heap/rewriteheap.c
--- b/src/backend/access/heap/rewriteheap.c
***************
*** 273,278 **** end_heap_rewrite(RewriteState state)
--- 273,280 ----
  	/* Write the last page, if any */
  	if (state->rs_buffer_valid)
  	{
+ 		PageSetChecksumInplace(state->rs_buffer, state->rs_blockno);
+ 
  		if (state->rs_use_wal)
  			log_newpage(&state->rs_new_rel->rd_node,
  						MAIN_FORKNUM,
***************
*** 614,619 **** raw_heap_insert(RewriteState state, HeapTuple tup)
--- 616,623 ----
  		{
  			/* Doesn't fit, so write out the existing page */
  
+ 			PageSetChecksumInplace(page, state->rs_blockno);
+ 
  			/* XLOG stuff */
  			if (state->rs_use_wal)
  				log_newpage(&state->rs_new_rel->rd_node,
*** a/src/backend/access/heap/visibilitymap.c
--- b/src/backend/access/heap/visibilitymap.c
***************
*** 233,245 **** visibilitymap_pin_ok(BlockNumber heapBlk, Buffer buf)
   * marked all-visible; it is needed for Hot Standby, and can be
   * InvalidTransactionId if the page contains no tuples.
   *
   * You must pass a buffer containing the correct map page to this function.
   * Call visibilitymap_pin first to pin the right one.  This function doesn't do
   * any I/O.
   */
  void
! visibilitymap_set(Relation rel, BlockNumber heapBlk, XLogRecPtr recptr,
! 				  Buffer buf, TransactionId cutoff_xid)
  {
  	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
  	uint32		mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
--- 233,250 ----
   * marked all-visible; it is needed for Hot Standby, and can be
   * InvalidTransactionId if the page contains no tuples.
   *
+  * Caller is expected to set the heap page's PD_ALL_VISIBLE bit before calling
+  * this function.  Except in recovery, caller should also pass the heap
+  * buffer.  When checksums are enabled and we're not in recovery, we must add
+  * the heap buffer to the WAL chain to protect it from being torn.
+  *
   * You must pass a buffer containing the correct map page to this function.
   * Call visibilitymap_pin first to pin the right one.  This function doesn't do
   * any I/O.
   */
  void
! visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
! 				  XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid)
  {
  	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
  	uint32		mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
***************
*** 252,285 **** visibilitymap_set(Relation rel, BlockNumber heapBlk, XLogRecPtr recptr,
  #endif
  
  	Assert(InRecovery || XLogRecPtrIsInvalid(recptr));
  
! 	/* Check that we have the right page pinned */
! 	if (!BufferIsValid(buf) || BufferGetBlockNumber(buf) != mapBlock)
! 		elog(ERROR, "wrong buffer passed to visibilitymap_set");
  
! 	page = BufferGetPage(buf);
  	map = PageGetContents(page);
! 	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
  
  	if (!(map[mapByte] & (1 << mapBit)))
  	{
  		START_CRIT_SECTION();
  
  		map[mapByte] |= (1 << mapBit);
! 		MarkBufferDirty(buf);
  
  		if (RelationNeedsWAL(rel))
  		{
  			if (XLogRecPtrIsInvalid(recptr))
! 				recptr = log_heap_visible(rel->rd_node, heapBlk, buf,
  										  cutoff_xid);
  			PageSetLSN(page, recptr);
  		}
  
  		END_CRIT_SECTION();
  	}
  
! 	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
  }
  
  /*
--- 257,311 ----
  #endif
  
  	Assert(InRecovery || XLogRecPtrIsInvalid(recptr));
+ 	Assert(InRecovery || BufferIsValid(heapBuf));
  
! 	/* Check that we have the right heap page pinned, if present */
! 	if (BufferIsValid(heapBuf) && BufferGetBlockNumber(heapBuf) != heapBlk)
! 		elog(ERROR, "wrong heap buffer passed to visibilitymap_set");
  
! 	/* Check that we have the right VM page pinned */
! 	if (!BufferIsValid(vmBuf) || BufferGetBlockNumber(vmBuf) != mapBlock)
! 		elog(ERROR, "wrong VM buffer passed to visibilitymap_set");
! 
! 	page = BufferGetPage(vmBuf);
  	map = PageGetContents(page);
! 	LockBuffer(vmBuf, BUFFER_LOCK_EXCLUSIVE);
  
  	if (!(map[mapByte] & (1 << mapBit)))
  	{
  		START_CRIT_SECTION();
  
  		map[mapByte] |= (1 << mapBit);
! 		MarkBufferDirty(vmBuf);
  
  		if (RelationNeedsWAL(rel))
  		{
  			if (XLogRecPtrIsInvalid(recptr))
! 			{
! 				Assert(!InRecovery);
! 				recptr = log_heap_visible(rel->rd_node, heapBuf, vmBuf,
  										  cutoff_xid);
+ 
+ 				/*
+ 				 * If data checksums are enabled, we need to protect the heap
+ 				 * page from being torn.
+ 				 */
+ 				if (DataChecksumsEnabled())
+ 				{
+ 					Page		heapPage = BufferGetPage(heapBuf);
+ 
+ 					/* caller is expected to set PD_ALL_VISIBLE first */
+ 					Assert(PageIsAllVisible(heapPage));
+ 					PageSetLSN(heapPage, recptr);
+ 				}
+ 			}
  			PageSetLSN(page, recptr);
  		}
  
  		END_CRIT_SECTION();
  	}
  
! 	LockBuffer(vmBuf, BUFFER_LOCK_UNLOCK);
  }
  
  /*
***************
*** 579,584 **** vm_extend(Relation rel, BlockNumber vm_nblocks)
--- 605,612 ----
  	/* Now extend the file */
  	while (vm_nblocks_now < vm_nblocks)
  	{
+ 		PageSetChecksumInplace(pg, vm_nblocks_now);
+ 
  		smgrextend(rel->rd_smgr, VISIBILITYMAP_FORKNUM, vm_nblocks_now,
  				   (char *) pg, false);
  		vm_nblocks_now++;
*** a/src/backend/access/nbtree/nbtinsert.c
--- b/src/backend/access/nbtree/nbtinsert.c
***************
*** 405,415 **** _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
  					 */
  					ItemIdMarkDead(curitemid);
  					opaque->btpo_flags |= BTP_HAS_GARBAGE;
! 					/* be sure to mark the proper buffer dirty... */
  					if (nbuf != InvalidBuffer)
! 						SetBufferCommitInfoNeedsSave(nbuf);
  					else
! 						SetBufferCommitInfoNeedsSave(buf);
  				}
  			}
  		}
--- 405,420 ----
  					 */
  					ItemIdMarkDead(curitemid);
  					opaque->btpo_flags |= BTP_HAS_GARBAGE;
! 
! 					/*
! 					 * Be sure to mark the proper buffer dirty.  If checksums
! 					 * are enabled, this may also require a full-page image
! 					 * (see comments in MarkBufferDirtyHint).
! 					 */
  					if (nbuf != InvalidBuffer)
! 						MarkBufferDirtyHint(nbuf);
  					else
! 						MarkBufferDirtyHint(buf);
  				}
  			}
  		}
*** a/src/backend/access/nbtree/nbtree.c
--- b/src/backend/access/nbtree/nbtree.c
***************
*** 217,222 **** btbuildempty(PG_FUNCTION_ARGS)
--- 217,223 ----
  	_bt_initmetapage(metapage, P_NONE, 0);
  
  	/* Write the page.  If archiving/streaming, XLOG it. */
+ 	PageSetChecksumInplace(metapage, BTREE_METAPAGE);
  	smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE,
  			  (char *) metapage, true);
  	if (XLogIsNeeded())
***************
*** 1051,1057 **** restart:
  			opaque->btpo_cycleid == vstate->cycleid)
  		{
  			opaque->btpo_cycleid = 0;
! 			SetBufferCommitInfoNeedsSave(buf);
  		}
  	}
--- 1052,1058 ----
  			opaque->btpo_cycleid == vstate->cycleid)
  		{
  			opaque->btpo_cycleid = 0;
! 			MarkBufferDirtyHint(buf);
  		}
  	}
*** a/src/backend/access/nbtree/nbtsort.c
--- b/src/backend/access/nbtree/nbtsort.c
***************
*** 284,295 **** _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
--- 284,298 ----
  	{
  		if (!wstate->btws_zeropage)
  			wstate->btws_zeropage = (Page) palloc0(BLCKSZ);
+ 		/* no need to set checksum for all-zero pages */
  		smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM,
  				   wstate->btws_pages_written++,
  				   (char *) wstate->btws_zeropage,
  				   true);
  	}
  
+ 	PageSetChecksumInplace(page, blkno);
+ 
  	/*
  	 * Now write the page.  There's no need for smgr to schedule an fsync for
  	 * this write; we'll do it ourselves before ending the build.
*** a/src/backend/access/nbtree/nbtutils.c
--- b/src/backend/access/nbtree/nbtutils.c
***************
*** 1783,1789 **** _bt_killitems(IndexScanDesc scan, bool haveLock)
  	/*
  	 * Since this can be redone later if needed, it's treated the same as a
  	 * commit-hint-bit status update for heap tuples: we mark the buffer dirty
! 	 * but don't make a WAL log entry.
  	 *
  	 * Whenever we mark anything LP_DEAD, we also set the page's
  	 * BTP_HAS_GARBAGE flag, which is likewise just a hint.
--- 1783,1790 ----
  	/*
  	 * Since this can be redone later if needed, it's treated the same as a
  	 * commit-hint-bit status update for heap tuples: we mark the buffer dirty
! 	 * but avoid writing WAL unless we require a full-page image (e.g. if
! 	 * checksums are enabled).
  	 *
  	 * Whenever we mark anything LP_DEAD, we also set the page's
  	 * BTP_HAS_GARBAGE flag, which is likewise just a hint.
***************
*** 1791,1797 **** _bt_killitems(IndexScanDesc scan, bool haveLock)
  	if (killedsomething)
  	{
  		opaque->btpo_flags |= BTP_HAS_GARBAGE;
! 		SetBufferCommitInfoNeedsSave(so->currPos.buf);
  	}
  
  	if (!haveLock)
--- 1792,1798 ----
  	if (killedsomething)
  	{
  		opaque->btpo_flags |= BTP_HAS_GARBAGE;
! 		MarkBufferDirtyHint(so->currPos.buf);
  	}
  
  	if (!haveLock)
*** a/src/backend/access/rmgrdesc/xlogdesc.c
--- b/src/backend/access/rmgrdesc/xlogdesc.c
***************
*** 79,84 **** xlog_desc(StringInfo buf, uint8 xl_info, char *rec)
--- 79,88 ----
  
  		appendStringInfo(buf, "restore point: %s", xlrec->rp_name);
  	}
+ 	else if (info == XLOG_HINT)
+ 	{
+ 		appendStringInfo(buf, "page hint");
+ 	}
  	else if (info == XLOG_BACKUP_END)
  	{
  		XLogRecPtr	startpoint;
*** a/src/backend/access/spgist/spginsert.c
--- b/src/backend/access/spgist/spginsert.c
***************
*** 154,159 **** spgbuildempty(PG_FUNCTION_ARGS)
--- 154,160 ----
  	SpGistInitMetapage(page);
  
  	/* Write the page.  If archiving/streaming, XLOG it. */
+ 	PageSetChecksumInplace(page, SPGIST_METAPAGE_BLKNO);
  	smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_METAPAGE_BLKNO,
  			  (char *) page, true);
  	if (XLogIsNeeded())
***************
*** 163,168 **** spgbuildempty(PG_FUNCTION_ARGS)
--- 164,170 ----
  	/* Likewise for the root page. */
  	SpGistInitPage(page, SPGIST_LEAF);
  
+ 	PageSetChecksumInplace(page, SPGIST_ROOT_BLKNO);
  	smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_ROOT_BLKNO,
  			  (char *) page, true);
  	if (XLogIsNeeded())
***************
*** 172,177 **** spgbuildempty(PG_FUNCTION_ARGS)
--- 174,180 ----
  	/* Likewise for the null-tuples root page. */
  	SpGistInitPage(page, SPGIST_LEAF | SPGIST_NULLS);
  
+ 	PageSetChecksumInplace(page, SPGIST_NULL_BLKNO);
  	smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_NULL_BLKNO,
  			  (char *) page, true);
  	if (XLogIsNeeded())
*** a/src/backend/access/transam/xlog.c
--- b/src/backend/access/transam/xlog.c
***************
*** 61,66 ****
--- 61,67 ----
  #include "utils/timestamp.h"
  #include "pg_trace.h"
  
+ extern bool bootstrap_data_checksums;
  
  /* File path names (all relative to $PGDATA) */
  #define RECOVERY_COMMAND_FILE	"recovery.conf"
***************
*** 699,704 **** XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
--- 700,706 ----
  	bool		updrqst;
  	bool		doPageWrites;
  	bool		isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
+ 	bool		isHint = (rmid == RM_XLOG_ID && info == XLOG_HINT);
  	uint8		info_orig = info;
  	static XLogRecord *rechdr;
***************
*** 969,974 **** begin:;
--- 971,988 ----
  	}
  
  	/*
+ 	 * If this is a hint record and we don't need a backup block, then we
+ 	 * have no more work to do and can exit quickly without inserting a WAL
+ 	 * record at all.  In that case return InvalidXLogRecPtr.
+ 	 */
+ 	if (isHint && !(info & XLR_BKP_BLOCK_MASK))
+ 	{
+ 		LWLockRelease(WALInsertLock);
+ 		END_CRIT_SECTION();
+ 		return InvalidXLogRecPtr;
+ 	}
+ 
+ 	/*
  	 * If the current page is completely full, the record goes to the next
  	 * page, right after the page header.
  	 */
***************
*** 3156,3161 **** RestoreBackupBlock(XLogRecPtr lsn, XLogRecord *record, int block_index,
--- 3170,3180 ----
  			   BLCKSZ - (bkpb.hole_offset + bkpb.hole_length));
  	}
  
+ 	/*
+ 	 * Any checksum set on this page will be invalid.  We don't need
+ 	 * to reset it here since it will be set before being written.
+ 	 */
+ 
  	PageSetLSN(page, lsn);
  	MarkBufferDirty(buffer);
***************
*** 3682,3687 **** GetSystemIdentifier(void)
--- 3701,3716 ----
  }
  
  /*
+  * Are checksums enabled for data pages?
+  */
+ bool
+ DataChecksumsEnabled(void)
+ {
+ 	Assert(ControlFile != NULL);
+ 	return ControlFile->data_checksums;
+ }
+ 
+ /*
   * Auto-tune the number of XLOG buffers.
   *
   * The preferred setting for wal_buffers is about 3% of shared_buffers, with
***************
*** 3979,3984 **** BootStrapXLOG(void)
--- 4008,4014 ----
  	ControlFile->max_prepared_xacts = max_prepared_xacts;
  	ControlFile->max_locks_per_xact = max_locks_per_xact;
  	ControlFile->wal_level = wal_level;
+ 	ControlFile->data_checksums = bootstrap_data_checksums;
  
  	/* some additional ControlFile fields are set in WriteControlFile() */
  
***************
*** 7291,7296 **** XLogRestorePoint(const char *rpName)
--- 7321,7371 ----
  }
  
  /*
+  * Write a backup block if needed when we are setting a hint.  Note that
+  * this may be called for a variety of page types, not just heaps.
+  *
+  * Deciding the "if needed" part is delicate and requires us to either
+  * grab WALInsertLock or check the info_lck spinlock.  If we check the
+  * spinlock and it says yes, then we will need to get WALInsertLock as well,
+  * so the design choice here is to just go straight for the WALInsertLock
+  * and trust that calls to this function are minimised elsewhere.
+  *
+  * Callable while holding just share lock on the buffer content.
+  *
+  * It is possible that multiple concurrent backends could attempt to write
+  * WAL records.
+  * In that case, more than one backup block may be recorded, though that
+  * isn't important to the outcome, and the backup blocks are likely to be
+  * identical anyway.
+  */
+ #define XLOG_HINT_WATERMARK		13579
+ XLogRecPtr
+ XLogSaveBufferForHint(Buffer buffer)
+ {
+ 	/*
+ 	 * Make an XLOG entry reporting the hint
+ 	 */
+ 	XLogRecData rdata[2];
+ 	int			watermark = XLOG_HINT_WATERMARK;
+ 
+ 	/*
+ 	 * Not allowed to have zero-length records, so use a small watermark
+ 	 */
+ 	rdata[0].data = (char *) (&watermark);
+ 	rdata[0].len = sizeof(int);
+ 	rdata[0].buffer = InvalidBuffer;
+ 	rdata[0].buffer_std = false;
+ 	rdata[0].next = &(rdata[1]);
+ 
+ 	rdata[1].data = NULL;
+ 	rdata[1].len = 0;
+ 	rdata[1].buffer = buffer;
+ 	rdata[1].buffer_std = true;
+ 	rdata[1].next = NULL;
+ 
+ 	return XLogInsert(RM_XLOG_ID, XLOG_HINT, rdata);
+ }
+ 
+ /*
   * Check if any of the GUC parameters that are critical for hot standby
   * have changed, and update the value in pg_control file if necessary.
   */
***************
*** 7451,7458 **** xlog_redo(XLogRecPtr lsn, XLogRecord *record)
  {
  	uint8		info = record->xl_info & ~XLR_INFO_MASK;
  
! 	/* Backup blocks are not used in xlog records */
! 	Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
  
  	if (info == XLOG_NEXTOID)
  	{
--- 7526,7533 ----
  {
  	uint8		info = record->xl_info & ~XLR_INFO_MASK;
  
! 	/* Backup blocks are not used in most xlog records */
! 	Assert(info == XLOG_HINT || !(record->xl_info & XLR_BKP_BLOCK_MASK));
  
  	if (info == XLOG_NEXTOID)
  	{
***************
*** 7624,7629 **** xlog_redo(XLogRecPtr lsn, XLogRecord *record)
--- 7699,7732 ----
  	{
  		/* nothing to do here */
  	}
+ 	else if (info == XLOG_HINT)
+ 	{
+ #ifdef USE_ASSERT_CHECKING
+ 		int		   *watermark = (int *) XLogRecGetData(record);
+ #endif
+ 
+ 		/* Check the watermark is correct for the hint record */
+ 		Assert(*watermark == XLOG_HINT_WATERMARK);
+ 
+ 		/* Backup blocks must be present for smgr hint records */
+ 		Assert(record->xl_info & XLR_BKP_BLOCK_MASK);
+ 
+ 		/*
+ 		 * Hint records have no information that needs to be replayed.
+ 		 * The sole purpose of them is to ensure that a hint bit does
+ 		 * not cause a checksum invalidation if a hint bit write should
+ 		 * cause a torn page.  So the body of the record is empty, but
+ 		 * there must be one backup block.
+ 		 *
+ 		 * Since the only change in the backup block is a hint bit,
+ 		 * there is no conflict with Hot Standby.
+ 		 *
+ 		 * This also means there is no corresponding API call for this,
+ 		 * so an smgr implementation has no need to implement anything,
+ 		 * which means nothing is needed in md.c etc.
+ 		 */
+ 		RestoreBackupBlock(lsn, record, 0, false, false);
+ 	}
  	else if (info == XLOG_BACKUP_END)
  	{
  		XLogRecPtr	startpoint;
*** a/src/backend/bootstrap/bootstrap.c
--- b/src/backend/bootstrap/bootstrap.c
***************
*** 48,53 ****
--- 48,55 ----
  extern int	optind;
  extern char *optarg;
  
+ bool		bootstrap_data_checksums = false;
+ 
  
  #define ALLOC(t, c) ((t *) calloc((unsigned)(c), sizeof(t)))
***************
*** 233,239 **** AuxiliaryProcessMain(int argc, char *argv[])
  	/* If no -x argument, we are a CheckerProcess */
  	MyAuxProcType = CheckerProcess;
  
! 	while ((flag = getopt(argc, argv, "B:c:d:D:Fr:x:-:")) != -1)
  	{
  		switch (flag)
  		{
--- 235,241 ----
  	/* If no -x argument, we are a CheckerProcess */
  	MyAuxProcType = CheckerProcess;
  
! 	while ((flag = getopt(argc, argv, "B:c:d:D:Fkr:x:-:")) != -1)
  	{
  		switch (flag)
  		{
***************
*** 259,264 **** AuxiliaryProcessMain(int argc, char *argv[])
--- 261,269 ----
  			case 'F':
  				SetConfigOption("fsync", "false", PGC_POSTMASTER, PGC_S_ARGV);
  				break;
+ 			case 'k':
+ 				bootstrap_data_checksums = true;
+ 				break;
  			case 'r':
  				strlcpy(OutputFileName, optarg, MAXPGPATH);
  				break;
*** a/src/backend/commands/sequence.c
--- b/src/backend/commands/sequence.c
***************
*** 1115,1121 **** read_seq_tuple(SeqTable elm, Relation rel, Buffer *buf, HeapTuple seqtuple)
  		HeapTupleHeaderSetXmax(seqtuple->t_data, InvalidTransactionId);
  		seqtuple->t_data->t_infomask &= ~HEAP_XMAX_COMMITTED;
  		seqtuple->t_data->t_infomask |= HEAP_XMAX_INVALID;
! 		SetBufferCommitInfoNeedsSave(*buf);
  	}
  
  	seq = (Form_pg_sequence) GETSTRUCT(seqtuple);
--- 1115,1121 ----
  		HeapTupleHeaderSetXmax(seqtuple->t_data, InvalidTransactionId);
  		seqtuple->t_data->t_infomask &= ~HEAP_XMAX_COMMITTED;
  		seqtuple->t_data->t_infomask |= HEAP_XMAX_INVALID;
! 		MarkBufferDirtyHint(*buf);
  	}
  
  	seq = (Form_pg_sequence) GETSTRUCT(seqtuple);
*** a/src/backend/commands/tablecmds.c
--- b/src/backend/commands/tablecmds.c
***************
*** 8813,8818 **** copy_relation_data(SMgrRelation src, SMgrRelation dst,
--- 8813,8820 ----
  
  		smgrread(src, forkNum, blkno, buf);
  
+ 		PageSetChecksumInplace(page, blkno);
+ 
  		/* XLOG stuff */
  		if (use_wal)
  			log_newpage(&dst->smgr_rnode.node, forkNum, blkno, page);
*** a/src/backend/commands/vacuumlazy.c
--- b/src/backend/commands/vacuumlazy.c
***************
*** 670,677 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
  			{
  				PageSetAllVisible(page);
  				MarkBufferDirty(buf);
! 				visibilitymap_set(onerel, blkno, InvalidXLogRecPtr, vmbuffer,
! 								  InvalidTransactionId);
  			}
  
  			UnlockReleaseBuffer(buf);
--- 670,677 ----
  			{
  				PageSetAllVisible(page);
  				MarkBufferDirty(buf);
! 				visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr,
! 								  vmbuffer, InvalidTransactionId);
  			}
  
  			UnlockReleaseBuffer(buf);
***************
*** 900,907 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
  		{
  			PageSetAllVisible(page);
  			MarkBufferDirty(buf);
! 			visibilitymap_set(onerel, blkno, InvalidXLogRecPtr, vmbuffer,
! 							  visibility_cutoff_xid);
  		}
  		else if (!all_visible_according_to_vm)
  		{
--- 900,907 ----
  		{
  			PageSetAllVisible(page);
  			MarkBufferDirty(buf);
! 			visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr,
! 							  vmbuffer, visibility_cutoff_xid);
  		}
  		else if (!all_visible_according_to_vm)
  		{
***************
*** 911,918 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
  			 * allowed.  Set the visibility map bit as well so that we get
  			 * back in sync.
  			 */
! 			visibilitymap_set(onerel, blkno, InvalidXLogRecPtr, vmbuffer,
! 							  visibility_cutoff_xid);
  		}
  	}
--- 911,918 ----
  			 * allowed.  Set the visibility map bit as well so that we get
  			 * back in sync.
  			 */
! 			visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr,
! 							  vmbuffer, visibility_cutoff_xid);
  		}
  	}
*** a/src/backend/storage/buffer/bufmgr.c
--- b/src/backend/storage/buffer/bufmgr.c
***************
*** 34,39 ****
--- 34,40 ----
  #include <unistd.h>
  
  #include "catalog/catalog.h"
+ #include "catalog/storage.h"
  #include "executor/instrument.h"
  #include "miscadmin.h"
  #include "pg_trace.h"
***************
*** 459,465 **** ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
  		}
  
  		/* check for garbage data */
! 		if (!PageHeaderIsValid((PageHeader) bufBlock))
  		{
  			if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
  			{
--- 460,466 ----
  		}
  
  		/* check for garbage data */
! 		if (!PageIsVerified((Page) bufBlock, blockNum))
  		{
  			if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
  			{
***************
*** 654,667 **** BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
  				 * victim.  We need lock to inspect the page LSN, so this
  				 * can't be done inside StrategyGetBuffer.
  				 */
! 				if (strategy != NULL &&
! 					XLogNeedsFlush(BufferGetLSN(buf)) &&
! 					StrategyRejectBuffer(strategy, buf))
  				{
! 					/* Drop lock/pin and loop around for another buffer */
! 					LWLockRelease(buf->content_lock);
! 					UnpinBuffer(buf, true);
! 					continue;
  				}
  
  				/* OK, do the I/O */
--- 655,677 ----
  				 * victim.  We need lock to inspect the page LSN, so this
  				 * can't be done inside StrategyGetBuffer.
  				 */
! 				if (strategy != NULL)
  				{
! 					XLogRecPtr	lsn;
! 
! 					/* Read the LSN while holding buffer header lock */
! 					LockBufHdr(buf);
! 					lsn = BufferGetLSN(buf);
! 					UnlockBufHdr(buf);
! 
! 					if (XLogNeedsFlush(lsn) &&
! 						StrategyRejectBuffer(strategy, buf))
! 					{
! 						/* Drop lock/pin and loop around for another buffer */
! 						LWLockRelease(buf->content_lock);
! 						UnpinBuffer(buf, true);
! 						continue;
! 					}
  				}
  
  				/* OK, do the I/O */
***************
*** 1893,1898 **** FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln)
--- 1903,1910 ----
  	ErrorContextCallback errcallback;
  	instr_time	io_start,
  				io_time;
+ 	Block		bufBlock;
+ 	char	   *bufToWrite;
  
  	/*
  	 * Acquire the buffer's io_in_progress lock.  If StartBufferIO returns
***************
*** 1918,1929 **** FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln)
  											reln->smgr_rnode.node.dbNode,
  											reln->smgr_rnode.node.relNode);
  
  	/*
  	 * Force XLOG flush up to buffer's LSN.  This implements the basic WAL
  	 * rule that log updates must hit disk before any of the data-file changes
  	 * they describe do.
  	 */
- 	recptr = BufferGetLSN(buf);
  	XLogFlush(recptr);
  
  	/*
--- 1930,1952 ----
  											reln->smgr_rnode.node.dbNode,
  											reln->smgr_rnode.node.relNode);
  
+ 	LockBufHdr(buf);
+ 
+ 	/*
+ 	 * Run PageGetLSN while holding header lock, since we don't have the
+ 	 * buffer locked exclusively in all cases.
+ 	 */
+ 	recptr = BufferGetLSN(buf);
+ 
+ 	/* To check if block content changes while flushing. - vadim 01/17/97 */
+ 	buf->flags &= ~BM_JUST_DIRTIED;
+ 	UnlockBufHdr(buf);
+ 
  	/*
  	 * Force XLOG flush up to buffer's LSN.  This implements the basic WAL
  	 * rule that log updates must hit disk before any of the data-file changes
  	 * they describe do.
  	 */
  	XLogFlush(recptr);
  
  	/*
***************
*** 1932,1949 **** FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln)
  	 * we have the io_in_progress lock.
  	 */
  
! 	/* To check if block content changes while flushing. - vadim 01/17/97 */
! 	LockBufHdr(buf);
! 	buf->flags &= ~BM_JUST_DIRTIED;
! 	UnlockBufHdr(buf);
  
  	if (track_io_timing)
  		INSTR_TIME_SET_CURRENT(io_start);
  
  	smgrwrite(reln,
  			  buf->tag.forkNum,
  			  buf->tag.blockNum,
! 			  (char *) BufHdrGetBlock(buf),
  			  false);
  
  	if (track_io_timing)
--- 1955,1974 ----
  	 * we have the io_in_progress lock.
  	 */
  
! 	bufBlock = BufHdrGetBlock(buf);
! 
! 	bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
  
  	if (track_io_timing)
  		INSTR_TIME_SET_CURRENT(io_start);
  
+ 	/*
+ 	 * bufToWrite is either the shared buffer or a copy, as appropriate.
+ 	 */
  	smgrwrite(reln,
  			  buf->tag.forkNum,
  			  buf->tag.blockNum,
! 			  bufToWrite,
  			  false);
  
  	if (track_io_timing)
***************
*** 2481,2502 **** IncrBufferRefCount(Buffer buffer)
  }
  
  /*
!  * SetBufferCommitInfoNeedsSave
   *
!  *	Mark a buffer dirty when we have updated tuple commit-status bits in it.
   *
!  * This is essentially the same as MarkBufferDirty, except that the caller
!  * might have only share-lock instead of exclusive-lock on the buffer's
!  * content lock.  We preserve the distinction mainly as a way of documenting
!  * that the caller has not made a critical data change --- the status-bit
!  * update could be redone by someone else just as easily.  Therefore, no WAL
!  * log record need be generated, whereas calls to MarkBufferDirty really ought
!  * to be associated with a WAL-entry-creating action.
   */
  void
! SetBufferCommitInfoNeedsSave(Buffer buffer)
  {
  	volatile BufferDesc *bufHdr;
  
  	if (!BufferIsValid(buffer))
  		elog(ERROR, "bad buffer ID: %d", buffer);
--- 2506,2529 ----
  }
  
  /*
!  * MarkBufferDirtyHint
!  *
!  *	Mark a buffer dirty for non-critical changes.
   *
!  * This is essentially the same as MarkBufferDirty, except:
   *
!  * 1. The caller does not write WAL; so if checksums are enabled, we may need
!  *	  to write an XLOG_HINT WAL record to protect against torn pages.
!  * 2. The caller might have only share-lock instead of exclusive-lock on the
!  *	  buffer's content lock.
!  * 3. This function does not guarantee that the buffer is always marked dirty
!  *	  (due to a race condition), so it cannot be used for important changes.
   */
  void
! MarkBufferDirtyHint(Buffer buffer)
  {
  	volatile BufferDesc *bufHdr;
+ 	Page		page = BufferGetPage(buffer);
  
  	if (!BufferIsValid(buffer))
  		elog(ERROR, "bad buffer ID: %d", buffer);
***************
*** 2516,2543 **** SetBufferCommitInfoNeedsSave(Buffer buffer)
  	/*
  	 * This routine might get called many times on the same page, if we are
  	 * making the first scan after commit of an xact that added/deleted many
! 	 * tuples.  So, be as quick as we can if the buffer is already dirty.  We
! 	 * do this by not acquiring spinlock if it looks like the status bits are
! 	 * already.  Since we make this test unlocked, there's a chance we might
! 	 * fail to notice that the flags have just been cleared, and failed to
! 	 * reset them, due to memory-ordering issues.  But since this function is
! 	 * only intended to be used in cases where failing to write out the data
  	 * would be harmless anyway, it doesn't really matter.
  	 */
  	if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) !=
  		(BM_DIRTY | BM_JUST_DIRTIED))
  	{
  		LockBufHdr(bufHdr);
  		Assert(bufHdr->refcount > 0);
  		if (!(bufHdr->flags & BM_DIRTY))
  		{
! 			/* Do vacuum cost accounting */
  			VacuumPageDirty++;
  			if (VacuumCostActive)
  				VacuumCostBalance += VacuumCostPageDirty;
  		}
- 		bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
- 		UnlockBufHdr(bufHdr);
  	}
  }
--- 2543,2647 ----
  	/*
  	 * This routine might get called many times on the same page, if we are
  	 * making the first scan after commit of an xact that added/deleted many
! 	 * tuples.  So, be as quick as we can if the buffer is already dirty.  We do
! 	 * this by not acquiring spinlock if it looks like the status bits are
! 	 * already set.  Since we make this test unlocked, there's a chance we
! 	 * might fail to notice that the flags have just been cleared, and failed
! 	 * to reset them, due to memory-ordering issues.  But since this function
! 	 * is only intended to be used in cases where failing to write out the data
  	 * would be harmless anyway, it doesn't really matter.
  	 */
  	if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) !=
  		(BM_DIRTY | BM_JUST_DIRTIED))
  	{
+ 		XLogRecPtr	lsn = InvalidXLogRecPtr;
+ 		bool		dirtied = false;
+ 		bool		delayChkpt = false;
+ 
+ 		/*
+ 		 * If checksums are enabled, then a full page image may be required
+ 		 * even for some hint bit updates to protect against torn pages.  This
+ 		 * full page image is only necessary if the hint bit update is the
+ 		 * first change to the page since the last checkpoint.
+ 		 *
+ 		 * We don't check full_page_writes here because that logic is in
+ 		 * xlog.c.
+ 		 */
+ 		if (DataChecksumsEnabled())
+ 		{
+ 			/*
+ 			 * If we're in recovery we cannot dirty a page because of a hint.
+ 			 * We can set the hint, just not dirty the page as a result, so
+ 			 * the hint is lost when we evict the page or shutdown.
+ 			 *
+ 			 * See long discussion in bufpage.c
+ 			 */
+ 			if (RecoveryInProgress())
+ 				return;
+ 
+ 			/*
+ 			 * If the block is already dirty because we either made a change
+ 			 * or set a hint already, then we don't need to write a full page
+ 			 * image.  Note that aggressive cleaning of blocks dirtied by hint
+ 			 * bit setting would increase the call rate; bulk setting of hint
+ 			 * bits would reduce the call rate.
+ 			 *
+ 			 * We must issue the WAL record before we mark the buffer dirty.
+ 			 * Otherwise we might write the page before we write the WAL.
+ 			 * That causes a race condition, since a checkpoint might occur
+ 			 * between writing the WAL record and marking the buffer dirty.
+ 			 * We solve that with a kluge, but one that is already in use
+ 			 * during transaction commit to prevent race conditions.
+ 			 * Basically, we simply prevent the checkpoint WAL record from
+ 			 * being written until we have marked the buffer dirty.  We don't
+ 			 * start the checkpoint flush until we have marked dirty, so our
+ 			 * checkpoint must flush the change to disk successfully or the
+ 			 * checkpoint never gets written, and crash recovery will fix
+ 			 * things up.
+ 			 *
+ 			 * It's possible we may enter here without an xid, so it is
+ 			 * essential that CreateCheckpoint waits for virtual transactions
+ 			 * rather than full transactionids.
+ 			 */
+ 			MyPgXact->delayChkpt = delayChkpt = true;
+ 			lsn = XLogSaveBufferForHint(buffer);
+ 		}
+ 
  		LockBufHdr(bufHdr);
  		Assert(bufHdr->refcount > 0);
  		if (!(bufHdr->flags & BM_DIRTY))
  		{
! 			dirtied = true;		/* Means "will be dirtied by this action" */
! 
! 			/*
! 			 * Set the page LSN if we wrote a backup block.  We aren't
! 			 * supposed to set this when only holding a share lock but
! 			 * as long as we serialise it somehow we're OK.  We choose to
! 			 * set LSN while holding the buffer header lock, which causes
! 			 * any reader of an LSN who holds only a share lock to also
! 			 * obtain a buffer header lock before using PageGetLSN().
! 			 * Fortunately, that's not too many places.
! 			 *
! 			 * If checksums are enabled, you might think we should reset the
! 			 * checksum here.  That will happen when the page is written
! 			 * sometime later in this checkpoint cycle.
! 			 */
! 			if (!XLogRecPtrIsInvalid(lsn))
! 			{
! 				PageSetLSN(page, lsn);
! 			}
! 		}
! 		bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
! 		UnlockBufHdr(bufHdr);
! 
! 		if (delayChkpt)
! 			MyPgXact->delayChkpt = false;
! 
! 		if (dirtied)
! 		{
  			VacuumPageDirty++;
  			if (VacuumCostActive)
  				VacuumCostBalance += VacuumCostPageDirty;
  		}
  	}
  }
*** a/src/backend/storage/buffer/localbuf.c
--- b/src/backend/storage/buffer/localbuf.c
***************
*** 200,205 **** LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
--- 200,207 ----
  		/* Find smgr relation for buffer */
  		oreln = smgropen(bufHdr->tag.rnode, MyBackendId);
  
+ 		/* XXX do we want to write checksums for local buffers? An option? */
+ 
  		/* And write... */
  		smgrwrite(oreln,
  				  bufHdr->tag.forkNum,
*** a/src/backend/storage/freespace/freespace.c
--- b/src/backend/storage/freespace/freespace.c
***************
*** 216,222 **** XLogRecordPageWithFreeSpace(RelFileNode rnode, BlockNumber heapBlk,
  		PageInit(page, BLCKSZ, 0);
  
  	if (fsm_set_avail(page, slot, new_cat))
! 		MarkBufferDirty(buf);
  	UnlockReleaseBuffer(buf);
  }
--- 216,222 ----
  		PageInit(page, BLCKSZ, 0);
  
  	if (fsm_set_avail(page, slot, new_cat))
! 		MarkBufferDirtyHint(buf);
  	UnlockReleaseBuffer(buf);
  }
***************
*** 286,292 **** FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks)
  		return;					/* nothing to do; the FSM was already smaller */
  	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
  	fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);
! 	MarkBufferDirty(buf);
  	UnlockReleaseBuffer(buf);
  
  	new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1;
--- 286,292 ----
  		return;					/* nothing to do; the FSM was already smaller */
  	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
  	fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);
! 	MarkBufferDirtyHint(buf);
  	UnlockReleaseBuffer(buf);
  
  	new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1;
***************
*** 583,588 **** fsm_extend(Relation rel, BlockNumber fsm_nblocks)
--- 583,590 ----
  
  	while (fsm_nblocks_now < fsm_nblocks)
  	{
+ 		PageSetChecksumInplace(pg, fsm_nblocks_now);
+ 
  		smgrextend(rel->rd_smgr, FSM_FORKNUM, fsm_nblocks_now,
  				   (char *) pg, false);
  		fsm_nblocks_now++;
***************
*** 617,623 **** fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
  	page = BufferGetPage(buf);
  
  	if (fsm_set_avail(page, slot, newValue))
! 		MarkBufferDirty(buf);
  
  	if (minValue != 0)
  	{
--- 619,625 ----
  	page = BufferGetPage(buf);
  
  	if (fsm_set_avail(page, slot, newValue))
! 		MarkBufferDirtyHint(buf);
  
  	if (minValue != 0)
  	{
***************
*** 768,774 **** fsm_vacuum_page(Relation rel, FSMAddress addr, bool *eof_p)
  	{
  		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
  		fsm_set_avail(BufferGetPage(buf), slot, child_avail);
! 		MarkBufferDirty(buf);
  		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
  	}
  }
--- 770,776 ----
  	{
  		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
  		fsm_set_avail(BufferGetPage(buf), slot, child_avail);
! 		MarkBufferDirtyHint(buf);
  		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
  	}
  }
*** a/src/backend/storage/freespace/fsmpage.c
--- b/src/backend/storage/freespace/fsmpage.c
***************
*** 284,290 **** restart:
  			exclusive_lock_held = true;
  		}
  		fsm_rebuild_page(page);
! 		MarkBufferDirty(buf);
  		goto restart;
  	}
--- 284,290 ----
  			exclusive_lock_held = true;
  		}
  		fsm_rebuild_page(page);
! 		MarkBufferDirtyHint(buf);
  		goto restart;
  	}
*** a/src/backend/storage/page/bufpage.c
--- b/src/backend/storage/page/bufpage.c
***************
*** 15,21 ****
--- 15,27 ----
  #include "postgres.h"
  
  #include "access/htup_details.h"
+ #include "access/xlog.h"
  
+ static char pageCopyData[BLCKSZ];	/* for checksum calculation */
+ static Page pageCopy = pageCopyData;
+ 
+ static bool PageChecksumOK(Page page, BlockNumber blkno);
+ static uint16 PageCalcChecksum16(Page page, BlockNumber blkno);
  
  /* ----------------------------------------------------------------
   *						Page support functions
***************
*** 25,30 ****
--- 31,38 ----
  /*
   * PageInit
   *		Initializes the contents of a page.
+  *		Note that we don't calculate an initial checksum here; that's not done
+  *		until it's time to write.
   */
  void
  PageInit(Page page, Size pageSize, Size specialSize)
***************
*** 39,45 **** PageInit(Page page, Size pageSize, Size specialSize)
  	/* Make sure all fields of page are zero, as well as unused space */
  	MemSet(p, 0, pageSize);
  
! 	/* p->pd_flags = 0;				done by above MemSet */
  	p->pd_lower = SizeOfPageHeaderData;
  	p->pd_upper = pageSize - specialSize;
  	p->pd_special = pageSize - specialSize;
--- 47,56 ----
  	/* Make sure all fields of page are zero, as well as unused space */
  	MemSet(p, 0, pageSize);
  
! 	if (DataChecksumsEnabled())
! 		p->pd_flags = PD_CHECKSUMS1 | PD_CHECKSUMS2;
! 	else
! 		p->pd_flags = 0;
  	p->pd_lower = SizeOfPageHeaderData;
  	p->pd_upper = pageSize - specialSize;
  	p->pd_special = pageSize - specialSize;
***************
*** 49,55 **** PageInit(Page page, Size pageSize, Size specialSize)
  
  
  /*
!  * PageHeaderIsValid
   *		Check that the header fields of a page appear valid.
   *
   * This is called when a page has just been read in from disk.  The idea is
--- 60,66 ----
  
  
  /*
!  * PageIsVerified
   *		Check that the header fields of a page appear valid.
   *
   * This is called when a page has just been read in from disk.  The idea is
***************
*** 67,87 **** PageInit(Page page, Size pageSize, Size specialSize)
   * will clean up such a page and make it usable.
   */
  bool
! PageHeaderIsValid(PageHeader page)
  {
  	char	   *pagebytes;
  	int			i;
  
! 	/* Check normal case */
! 	if (PageGetPageSize(page) == BLCKSZ &&
! 		PageGetPageLayoutVersion(page) == PG_PAGE_LAYOUT_VERSION &&
! 		(page->pd_flags & ~PD_VALID_FLAG_BITS) == 0 &&
! 		page->pd_lower >= SizeOfPageHeaderData &&
! 		page->pd_lower <= page->pd_upper &&
! 		page->pd_upper <= page->pd_special &&
! 		page->pd_special <= BLCKSZ &&
! 		page->pd_special == MAXALIGN(page->pd_special))
! 		return true;
  
  	/* Check all-zeroes case */
  	pagebytes = (char *) page;
--- 78,104 ----
   * will clean up such a page and make it usable.
   */
  bool
! PageIsVerified(Page page, BlockNumber blkno)
  {
+ 	PageHeader	p = (PageHeader) page;
  	char	   *pagebytes;
  	int			i;
  
! 	/*
! 	 * Don't verify page data unless the page passes basic non-zero test
! 	 */
! 	if (!PageIsNew(page))
! 	{
! 		/* Check normal case */
! 		if (PageChecksumOK(page, blkno) &&
! 			(p->pd_flags & ~PD_VALID_FLAG_BITS) == 0 &&
! 			(p->pd_flags & PD_HEADERCHECK) == 0 &&
! 			p->pd_lower <= p->pd_upper &&
! 			p->pd_upper <= p->pd_special &&
! 			p->pd_special <= BLCKSZ &&
! 			p->pd_special == MAXALIGN(p->pd_special))
! 			return true;
! 	}
  
  	/* Check all-zeroes case */
  	pagebytes = (char *) page;
***************
*** 827,829 **** PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
--- 844,1074 ----
  
  	pfree(itemidbase);
  }
+ 
+ /*
+  * Test whether the page checksum is correct or not.
+  *
+  * IMPORTANT NOTE -
+  * The checksum is not valid at all times on a data page.  We set it before we
+  * flush page/buffer, and implicitly invalidate the checksum when we modify the
+  * page.  A heavily accessed buffer might then spend most of its life with an
+  * invalid page checksum, so testing random pages in the buffer pool will tell
+  * you nothing.  The reason for this is that the checksum detects otherwise
+  * silent errors caused by the filesystems on which we rely.  We do not protect
+  * buffers against uncorrectable memory errors, since these have a very low
+  * measured incidence according to research on large server farms,
+  * http://www.cs.toronto.edu/~bianca/papers/sigmetrics09.pdf, discussed 2010/12/22.
+  *
+  * That means that WAL-logged changes to a page do NOT update the page
+  * checksum, so full page images may not have a valid checksum.  But those page
+  * images have the WAL CRC covering them and so are verified separately from
+  * this mechanism.
+  *
+  * Any write of a data block can cause a torn page if the write is unsuccessful.
+  * Full-page writes, which are stored in WAL, protect us from that.  Setting
+  * hint bits when a page is already dirty is OK because a full page write
+  * must already have been written for that since the last checkpoint.
+  * Setting hint bits on an otherwise clean page can allow torn pages; this
+  * doesn't normally matter since they are just hints.  When the page has
When the page has + * checksums, losing a few bits would cause the checksum to be invalid. + * So if we have full_page_writes = on and checksums enabled then we must + * write a WAL record specifically so that we record a full page image in WAL. + * New WAL records cannot be written during recovery, so hint bits set + * during recovery must not dirty the page if the buffer is not already dirty, + * when checksums are enabled. + * + * WAL replay ignores page checksums unless it writes out or reads in blocks + * from disk; restoring full page images does not verify checksums via this + * function. + * + * The best way to understand this is that WAL CRCs protect records entering + * the WAL stream, and page verification protects blocks entering the shared + * buffer pool. They are similar in purpose, yet completely separate. + * Together they ensure we are able to detect errors in data re-entering + * PostgreSQL-controlled memory. Note also that the WAL checksum is a + * 32-bit CRC, whereas the page checksum is a Fletcher checksum, not a CRC. + * + * This function returns a boolean, not a full damage assessment. + */ + static bool + PageChecksumOK(Page page, BlockNumber blkno) + { + PageHeader p = (PageHeader) page; + uint16 checksum; + uint16 checksum_mask = PD_CHECKSUMS1 | PD_CHECKSUMS2; + + /* Quick exit if nobody cares about checksumming */ + if (!DataChecksumsEnabled()) + { + /* + * We don't verify that the checksum itself is zero here, because pages + * upgraded from previous versions may still hold the TLI in the + * checksum field. + */ + if ((p->pd_flags & checksum_mask) != 0) + { + ereport(WARNING, + (ERRCODE_DATA_CORRUPTED, + errmsg("unexpected checksum flags on page"))); + return false; + } + return true; + } + + if ((p->pd_flags & checksum_mask) != checksum_mask) + { + ereport(WARNING, + (ERRCODE_DATA_CORRUPTED, + errmsg("checksum flags missing on page"))); + return false; + } + + checksum = PageCalcChecksum16(page, blkno); + + if (checksum != p->pd_checksum) + { + ereport(WARNING, + (ERRCODE_DATA_CORRUPTED, + errmsg("page verification failed, calculated checksum %u but expected %u", + checksum, p->pd_checksum))); + return false; + } + + return true; + } + + /* + * Set checksum for page in shared buffers. + * + * If checksums are disabled, or if the page is not initialized, just return + * the input. Otherwise, we must make a copy of the page before calculating the + * checksum, to prevent concurrent modifications (e.g. setting hint bits) from + * making the final checksum invalid. + * + * Returns a pointer to the block-sized data that needs to be written. Uses + * statically-allocated memory, so the caller must immediately write the + * returned page and not refer to it again. + */ + char * + PageSetChecksumCopy(Page page, BlockNumber blkno) + { + if (PageIsNew(page) || !DataChecksumsEnabled()) + return (char *) page; + + /* + * We make a copy iff we need to calculate a checksum because other + * backends may set hint bits on this page while we write, which + * would mean the checksum differs from the page contents. It doesn't + * matter if we include or exclude hints during the copy, as long + * as we write a valid page and associated checksum. + */ + memcpy((char *) pageCopy, (char *) page, BLCKSZ); + PageSetChecksumInplace(pageCopy, blkno); + return (char *) pageCopy; + } + + /* + * Set checksum for page in private memory. + * + * This is a simpler version of PageSetChecksumCopy(). 
The more explicit API + * allows us to more easily see if we're making the correct call and reduces + * the amount of additional code specific to page verification. + */ + void + PageSetChecksumInplace(Page page, BlockNumber blkno) + { + if (PageIsNew(page)) + return; + + if (DataChecksumsEnabled()) + { + PageHeader p = (PageHeader) page; + p->pd_checksum = PageCalcChecksum16(page, blkno); + } + + return; + } + + /* + * Calculate checksum for a PostgreSQL Page. This includes the page number (to + * detect the case when a page is somehow moved to a different location), the + * page header (excluding the checksum itself), and the page data. + * + * The checksum algorithm is a modified Fletcher 64-bit (which is + * order-sensitive). The modification is because, at the end, we have two + * 64-bit sums, but we only have room for a 16-bit checksum. So, instead of + * using a modulus of 2^32 - 1, we use 2^8 - 1; making it also resemble a + * Fletcher 16-bit. We don't use Fletcher 16-bit directly, because processing + * single bytes at a time is slower. + */ + static uint16 + PageCalcChecksum16(Page page, BlockNumber blkno) + { + PageHeaderData header_copy; + uint32 *ptr32Header = (uint32 *) &header_copy; + uint32 *ptr32Page = (uint32 *) page; + int64 sum1 = 0; + int64 sum2 = 0; + uint16 checksum = 0; + uint8 *p8Checksum = (uint8 *) &checksum; + int i; + + /* only calculate the checksum for properly-initialized pages */ + Assert(!PageIsNew(page)); + + /* + * Initialize the checksum calculation with the page number. This helps + * catch corruption from whole pages being transposed with other whole + * pages. + */ + sum1 = sum2 = (uint64) blkno; + + /* + * Make a copy of the page header and set the checksum to zero in the + * copy. That allows us to calculate the checksum 32 bits at a time while + * ignoring only the checksum field during calculation. + */ + memcpy(&header_copy, page, SizeOfPageHeaderData); + header_copy.pd_checksum = 0; + + /* compute the checksum of the header */ + for (i = 0; i < SizeOfPageHeaderData / sizeof(uint32); i++) + { + sum1 += ptr32Header[i]; + sum2 += sum1; + } + + /* now checksum the rest of the page */ + for (i = SizeOfPageHeaderData; i < BLCKSZ / sizeof(uint32); i++) + { + sum1 += ptr32Page[i]; + sum2 += sum1; + + /* + * Testing for overflow makes the algorithm slower, but we know that + * overflow won't happen, so only use an Assert. The overflow won't + * happen because sum2 (the larger sum) can grow to a maximum of: + * + * 2^32 * (N^2 - N)/2 + * + * where N is the number of iterations of this loop. The largest block + * size is 32KB, which is 8192 iterations, which yields a number less + * than 2^61, which is still within the range of a signed int64. + */ + Assert(BLCKSZ <= 32768 && sum1 >=0 && sum2 >= 0); + } + + /* + * Store the sums as bytes in the checksum. We add one to shift the range + * from 0..255 to 1..256, to make zero invalid for checksum bytes (which + * seems wise). + */ + p8Checksum[0] = (sum1 % 255) + 1; + p8Checksum[1] = (sum2 % 255) + 1; + + #ifdef DEBUG_CHECKSUM + elog(LOG, "checksum %u", checksum); + #endif + + return checksum; + } *** a/src/backend/utils/time/tqual.c --- b/src/backend/utils/time/tqual.c *************** *** 6,12 **** * NOTE: all the HeapTupleSatisfies routines will update the tuple's * "hint" status bits if we see that the inserting or deleting transaction * has now committed or aborted (and it is safe to set the hint bits). ! * If the hint bits are changed, SetBufferCommitInfoNeedsSave is called on * the passed-in buffer. 
The caller must hold not only a pin, but at least * shared buffer content lock on the buffer containing the tuple. * --- 6,12 ---- * NOTE: all the HeapTupleSatisfies routines will update the tuple's * "hint" status bits if we see that the inserting or deleting transaction * has now committed or aborted (and it is safe to set the hint bits). ! * If the hint bits are changed, MarkBufferDirtyHint is called on * the passed-in buffer. The caller must hold not only a pin, but at least * shared buffer content lock on the buffer containing the tuple. * *************** *** 121,127 **** SetHintBits(HeapTupleHeader tuple, Buffer buffer, } tuple->t_infomask |= infomask; ! SetBufferCommitInfoNeedsSave(buffer); } /* --- 121,127 ---- } tuple->t_infomask |= infomask; ! MarkBufferDirtyHint(buffer); } /* *** a/src/bin/initdb/initdb.c --- b/src/bin/initdb/initdb.c *************** *** 120,125 **** static bool noclean = false; --- 120,126 ---- static bool do_sync = true; static bool sync_only = false; static bool show_setting = false; + static bool data_checksums = false; static char *xlog_dir = ""; *************** *** 1471,1478 **** bootstrap_template1(void) unsetenv("PGCLIENTENCODING"); snprintf(cmd, sizeof(cmd), ! "\"%s\" --boot -x1 %s %s", ! backend_exec, boot_options, talkargs); PG_CMD_OPEN; --- 1472,1481 ---- unsetenv("PGCLIENTENCODING"); snprintf(cmd, sizeof(cmd), ! "\"%s\" --boot -x1 %s %s %s", ! backend_exec, ! data_checksums ? "-k" : "", ! boot_options, talkargs); PG_CMD_OPEN; *************** *** 2778,2783 **** usage(const char *progname) --- 2781,2787 ---- printf(_(" -X, --xlogdir=XLOGDIR location for the transaction log directory\n")); printf(_("\nLess commonly used options:\n")); printf(_(" -d, --debug generate lots of debugging output\n")); + printf(_(" -k, --data-checksums data page checksums\n")); printf(_(" -L DIRECTORY where to find the input files\n")); printf(_(" -n, --noclean do not clean up after errors\n")); printf(_(" -N, --nosync do not wait for changes to be written safely to disk\n")); *************** *** 3426,3431 **** main(int argc, char *argv[]) --- 3430,3436 ---- {"nosync", no_argument, NULL, 'N'}, {"sync-only", no_argument, NULL, 'S'}, {"xlogdir", required_argument, NULL, 'X'}, + {"data-checksums", no_argument, NULL, 'k'}, {NULL, 0, NULL, 0} }; *************** *** 3457,3463 **** main(int argc, char *argv[]) /* process command-line options */ ! while ((c = getopt_long(argc, argv, "dD:E:L:nNU:WA:sST:X:", long_options, &option_index)) != -1) { switch (c) { --- 3462,3468 ---- /* process command-line options */ ! while ((c = getopt_long(argc, argv, "dD:E:kL:nNU:WA:sST:X:", long_options, &option_index)) != -1) { switch (c) { *************** *** 3506,3511 **** main(int argc, char *argv[]) --- 3511,3519 ---- case 'S': sync_only = true; break; + case 'k': + data_checksums = true; + break; case 'L': share_path = pg_strdup(optarg); break; *** a/src/bin/pg_controldata/pg_controldata.c --- b/src/bin/pg_controldata/pg_controldata.c *************** *** 282,286 **** main(int argc, char *argv[]) --- 282,288 ---- (ControlFile.float4ByVal ? _("by value") : _("by reference"))); printf(_("Float8 argument passing: %s\n"), (ControlFile.float8ByVal ? _("by value") : _("by reference"))); + printf(_("Data page checksums: %s\n"), + (ControlFile.data_checksums ? _("enabled") : _("disabled"))); return 0; } *** a/src/bin/pg_resetxlog/pg_resetxlog.c --- b/src/bin/pg_resetxlog/pg_resetxlog.c *************** *** 618,623 **** PrintControlValues(bool guessed) --- 618,625 ---- (ControlFile.float4ByVal ? 
*** a/src/bin/pg_resetxlog/pg_resetxlog.c
--- b/src/bin/pg_resetxlog/pg_resetxlog.c
***************
*** 618,623 **** PrintControlValues(bool guessed)
--- 618,625 ----
             (ControlFile.float4ByVal ? _("by value") : _("by reference")));
      printf(_("Float8 argument passing:              %s\n"),
             (ControlFile.float8ByVal ? _("by value") : _("by reference")));
+     printf(_("Data page checksums:                  %s\n"),
+            (ControlFile.data_checksums ? _("enabled") : _("disabled")));
  }
*** a/src/include/access/heapam_xlog.h
--- b/src/include/access/heapam_xlog.h
***************
*** 279,285 **** extern XLogRecPtr log_heap_clean(Relation reln, Buffer buffer,
  extern XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer,
                  TransactionId cutoff_xid, MultiXactId cutoff_multi,
                  OffsetNumber *offsets, int offcnt);
! extern XLogRecPtr log_heap_visible(RelFileNode rnode, BlockNumber block,
                  Buffer vm_buffer, TransactionId cutoff_xid);
  extern XLogRecPtr log_newpage(RelFileNode *rnode, ForkNumber forkNum,
              BlockNumber blk, Page page);
--- 279,285 ----
  extern XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer,
                  TransactionId cutoff_xid, MultiXactId cutoff_multi,
                  OffsetNumber *offsets, int offcnt);
! extern XLogRecPtr log_heap_visible(RelFileNode rnode, Buffer heap_buffer,
                  Buffer vm_buffer, TransactionId cutoff_xid);
  extern XLogRecPtr log_newpage(RelFileNode *rnode, ForkNumber forkNum,
              BlockNumber blk, Page page);
*** a/src/include/access/visibilitymap.h
--- b/src/include/access/visibilitymap.h
***************
*** 24,31 **** extern void visibilitymap_clear(Relation rel, BlockNumber heapBlk,
  extern void visibilitymap_pin(Relation rel, BlockNumber heapBlk,
                    Buffer *vmbuf);
  extern bool visibilitymap_pin_ok(BlockNumber heapBlk, Buffer vmbuf);
! extern void visibilitymap_set(Relation rel, BlockNumber heapBlk,
!                   XLogRecPtr recptr, Buffer vmbuf, TransactionId cutoff_xid);
  extern bool visibilitymap_test(Relation rel, BlockNumber heapBlk, Buffer *vmbuf);
  extern BlockNumber visibilitymap_count(Relation rel);
  extern void visibilitymap_truncate(Relation rel, BlockNumber nheapblocks);
--- 24,31 ----
  extern void visibilitymap_pin(Relation rel, BlockNumber heapBlk,
                    Buffer *vmbuf);
  extern bool visibilitymap_pin_ok(BlockNumber heapBlk, Buffer vmbuf);
! extern void visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
!                   XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid);
  extern bool visibilitymap_test(Relation rel, BlockNumber heapBlk, Buffer *vmbuf);
  extern BlockNumber visibilitymap_count(Relation rel);
  extern void visibilitymap_truncate(Relation rel, BlockNumber nheapblocks);
*** a/src/include/access/xlog.h
--- b/src/include/access/xlog.h
***************
*** 267,272 **** extern bool XLogNeedsFlush(XLogRecPtr RecPtr);
--- 267,274 ----
  extern int  XLogFileInit(XLogSegNo segno, bool *use_existent, bool use_lock);
  extern int  XLogFileOpen(XLogSegNo segno);
  
+ extern XLogRecPtr XLogSaveBufferForHint(Buffer buffer);
+ 
  extern void CheckXLogRemoved(XLogSegNo segno, TimeLineID tli);
  extern void XLogSetAsyncXactLSN(XLogRecPtr record);
***************
*** 294,299 **** extern char *XLogFileNameP(TimeLineID tli, XLogSegNo segno);
--- 296,302 ----
  extern void UpdateControlFile(void);
  extern uint64 GetSystemIdentifier(void);
+ extern bool DataChecksumsEnabled(void);
  extern Size XLOGShmemSize(void);
  extern void XLOGShmemInit(void);
  extern void BootStrapXLOG(void);
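Taken together, these header changes let the visibility-map code pass the heap buffer down to log_heap_visible(), which can then chain a reference to the heap page into its XLogRecData list so a full-page image is taken when checksums require it. The mock below shows only that chain-building shape; MockRecData and all names in it are invented stand-ins, not backend types.

#include <stdio.h>

typedef struct MockRecData
{
	const char *desc;			/* what this chain entry covers */
	int			buffer;			/* mock buffer id, 0 = no buffer */
	struct MockRecData *next;
} MockRecData;

int
main(void)
{
	int			checksums_enabled = 1;	/* as if initdb -k had been used */
	MockRecData rdata[3];

	rdata[0] = (MockRecData) {"xl_heap_visible header", 0, &rdata[1]};
	rdata[1] = (MockRecData) {"visibility map page", 2, NULL};
	if (checksums_enabled)
	{
		/* chain the heap buffer so replay can restore a full-page image */
		rdata[1].next = &rdata[2];
		rdata[2] = (MockRecData) {"heap page", 1, NULL};
	}

	for (MockRecData *r = &rdata[0]; r != NULL; r = r->next)
		printf("%s (buffer %d)\n", r->desc, r->buffer);
	return 0;
}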
*** a/src/include/catalog/pg_control.h
--- b/src/include/catalog/pg_control.h
***************
*** 63,69 **** typedef struct CheckPoint
  #define XLOG_BACKUP_END         0x50
  #define XLOG_PARAMETER_CHANGE   0x60
  #define XLOG_RESTORE_POINT      0x70
! #define XLOG_FPW_CHANGE         0x80
  
  /*
--- 63,70 ----
  #define XLOG_BACKUP_END         0x50
  #define XLOG_PARAMETER_CHANGE   0x60
  #define XLOG_RESTORE_POINT      0x70
! #define XLOG_FPW_CHANGE         0x80
! #define XLOG_HINT               0x90
  
  /*
***************
*** 207,212 **** typedef struct ControlFileData
--- 208,216 ----
      bool        float4ByVal;    /* float4 pass-by-value? */
      bool        float8ByVal;    /* float8, int8, etc pass-by-value? */
  
+     /* Are data pages protected by checksums? */
+     bool        data_checksums;
+ 
      /* CRC of all above ... MUST BE LAST! */
      pg_crc32    crc;
  } ControlFileData;
*** a/src/include/storage/bufmgr.h
--- b/src/include/storage/bufmgr.h
***************
*** 203,209 **** extern Size BufferShmemSize(void);
  extern void BufferGetTag(Buffer buffer, RelFileNode *rnode,
               ForkNumber *forknum, BlockNumber *blknum);
  
! extern void SetBufferCommitInfoNeedsSave(Buffer buffer);
  
  extern void UnlockBuffers(void);
  extern void LockBuffer(Buffer buffer, int mode);
--- 203,209 ----
  extern void BufferGetTag(Buffer buffer, RelFileNode *rnode,
               ForkNumber *forknum, BlockNumber *blknum);
  
! extern void MarkBufferDirtyHint(Buffer buffer);
  
  extern void UnlockBuffers(void);
  extern void LockBuffer(Buffer buffer, int mode);
*** a/src/include/storage/bufpage.h
--- b/src/include/storage/bufpage.h
***************
*** 15,20 ****
--- 15,21 ----
  #define BUFPAGE_H
  
  #include "access/xlogdefs.h"
+ #include "storage/block.h"
  #include "storage/item.h"
  #include "storage/off.h"
***************
*** 163,176 **** typedef PageHeaderData *PageHeader;
  * PD_PAGE_FULL is set if an UPDATE doesn't find enough free space in the
  * page for its new tuple version; this suggests that a prune is needed.
  * Again, this is just a hint.
  */
  #define PD_HAS_FREE_LINES   0x0001  /* are there any unused line pointers? */
  #define PD_PAGE_FULL        0x0002  /* not enough free space for new
                                       * tuple? */
  #define PD_ALL_VISIBLE      0x0004  /* all tuples on page are visible to
                                       * everyone */
  
! #define PD_VALID_FLAG_BITS  0x0007  /* OR of all valid pd_flags bits */
  
  /*
  * Page layout version number 0 is for pre-7.3 Postgres releases.
--- 164,196 ----
  * PD_PAGE_FULL is set if an UPDATE doesn't find enough free space in the
  * page for its new tuple version; this suggests that a prune is needed.
  * Again, this is just a hint.
+ *
+ * PD_CHECKSUMS1 and PD_CHECKSUMS2 indicate the presence of checksums. This
+ * allows future support for enabling/disabling the use of checksums while the
+ * system is online. There is some concern that trusting page data to say how
+ * to check page data is dangerously self-referential. To avoid falsely
+ * determining that the page has no checksum, we set two non-adjacent bits to
+ * signify that the page has a checksum and should be verified when that block
+ * is read back into a buffer. We use two bits in case a multiple bit error
+ * removes one of the checksum flags *and* destroys data, which would lead to
+ * skipping the checksum check and silently accepting bad data. We also require
+ * that a third bit (PD_HEADERCHECK) be zero regardless of the presence of a
+ * checksum.
  */
  #define PD_HAS_FREE_LINES   0x0001  /* are there any unused line pointers? */
  #define PD_PAGE_FULL        0x0002  /* not enough free space for new
                                       * tuple? */
  #define PD_ALL_VISIBLE      0x0004  /* all tuples on page are visible to
                                       * everyone */
+ #define PD_CHECKSUMS1       0x0008  /* bit indicating the presence of
+                                      * checksums */
+ #define PD_HEADERCHECK      0x0010  /* always zero -- if set, indicates
+                                      * corruption */
+ 
+ #define PD_CHECKSUMS2       0x8000  /* bit indicating the presence of
+                                      * checksums */
  
! #define PD_VALID_FLAG_BITS  0x801F  /* OR of all valid pd_flags bits */
  
  /*
  * Page layout version number 0 is for pre-7.3 Postgres releases.
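The comment's two-bit scheme is a decision rule for whether to verify a page at read time. A stand-alone sketch of that rule follows (page_claims_checksum is a hypothetical name for illustration; a real read path would treat a set PD_HEADERCHECK bit as an error rather than merely skipping verification):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* illustrative copies of the flag bits defined above */
#define PD_CHECKSUMS1	0x0008
#define PD_HEADERCHECK	0x0010
#define PD_CHECKSUMS2	0x8000

/*
 * Trust a page's claim to carry a checksum only when both widely
 * separated checksum bits are set and the always-zero bit is clear;
 * a burst of bit errors is then unlikely to disable the check silently.
 */
static bool
page_claims_checksum(uint16_t pd_flags)
{
	if (pd_flags & PD_HEADERCHECK)
		return false;			/* must always be zero; set means corruption */
	return (pd_flags & PD_CHECKSUMS1) && (pd_flags & PD_CHECKSUMS2);
}

int
main(void)
{
	printf("%d\n", page_claims_checksum(PD_CHECKSUMS1 | PD_CHECKSUMS2));	/* 1 */
	printf("%d\n", page_claims_checksum(PD_CHECKSUMS1));					/* 0 */
	printf("%d\n", page_claims_checksum(PD_CHECKSUMS1 | PD_CHECKSUMS2 |
										PD_HEADERCHECK));					/* 0 */
	return 0;
}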
***************
*** 378,384 **** do { \
  */
  extern void PageInit(Page page, Size pageSize, Size specialSize);
! extern bool PageHeaderIsValid(PageHeader page);
  extern OffsetNumber PageAddItem(Page page, Item item, Size size,
              OffsetNumber offsetNumber, bool overwrite, bool is_heap);
  extern Page PageGetTempPage(Page page);
--- 398,404 ----
  */
  extern void PageInit(Page page, Size pageSize, Size specialSize);
! extern bool PageIsVerified(Page page, BlockNumber blkno);
  extern OffsetNumber PageAddItem(Page page, Item item, Size size,
              OffsetNumber offsetNumber, bool overwrite, bool is_heap);
  extern Page PageGetTempPage(Page page);
***************
*** 391,395 **** extern Size PageGetExactFreeSpace(Page page);
--- 411,417 ----
  extern Size PageGetHeapFreeSpace(Page page);
  extern void PageIndexTupleDelete(Page page, OffsetNumber offset);
  extern void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems);
+ extern char *PageSetChecksumCopy(Page page, BlockNumber blkno);
+ extern void PageSetChecksumInplace(Page page, BlockNumber blkno);
  
  #endif   /* BUFPAGE_H */
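These two prototypes reflect two write paths. PageSetChecksumInplace() stamps a page that no other backend can modify, while PageSetChecksumCopy() is for shared buffers: hint-bit setters may still change the page while it is being written out, so the checksum is computed on, and the write issued from, a private copy. A stand-alone sketch of that idea follows; every mock_-prefixed name and the field offset are invented for illustration.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MOCK_BLCKSZ		8192
#define MOCK_CHKSUM_OFF	8		/* illustrative checksum field offset */

/* toy stand-in for the real checksum calculation */
static uint16_t
mock_checksum(const unsigned char *page)
{
	uint32_t	sum = 1;
	int			i;

	for (i = 0; i < MOCK_BLCKSZ; i++)
		sum = sum * 31 + page[i];
	return (uint16_t) sum;
}

/* checksum a private copy; the shared original is never modified */
static unsigned char *
mock_set_checksum_copy(const unsigned char *shared, unsigned char *copy)
{
	uint16_t	sum;

	memcpy(copy, shared, MOCK_BLCKSZ);
	copy[MOCK_CHKSUM_OFF] = 0;	/* exclude the old checksum field */
	copy[MOCK_CHKSUM_OFF + 1] = 0;
	sum = mock_checksum(copy);
	copy[MOCK_CHKSUM_OFF] = (unsigned char) (sum & 0xFF);
	copy[MOCK_CHKSUM_OFF + 1] = (unsigned char) (sum >> 8);
	return copy;				/* hand this copy to the write, not the original */
}

int
main(void)
{
	static unsigned char shared[MOCK_BLCKSZ], copy[MOCK_BLCKSZ];

	mock_set_checksum_copy(shared, copy);
	printf("shared byte: %u, copy byte: %u\n",
		   (unsigned) shared[MOCK_CHKSUM_OFF], (unsigned) copy[MOCK_CHKSUM_OFF]);
	return 0;
}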