From ae5b957ae33648afda6e936801d8cc23d7469954 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Fri, 13 Jun 2014 15:54:41 +0900 Subject: [PATCH] Buffer capture facility: check WAL replay consistency It is a tool aimed to be used by developers and buildfarm machines that can be used to check for consistency at page level when replaying WAL files among several nodes of a cluster (generally master and standby node). This facility is made of two parts: - A server part, where all the changes happening at page level are captured and inserted in a file called buffer_captures located at the root of PGDATA. Each buffer entry is masked to make the comparison across node consistent (flags like hint bits for example) and then each buffer is captured is with the following format as a single line of the output file: LSN: %08X/%08X page: PAGE_IN_HEXA Hexadecimal format makes it easier to detect differences between pages, and format is chosen to facilitate comparison between buffer entries. - A client part, located in contrib/buffer_capture_cmp, that can be used to compare buffer captures between nodes. The footprint on core code is minimal and is controlled by a CFLAGS called BUFFER_CAPTURE that needs to be set at build time to enable the buffer capture at server level. If this symbol is not enabled, both server and client parts are idle and generate nothing. Note that this facility can generate a lot of output (11G when running regression tests, counting double when using both master and standby). contrib/buffer_capture_cmp contains a regression test facility easing testing with buffer captures. The user just needs to run "make check" in this folder... There is a default set of tests saved in test-default.sh but user is free to set up custom tests by creating a file called test-custom.sh that can be kicked by the test facility if this file is present instead of the defaults. --- contrib/Makefile | 1 + contrib/buffer_capture_cmp/.gitignore | 9 + contrib/buffer_capture_cmp/Makefile | 32 ++ contrib/buffer_capture_cmp/README | 34 ++ contrib/buffer_capture_cmp/buffer_capture_cmp.c | 346 +++++++++++++++++ contrib/buffer_capture_cmp/test-default.sh | 14 + contrib/buffer_capture_cmp/test.sh | 203 ++++++++++ src/backend/access/heap/heapam.c | 11 + src/backend/commands/sequence.c | 5 - src/backend/storage/buffer/Makefile | 2 +- src/backend/storage/buffer/bufcapt.c | 482 ++++++++++++++++++++++++ src/backend/storage/buffer/bufmgr.c | 40 +- src/backend/storage/lmgr/lwlock.c | 8 + src/backend/storage/page/bufpage.c | 3 + src/include/commands/sequence.h | 5 + src/include/miscadmin.h | 13 + src/include/storage/bufcapt.h | 65 ++++ 17 files changed, 1266 insertions(+), 7 deletions(-) create mode 100644 contrib/buffer_capture_cmp/.gitignore create mode 100644 contrib/buffer_capture_cmp/Makefile create mode 100644 contrib/buffer_capture_cmp/README create mode 100644 contrib/buffer_capture_cmp/buffer_capture_cmp.c create mode 100644 contrib/buffer_capture_cmp/test-default.sh create mode 100644 contrib/buffer_capture_cmp/test.sh create mode 100644 src/backend/storage/buffer/bufcapt.c create mode 100644 src/include/storage/bufcapt.h diff --git a/contrib/Makefile b/contrib/Makefile index b37d0dd..1c8e6b9 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -10,6 +10,7 @@ SUBDIRS = \ auto_explain \ btree_gin \ btree_gist \ + buffer_capture_cmp \ chkpass \ citext \ cube \ diff --git a/contrib/buffer_capture_cmp/.gitignore b/contrib/buffer_capture_cmp/.gitignore new file mode 100644 index 0000000..ecd8b78 --- /dev/null +++ b/contrib/buffer_capture_cmp/.gitignore @@ -0,0 +1,9 @@ +# Binary generated +/buffer_capture_cmp + +# Regression tests +/capture_differences.txt +/tmp_check + +# Custom test file +/test-custom.sh diff --git a/contrib/buffer_capture_cmp/Makefile b/contrib/buffer_capture_cmp/Makefile new file mode 100644 index 0000000..da4316b --- /dev/null +++ b/contrib/buffer_capture_cmp/Makefile @@ -0,0 +1,32 @@ +# contrib/buffer_capture_cmp/Makefile + +PGFILEDESC = "buffer_capture_cmp - Comparator tool between buffer captures" +PGAPPICON = win32 + +PROGRAM = buffer_capture_cmp +OBJS = buffer_capture_cmp.o + +PG_CPPFLAGS = -I$(libpq_srcdir) -DFRONTEND +PG_LIBS = $(libpq_pgport) + +EXTRA_CLEAN = tmp_check/ capture_differences.txt + +# test.sh creates a cluster dedicated for the test +EXTRA_REGRESS_OPTS=--use-existing + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/buffer_capture_cmp +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif + +# Tests can be done only when BUFFER_CAPTURE is defined +ifneq (,$(filter -DBUFFER_CAPTURE,$(CFLAGS))) +check: test.sh all + MAKE=$(MAKE) bindir=$(bindir) EXTRA_REGRESS_OPTS="$(EXTRA_REGRESS_OPTS)" $(SHELL) ./test.sh +endif diff --git a/contrib/buffer_capture_cmp/README b/contrib/buffer_capture_cmp/README new file mode 100644 index 0000000..1039c69 --- /dev/null +++ b/contrib/buffer_capture_cmp/README @@ -0,0 +1,34 @@ +buffer_capture_cmp +------------------ + +This facility contains already contains a set of regression tests that +can run be by default. This simple command is enough to run the tests: + + make check + +The code contains a hook that can be used as an entry point to run some +custom tests using this facility. Simply create in this folder a file +called test-custom.sh and execute all the commands necessary for the +tests. This script can use the node number of the master node available +as the first argument of the script when it is run within the test +suite. + +Tips +---- + +The page images take up a lot of disk space! The PostgreSQL regression +suite generates about 11GB - double that when the same is generated also +in a standby. + +Always stop the master first, then standby. Otherwise, when you restart +the standby, it will start WAL replay from the previous checkpoint, and +log some page images already. Stopping the master creates a checkpoint +record, avoiding the problem. + +It is possible to use pg_xlogdump to see which WAL record a page image +corresponds to. But beware that the LSN in the page image points to the +*end* of the WAL record, while the LSN that pg_xlogdump prints is the +*beginning* of the WAL record. So to find which WAL record a page image +corresponds to, find the LSN from the page image in pg_xlogdump output, +and back off one record. (you can't just grep for the line containing +the LSN). diff --git a/contrib/buffer_capture_cmp/buffer_capture_cmp.c b/contrib/buffer_capture_cmp/buffer_capture_cmp.c new file mode 100644 index 0000000..2695597 --- /dev/null +++ b/contrib/buffer_capture_cmp/buffer_capture_cmp.c @@ -0,0 +1,346 @@ +/*------------------------------------------------------------------------- + * + * buffer_capture_cmp.c + * Utility tool to compare buffer captures between two nodes of + * a cluster. This utility needs to be run on servers whose code + * has been built with the symbol BUFFER_CAPTURE defined. + * + * Copyright (c) 2014, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/buffer_capture_cmp/buffer_capture_cmp.c + * + * Capture files that can be obtained on nodes of a cluster do not + * necessarily have the page images logged in the same order. For + * example, if a single WAL-logged operation modifies multiple pages, + * like an index page split, the standby might release the locks + * in different order than the master. Another cause is concurrent + * operations; writing the page images is not atomic with WAL insertion, + * so if two backends are running concurrently, their modifications in + * the image log can be interleaved in different order than their WAL + * records. + * + * To fix that, the lines from the capture file are read into a reorder + * buffer, and sorted there. Sorting the whole file would be overkill, + * as the lines are mostly in order already. The fixed-size reorder + * buffer works as long as the lines are not out-of-order by more than + * REORDER_BUFFER_SIZE lines. + * + * If symbol BUFFER_CAPTURE is not defined, this utility does nothing. + */ + +#include "postgres_fe.h" +#include "port.h" +#include "storage/bufcapt.h" + +#ifdef BUFFER_CAPTURE + +/* Size of a single entry of the capture file */ +#define LINESZ (BLCKSZ*2 + 31) + +/* Line reordering stuff */ +#define REORDER_BUFFER_SIZE 1000 + +typedef struct +{ + char *lines[REORDER_BUFFER_SIZE]; + int nlines; /* Number of lines currently in buffer */ + + FILE *fp; + bool eof; /* Has EOF been reached from this source? */ +} reorder_buffer; + +/* + * Read lines from the capture file into the reorder buffer, until the + * buffer is full. + */ +static void +fill_reorder_buffer(reorder_buffer *buf) +{ + if (buf->eof) + return; + + while (buf->nlines < REORDER_BUFFER_SIZE) + { + char *linebuf = pg_malloc(LINESZ); + + if (fgets(linebuf, LINESZ, buf->fp) == NULL) + { + buf->eof = true; + pg_free(linebuf); + break; + } + + /* + * Common case, entry goes to the end. This happens for an + * initialization or when buffer compares to be higher than + * the last buffer in queue. + */ + if (buf->nlines == 0 || + strcmp(linebuf, buf->lines[buf->nlines - 1]) >= 0) + { + buf->lines[buf->nlines] = linebuf; + } + else + { + /* Find the right place in the queue */ + int i; + + /* + * Scan all the existing buffers and find where buffer needs + * to be included. We already know that the comparison result + * we the last buffer in list. + */ + for (i = buf->nlines - 1; i >= 0; i--) + { + if (strcmp(linebuf, buf->lines[i]) >= 0) + break; + } + + /* Place buffer correctly in the list */ + i++; + memmove(&buf->lines[i + 1], &buf->lines[i], + (buf->nlines - i) * sizeof(char *)); + buf->lines[i] = linebuf; + } + buf->nlines++; + } +} + +/* + * Initialize a reorder buffer. + */ +static reorder_buffer * +init_reorder_buffer(FILE *fp) +{ + reorder_buffer *buf; + + buf = pg_malloc(sizeof(reorder_buffer)); + buf->fp = fp; + buf->eof = false; + buf->nlines = 0; + + fill_reorder_buffer(buf); + + return buf; +} + +/* + * Read all the lines that belong to the next WAL record from the reorder + * buffer. + */ +static int +readrecord(reorder_buffer *buf, char **lines, uint64 *lsn) +{ + int nlines; + uint32 line_xlogid; + uint32 line_xrecoff; + uint64 line_lsn; + uint64 rec_lsn; + + /* Get all the lines with the same LSN */ + for (nlines = 0; nlines < buf->nlines; nlines++) + { + /* Fetch LSN from the first 8 bytes of the buffer */ + sscanf(buf->lines[nlines], "LSN: %08X/%08X", + &line_xlogid, &line_xrecoff); + line_lsn = ((uint64) line_xlogid) << 32 | (uint64) line_xrecoff; + + if (nlines == 0) + *lsn = rec_lsn = line_lsn; + else + { + if (line_lsn != rec_lsn) + break; + } + } + + if (nlines == buf->nlines) + { + if (!buf->eof) + { + fprintf(stderr, "max number of lines in record reached, LSN: %X/%08X\n", + line_xlogid, line_xrecoff); + exit(1); + } + } + + /* Consume the lines from the reorder buffer */ + memcpy(lines, buf->lines, sizeof(char *) * nlines); + memmove(&buf->lines[0], &buf->lines[nlines], + sizeof(char *) * (buf->nlines - nlines)); + buf->nlines -= nlines; + + fill_reorder_buffer(buf); + return nlines; +} + +/* + * Free all the given records. + */ +static void +freerecord(char **lines, int nlines) +{ + int i; + + for (i = 0; i < nlines; i++) + pg_free(lines[i]); +} + +/* + * Print out given records. + */ +static void +printrecord(char **lines, int nlines) +{ + int i; + + for (i = 0; i < nlines; i++) + printf("%s", lines[i]); +} + +/* + * Do a direct comparison between the two given records that have the + * same LSN entry. + */ +static bool +diffrecord(char **lines_a, int nlines_a, char **lines_b, int nlines_b) +{ + int i; + + /* Leave if they do not have the same number of records */ + if (nlines_a != nlines_b) + return true; + + /* + * Now do a comparison line by line. If any diffs are found at + * character-level they will be printed out. + */ + for (i = 0; i < nlines_a; i++) + { + if (strcmp(lines_a[i], lines_b[i]) != 0) + { + int strlen_a = strlen(lines_a[i]); + int strlen_b = strlen(lines_b[i]); + int j; + + printf("strlen_a: %d, strlen_b: %d\n", + strlen_a, strlen_b); + for (j = 0; j < strlen_a; j++) + { + char char_a = lines_a[i][j]; + char char_b = lines_b[i][j]; + if (char_a != char_b) + printf("position: %d, char_a: %c, char_b: %c\n", + j, char_a, char_b); + } + return true; + } + } + + return false; +} + +static void +usage(void) +{ + printf("usage: postprocess-images \n"); +} + +int +main(int argc, char **argv) +{ + char *lines_a[REORDER_BUFFER_SIZE]; + int nlines_a; + char *lines_b[REORDER_BUFFER_SIZE]; + int nlines_b; + char *path_a, *path_b; + FILE *fp_a; + FILE *fp_b; + uint64 lsn_a; + uint64 lsn_b; + reorder_buffer *buf_a; + reorder_buffer *buf_b; + + if (argc != 3) + { + usage(); + exit(1); + } + + /* Open first file */ + path_a = pg_strdup(argv[1]); + canonicalize_path(path_a); + path_a = psprintf("%s/%s", path_a, BUFFER_CAPTURE_FILE); + fp_a = fopen(path_a, "rb"); + if (fp_a == NULL) + { + fprintf(stderr, "Could not open file \"%s\"\n", path_a); + fprintf(stderr, "Check if server binaries are built with symbol BUFFER_CAPTURE"); + exit(2); + } + + /* Open second file */ + path_b = pg_strdup(argv[2]); + canonicalize_path(path_b); + path_b = psprintf("%s/%s", path_b, BUFFER_CAPTURE_FILE); + fp_b = fopen(path_b, "rb"); + if (fp_b == NULL) + { + fprintf(stderr, "Could not open file \"%s\"\n", path_b); + fprintf(stderr, "Check if server binaries are built with symbol BUFFER_CAPTURE"); + exit(2); + } + + /* Initialize buffers for first loop */ + buf_a = init_reorder_buffer(fp_a); + buf_b = init_reorder_buffer(fp_b); + + /* Read first record from both */ + nlines_a = readrecord(buf_a, lines_a, &lsn_a); + nlines_b = readrecord(buf_b, lines_b, &lsn_b); + + /* Do comparisons as long as there are entries */ + while (nlines_a > 0 || nlines_b > 0) + { + /* Compare the records */ + if (lsn_a < lsn_b || nlines_b == 0) + { + printf("Only in A:\n"); + printrecord(lines_a, nlines_a); + freerecord(lines_a, nlines_a); + nlines_a = readrecord(buf_a, lines_a, &lsn_a); + } + else if (lsn_a > lsn_b || nlines_a == 0) + { + printf("Only in B:\n"); + printrecord(lines_b, nlines_b); + freerecord(lines_b, nlines_b); + nlines_b = readrecord(buf_b, lines_b, &lsn_b); + } + else if (lsn_a == lsn_b) + { + if (diffrecord(lines_a, nlines_a, lines_b, nlines_b)) + { + printf("Lines differ, A:\n"); + printrecord(lines_a, nlines_a); + printf("B:\n"); + printrecord(lines_b, nlines_b); + } + freerecord(lines_a, nlines_a); + freerecord(lines_b, nlines_b); + nlines_a = readrecord(buf_a, lines_a, &lsn_a); + nlines_b = readrecord(buf_b, lines_b, &lsn_b); + } + } + + return 0; +} + +#else +int +main(int argc, char **argv) +{ + return 0; +} +#endif /* BUFFER_CAPTURE */ diff --git a/contrib/buffer_capture_cmp/test-default.sh b/contrib/buffer_capture_cmp/test-default.sh new file mode 100644 index 0000000..5bec503 --- /dev/null +++ b/contrib/buffer_capture_cmp/test-default.sh @@ -0,0 +1,14 @@ +#!/bin/bash +# Default test suite for buffer_compare_cmp + +# PGPORT is already set so process should refer to that when kicking tests + +# In order to run the regression test suite, copy this file as test-custom.sh +# and then uncomment the following lines: +# echo ROOT_DIR=$PWD +# psql -c 'CREATE DATABASE regression' +# cd ../../src/test/regress && make installcheck 2>&1 > /dev/null +# cd $ROOT_DIR + +# Create a simple table +psql -c 'CREATE TABLE aa AS SELECT generate_series(1, 10) AS a' diff --git a/contrib/buffer_capture_cmp/test.sh b/contrib/buffer_capture_cmp/test.sh new file mode 100644 index 0000000..8a28f0a --- /dev/null +++ b/contrib/buffer_capture_cmp/test.sh @@ -0,0 +1,203 @@ +#!/bin/bash + +# contrib/buffer_capture_cmp/test.sh +# +# Test driver for buffer_capture_cmp. It does the following processing: +# +# 1) Initialization of a master and a standby +# 2) Stop master, then standby +# 3) Remove $PGDATA/buffer_capture on master and standby +# 4) Start master, then standby +# 5) Run custom or default series of tests +# 6) Stop master, then standby +# 7) Compare the buffer capture of both nodes +# 8) The diffence file should be empty +# +# Portions Copyright (c) 2006-2014, PostgreSQL Global Development Group +# Portions Copyright (c) 1994, Regents of the University of California + +# Leave immediately in case of an error +set -e + +: ${MAKE=make} + +# Guard against parallel make issues (see comments in pg_regress.c) +unset MAKEFLAGS +unset MAKELEVEL + +# Set listen_addresses desirably +testhost=`uname -s` +case $testhost in + MINGW*) LISTEN_ADDRESSES="localhost" ;; + *) LISTEN_ADDRESSES="" ;; +esac + +# Adjust these paths for your environment +TESTROOT=$PWD/tmp_check +TEST_MASTER=$TESTROOT/data_master +TEST_STANDBY=$TESTROOT/data_standby + +# Create the root folder for test data +if [ ! -d $TESTROOT ]; then + mkdir -p $TESTROOT +fi + +# Clear out any environment vars that might cause libpq to connect to +# the wrong postmaster (cf pg_regress.c) +# +# Some shells, such as NetBSD's, return non-zero from unset if the variable +# is already unset. Since we are operating under 'set -e', this causes the +# script to fail. To guard against this, set them all to an empty string first. +PGDATABASE=""; unset PGDATABASE +PGUSER=""; unset PGUSER +PGSERVICE=""; unset PGSERVICE +PGSSLMODE=""; unset PGSSLMODE +PGREQUIRESSL=""; unset PGREQUIRESSL +PGCONNECT_TIMEOUT=""; unset PGCONNECT_TIMEOUT +PGHOST=""; unset PGHOST +PGHOSTADDR=""; unset PGHOSTADDR + +export PGDATABASE="postgres" + +# Set up PATH correctly +PATH=$bindir:$PATH +export PATH + +newsrc=`cd ../.. && pwd` +# Calculate port to use as a base for calculations +PG_VERSION_NUM=`grep '#define PG_VERSION_NUM' $newsrc/src/include/pg_config.h | awk '{print $3}'` +PG_BASE_PORT=`expr $PG_VERSION_NUM % 16384 + 49152` +# Get free port for master node +PG_MASTER_PORT=$PG_BASE_PORT +while psql -X postgres -p $PG_MASTER_PORT /dev/null +do + i=`expr $i + 1` + if [ $i -eq 16 ] + then + echo port $PG_MASTER_PORT apparently in use + exit 1 + fi + PG_MASTER_PORT=`expr $PG_MASTER_PORT + 1` +done +export PGPORT=$PG_MASTER_PORT + +# Get free port for standby node +PG_STANDBY_PORT=`expr $PG_MASTER_PORT + 1` +while psql -X postgres -p $PG_STANDBY_PORT /dev/null +do + i=`expr $i + 1` + if [ $i -eq 16 ] + then + echo port $PG_STANDBY_PORT apparently in use + exit 1 + fi + PG_STANDBY_PORT=`expr $PG_STANDBY_PORT + 1` +done + +# Enable echo so the user can see what is being executed +set -x + +# Set up the nodes, first the master +rm -rf $TEST_MASTER +initdb -N -A trust -D $TEST_MASTER + +# Custom parameters for master's postgresql.conf +cat >> $TEST_MASTER/postgresql.conf <> $TEST_MASTER/pg_hba.conf <> $TEST_STANDBY/postgresql.conf + +cat > $TEST_STANDBY/recovery.conf < $DIFF_FILE +ERR_NUM=$? + +# Cover the case where capture file does not exist +if [ $ERR_NUM == 2 ]; then + echo "Capture file does not exist" + echo "PASSED" + exit 0 +elif [ $ERR_NUM == 1 ]; then + echo "FAILED" + exit 1 +fi + +# No need to echo commands anymore +set +x +echo + +# Test passes if there are no diffs! +if [ ! -s $DIFF_FILE ]; then + echo "PASSED" + exit 0 +else + echo "Diffs exist in the buffer captures" + echo "FAILED" + exit 1 +fi diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index b77c32c..8f5a450 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -71,6 +71,9 @@ #include "utils/syscache.h" #include "utils/tqual.h" +#ifdef BUFFER_CAPTURE +#include "storage/bufcapt.h" +#endif /* GUC variable */ bool synchronize_seqscans = true; @@ -7002,6 +7005,14 @@ log_newpage(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno, END_CRIT_SECTION(); +#ifdef BUFFER_CAPTURE + /* + * The normal mechanism that hooks into LockBuffer doesn't work for this, + * because we're bypassing buffer manager. + */ + buffer_capture_write(page, blkno); +#endif + return recptr; } diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index e608420..2134eae 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -45,11 +45,6 @@ */ #define SEQ_LOG_VALS 32 -/* - * The "special area" of a sequence's buffer page looks like this. - */ -#define SEQ_MAGIC 0x1717 - typedef struct sequence_magic { uint32 magic; diff --git a/src/backend/storage/buffer/Makefile b/src/backend/storage/buffer/Makefile index 2c10fba..6ec85b0 100644 --- a/src/backend/storage/buffer/Makefile +++ b/src/backend/storage/buffer/Makefile @@ -12,6 +12,6 @@ subdir = src/backend/storage/buffer top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = buf_table.o buf_init.o bufmgr.o freelist.o localbuf.o +OBJS = buf_table.o buf_init.o bufcapt.o bufmgr.o freelist.o localbuf.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/storage/buffer/bufcapt.c b/src/backend/storage/buffer/bufcapt.c new file mode 100644 index 0000000..9e5dfc1 --- /dev/null +++ b/src/backend/storage/buffer/bufcapt.c @@ -0,0 +1,482 @@ +/*------------------------------------------------------------------------- + * + * bufcapt.c + * Routines for buffer capture, including masking and dynamic buffer + * snapshot. + * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/page/bufcapt.c + * + *------------------------------------------------------------------------- + */ + +#ifdef BUFFER_CAPTURE + +#include "postgres.h" + +#include "access/nbtree.h" +#include "access/gist.h" +#include "access/gin_private.h" +#include "access/htup_details.h" +#include "access/spgist_private.h" +#include "commands/sequence.h" +#include "storage/bufcapt.h" +#include "storage/bufmgr.h" +#include "storage/lwlock.h" + +/* Marker used to mask pages consistently */ +#define MASK_MARKER 0xFF + +/* Support for capturing changes to pages per process */ +#define MAX_BEFORE_IMAGES 100 + +typedef struct +{ + Buffer buffer; + char content[BLCKSZ]; +} BufferImage; + +static BufferImage *before_images[MAX_BEFORE_IMAGES]; +static FILE *imagefp; +static int before_images_cnt = 0; + +/* ---------------------------------------------------------------- + * Masking functions. + * + * Most pages cannot be compared directly, because some parts of the + * page are not expected to be byte-by-byte identical. For example, + * hint bits or unused space in the page. The strategy is to normalize + * all pages by creating a mask of those bits that are not expected to + * match. + */ +static void mask_unused_space(char *page); +static void mask_heap_page(char *page); +static void mask_spgist_page(char *page); +static void mask_gist_page(char *page); +static void mask_gin_page(BlockNumber blkno, char *page); +static void mask_sequence_page(char *page); +static void mask_btree_page(char *page); + +/* + * Mask the unused space of a page between pd_lower and pd_upper. + */ +static void +mask_unused_space(char *page) +{ + int pd_lower = ((PageHeader) page)->pd_lower; + int pd_upper = ((PageHeader) page)->pd_upper; + int pd_special = ((PageHeader) page)->pd_special; + + /* Sanity check */ + if (pd_lower > pd_upper || pd_special < pd_upper || + pd_lower < SizeOfPageHeaderData || pd_special > BLCKSZ) + { + elog(ERROR, "invalid page at %X/%08X\n", + ((PageHeader) page)->pd_lsn.xlogid, + ((PageHeader) page)->pd_lsn.xrecoff); + } + + memset(page + pd_lower, MASK_MARKER, pd_upper - pd_lower); +} + +/* + * Mask a heap page + */ +static void +mask_heap_page(char *page) +{ + OffsetNumber off; + PageHeader phdr = (PageHeader) page; + + mask_unused_space(page); + + /* Ignore prune_xid (it's like a hint-bit) */ + phdr->pd_prune_xid = 0xFFFFFFFF; + + /* Ignore PD_PAGE_FULL and PD_HAS_FREE_LINES flags, they are just hints */ + phdr->pd_flags |= PD_PAGE_FULL | PD_HAS_FREE_LINES; + + /* + * Also mask the all-visible flag. + * + * XXX: It is unfortunate that we have to do this. If the flag is set + * incorrectly, that's serious, and we would like to catch it. If the flag + * is cleared incorrectly, that's serious too. But redo of HEAP_CLEAN + * records don't currently set the flag, even though it is set in the + * master, so we must silence failures that that causes. + */ + phdr->pd_flags |= PD_ALL_VISIBLE; + + for (off = 1; off <= PageGetMaxOffsetNumber(page); off++) + { + ItemId iid = PageGetItemId(page, off); + char *page_item; + + page_item = (char *) (page + ItemIdGetOffset(iid)); + + /* + * Ignore hint bits and command ID. + */ + if (ItemIdIsNormal(iid)) + { + HeapTupleHeader page_htup = (HeapTupleHeader) page_item; + + page_htup->t_infomask = + HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | + HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID; + page_htup->t_infomask |= HEAP_COMBOCID; + page_htup->t_choice.t_heap.t_field3.t_cid = 0xFFFFFFFF; + } + + /* + * Ignore any padding bytes after the tuple, when the length of + * the item is not MAXALIGNed. + */ + if (ItemIdHasStorage(iid)) + { + int len = ItemIdGetLength(iid); + int padlen = MAXALIGN(len) - len; + + if (padlen > 0) + memset(page_item + len, MASK_MARKER, padlen); + } + } +} + +/* + * Mask a SpGist page + */ +static void +mask_spgist_page(char *page) +{ + mask_unused_space(page); +} + +/* + * Mask a GIST page + */ +static void +mask_gist_page(char *page) +{ + mask_unused_space(page); +} + +/* + * Mask a Gin page + */ +static void +mask_gin_page(BlockNumber blkno, char *page) +{ + /* GIN metapage doesn't use pd_lower/pd_upper. Other page types do. */ + if (blkno != 0) + mask_unused_space(page); +} + +/* + * Mask a sequence page + */ +static void +mask_sequence_page(char *page) +{ + /* + * FIXME: currently, we just ignore sequence records altogether. nextval + * records a different value in the WAL record than it writes to the + * buffer. Ideally we would only mask out the value in the tuple. + */ + memset(page, MASK_MARKER, BLCKSZ); +} + +/* + * Mask a btree page + */ +static void +mask_btree_page(char *page) +{ + OffsetNumber off; + OffsetNumber maxoff; + BTPageOpaque maskopaq = (BTPageOpaque) + (((char *) page) + ((PageHeader) page)->pd_special); + + /* + * Mark unused space before any processing. This is important as it + * uses pd_lower and pd_upper that may be masked on this page + * afterwards if it is a deleted page. + */ + mask_unused_space(page); + + /* + * Mask everything on a DELETED page. + */ + if (((BTPageOpaque) PageGetSpecialPointer(page))->btpo_flags & BTP_DELETED) + { + /* Page content, between standard page header and opaque struct */ + memset(page + SizeOfPageHeaderData, MASK_MARKER, + BLCKSZ - MAXALIGN(sizeof(BTPageOpaqueData))); + + /* pd_lower and upper */ + memset(&((PageHeader) page)->pd_lower, MASK_MARKER, sizeof(uint16)); + memset(&((PageHeader) page)->pd_upper, MASK_MARKER, sizeof(uint16)); + } + else + { + /* + * Mask some line pointer bits, particularly those marked as + * used on a master and unused on a standby. + * XXX: This could be better with more refinement. + */ + maxoff = PageGetMaxOffsetNumber(page); + for (off = 1; off <= maxoff; off++) + { + ItemId iid = PageGetItemId(page, off); + + if (ItemIdIsUsed(iid)) + iid->lp_flags = LP_UNUSED; + } + } + + /* + * Mask BTP_HAS_GARBAGE flag. This needs to be done at the end + * of process as previous masking operations could generate some + * garbage. + */ + maskopaq->btpo_flags |= BTP_HAS_GARBAGE; +} + +/* ---------------------------------------------------------------- + * Buffer capture functions. + * + * Those functions can be used to memmorize the content of pages + * and flush them to BUFFER_CAPTURE_FILE when necessary. + */ +static bool +buffer_capture_is_changed(BufferImage *img) +{ + Page newcontent = BufferGetPage(img->buffer); + Page oldcontent = (Page) img->content; + + if (PageGetLSN(oldcontent) == PageGetLSN(newcontent)) + return false; + return true; +} + +/* + * Initialize page capture + */ +void +buffer_capture_init(void) +{ + int i; + BufferImage *images; + + /* Initialize page image capturing */ + images = palloc(MAX_BEFORE_IMAGES * sizeof(BufferImage)); + + for (i = 0; i < MAX_BEFORE_IMAGES; i++) + before_images[i] = &images[i]; + + imagefp = fopen(BUFFER_CAPTURE_FILE, "ab"); +} + +/* + * buffer_capture_reset + * + * Reset buffer captures + */ +void +buffer_capture_reset(void) +{ + if (before_images_cnt > 0) + elog(LOG, "Released all buffer captures"); + before_images_cnt = 0; +} + +/* + * buffer_capture_write + * + * Flush to file the new content page present here after applying a + * mask on it. + */ +void +buffer_capture_write(char *newcontent, + uint32 blkno) +{ + XLogRecPtr newlsn = PageGetLSN((Page) newcontent); + char page[BLCKSZ]; + uint16 tail; + + /* + * We need a lock to make sure that only one backend writes to the file + * at a time. Abuse SyncScanLock for that - it happens to never be used + * while a buffer is locked/unlocked. + */ + LWLockAcquire(SyncScanLock, LW_EXCLUSIVE); + + /* Copy content of page before any operation */ + memcpy(page, newcontent, BLCKSZ); + + /* + * Look at the size of the special area, and the last two bytes in + * it, to detect what kind of a page it is. Then call the appropriate + * masking function. + */ + memcpy(&tail, &page[BLCKSZ - 2], 2); + if (PageGetSpecialSize(page) == 0) + { + mask_heap_page(page); + } + else if (PageGetSpecialSize(page) == MAXALIGN(sizeof(BTPageOpaqueData)) || + PageGetSpecialSize(page) == MAXALIGN(sizeof(GISTPageOpaqueData))) + { + /* + * It happens that btree and gist have the same size of special + * area. + */ + if (tail == GIST_PAGE_ID) + mask_gist_page(page); + else if (tail <= MAX_BT_CYCLE_ID) + mask_btree_page(page); + } + else if (PageGetSpecialSize(page) == MAXALIGN(sizeof(SpGistPageOpaqueData)) || + PageGetSpecialSize(page) == MAXALIGN(sizeof(GinPageOpaqueData)) || + PageGetSpecialSize(page) == 8) + { + /* + * XXX: Page detection for sequences can be improved. + */ + if (tail == SPGIST_PAGE_ID) + mask_spgist_page(page); + else if (*((uint32 *) (page + BLCKSZ - MAXALIGN(sizeof(uint32)))) == SEQ_MAGIC) + mask_sequence_page(page); + else + mask_gin_page(blkno, page); + } + + /* + * First write the LSN in a constant format to facilitate comparisons + * between buffer captures. + */ + fprintf(imagefp, "LSN: %08X/%08X ", + (uint32) (newlsn >> 32), (uint32) newlsn); + + /* Then write the page contents, in hex */ + fprintf(imagefp, "page: "); + { + char buf[BLCKSZ * 2]; + int j = 0; + int i; + for (i = 0; i < BLCKSZ; i++) + { + const char *digits = "0123456789ABCDEF"; + uint8 byte = (uint8) page[i]; + + buf[j++] = digits[byte >> 4]; + buf[j++] = digits[byte & 0x0F]; + } + fwrite(buf, BLCKSZ * 2, 1, imagefp); + } + fprintf(imagefp, "\n"); + + /* Then the masked page in hex format */ + fflush(imagefp); + + /* Clean up */ + LWLockRelease(SyncScanLock); +} + +/* + * buffer_capture_remember + * + * Append a page content to the existing list of buffers to-be-captured. + */ +void +buffer_capture_remember(Buffer buffer) +{ + BufferImage *img; + + Assert(before_images_cnt < MAX_BEFORE_IMAGES); + + img = before_images[before_images_cnt]; + img->buffer = buffer; + memcpy(img->content, BufferGetPage(buffer), BLCKSZ); + before_images_cnt++; +} + +/* + * buffer_capture_forget + * + * Forget a page image. If the page was modified, log the new contents. + */ +void +buffer_capture_forget(Buffer buffer) +{ + int i; + + for (i = 0; i < before_images_cnt; i++) + { + BufferImage *img = before_images[i]; + + if (img->buffer == buffer) + { + /* If page has new content, capture it */ + if (buffer_capture_is_changed(img)) + { + Page content = BufferGetPage(img->buffer); + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blkno; + + BufferGetTag(img->buffer, &rnode, &forknum, &blkno); + buffer_capture_write(content, blkno); + } + + if (i != before_images_cnt) + { + /* Swap the last still-used slot with this one */ + before_images[i] = before_images[before_images_cnt - 1]; + before_images[before_images_cnt - 1] = img; + } + + before_images_cnt--; + return; + } + } + elog(LOG, "could not find image for buffer %u", buffer); +} + +/* + * buffer_capture_scan + * + * See if any of the buffers that have been memorized have changed. + */ +void +buffer_capture_scan(void) +{ + int i; + + for (i = 0; i < before_images_cnt; i++) + { + BufferImage *img = before_images[i]; + + /* + * Print the contents of the page if it was changed. Remember the + * new contents as the current image. + */ + if (buffer_capture_is_changed(img)) + { + Page content = BufferGetPage(img->buffer); + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blkno; + + BufferGetTag(img->buffer, &rnode, &forknum, &blkno); + buffer_capture_write(content, blkno); + memcpy(img->content, BufferGetPage(img->buffer), BLCKSZ); + } + } +} + +#endif /* BUFFER_CAPTURE */ diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index c070278..b1b2467 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -50,6 +50,9 @@ #include "utils/resowner_private.h" #include "utils/timestamp.h" +#ifdef BUFFER_CAPTURE +#include "storage/bufcapt.h" +#endif /* Note: these two macros only work on shared buffers, not local ones! */ #define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ)) @@ -1728,6 +1731,10 @@ AtEOXact_Buffers(bool isCommit) } #endif +#ifdef BUFFER_CAPTURE + buffer_capture_reset(); +#endif + AtEOXact_LocalBuffers(isCommit); } @@ -1744,6 +1751,10 @@ void InitBufferPoolBackend(void) { on_shmem_exit(AtProcExit_Buffers, 0); + +#ifdef BUFFER_CAPTURE + buffer_capture_init(); +#endif } /* @@ -2759,6 +2770,20 @@ LockBuffer(Buffer buffer, int mode) buf = &(BufferDescriptors[buffer - 1]); +#ifdef BUFFER_CAPTURE + if (mode == BUFFER_LOCK_UNLOCK) + { + /* + * XXX: peek into the LWLock struct to see if we're holding it in + * exclusive or shared mode. This is concurrency-safe: if we're holding + * it in exclusive mode, no-one else can release it. If we're holding + * it in shared mode, no-one else can acquire it in exclusive mode. + */ + if (buf->content_lock->exclusive > 0) + buffer_capture_forget(buffer); + } +#endif + if (mode == BUFFER_LOCK_UNLOCK) LWLockRelease(buf->content_lock); else if (mode == BUFFER_LOCK_SHARE) @@ -2767,6 +2792,11 @@ LockBuffer(Buffer buffer, int mode) LWLockAcquire(buf->content_lock, LW_EXCLUSIVE); else elog(ERROR, "unrecognized buffer lock mode: %d", mode); + +#ifdef BUFFER_CAPTURE + if (mode == BUFFER_LOCK_EXCLUSIVE) + buffer_capture_remember(buffer); +#endif } /* @@ -2778,6 +2808,7 @@ bool ConditionalLockBuffer(Buffer buffer) { volatile BufferDesc *buf; + bool res; Assert(BufferIsValid(buffer)); if (BufferIsLocal(buffer)) @@ -2785,7 +2816,14 @@ ConditionalLockBuffer(Buffer buffer) buf = &(BufferDescriptors[buffer - 1]); - return LWLockConditionalAcquire(buf->content_lock, LW_EXCLUSIVE); + res = LWLockConditionalAcquire(buf->content_lock, LW_EXCLUSIVE); + +#ifdef BUFFER_CAPTURE + if (res) + buffer_capture_remember(buffer); +#endif + + return res; } /* diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c index d23ac62..32762a6 100644 --- a/src/backend/storage/lmgr/lwlock.c +++ b/src/backend/storage/lmgr/lwlock.c @@ -41,6 +41,10 @@ #include "storage/spin.h" #include "utils/memutils.h" +#ifdef BUFFER_CAPTURE +#include "storage/bufcapt.h" +#endif + #ifdef LWLOCK_STATS #include "utils/hsearch.h" #endif @@ -1240,6 +1244,10 @@ LWLockRelease(LWLock *l) void LWLockReleaseAll(void) { +#ifdef BUFFER_CAPTURE + buffer_capture_reset(); +#endif + while (num_held_lwlocks > 0) { HOLD_INTERRUPTS(); /* match the upcoming RESUME_INTERRUPTS */ diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index 6351a9b..6beaa15 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -21,6 +21,9 @@ #include "utils/memdebug.h" #include "utils/memutils.h" +#ifdef BUFFER_CAPTURE +#include "storage/bufcapt.h" +#endif /* GUC variable */ bool ignore_checksum_failure = false; diff --git a/src/include/commands/sequence.h b/src/include/commands/sequence.h index 7d8a370..73b0ede 100644 --- a/src/include/commands/sequence.h +++ b/src/include/commands/sequence.h @@ -18,6 +18,11 @@ #include "nodes/parsenodes.h" #include "storage/relfilenode.h" +/* + * "special area" identifier of a sequence's buffer page + */ +#define SEQ_MAGIC 0x1717 + typedef struct FormData_pg_sequence { diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index c2b786e..1ae98f7 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -116,11 +116,24 @@ do { \ #define START_CRIT_SECTION() (CritSectionCount++) +#ifdef BUFFER_CAPTURE +/* in src/backend/storage/buffer/bufcapt.c */ +void buffer_capture_scan(void); + +#define END_CRIT_SECTION() \ +do { \ + Assert(CritSectionCount > 0); \ + CritSectionCount--; \ + if (CritSectionCount == 0) \ + buffer_capture_scan(); \ +} while(0) +#else #define END_CRIT_SECTION() \ do { \ Assert(CritSectionCount > 0); \ CritSectionCount--; \ } while(0) +#endif /***************************************************************************** diff --git a/src/include/storage/bufcapt.h b/src/include/storage/bufcapt.h new file mode 100644 index 0000000..089e5a7 --- /dev/null +++ b/src/include/storage/bufcapt.h @@ -0,0 +1,65 @@ +/*------------------------------------------------------------------------- + * + * bufcaptr.h + * Buffer capture definitions. + * + * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/bufcapt.h + * + * About BUFFER_CAPTURE: + * + * If this symbol is defined, all page images that are logged on this + * server are as well flushed to BUFFER_CAPTURE_FILE. One line of the + * output file is used for a single page image. + * + * The page images obtained are aimed to be used with the utility tool + * called buffer_capture_cmp available in contrib/ and can be used to + * compare how WAL is replayed between master and standby nodes, helping + * in spotting bugs in this area. As each page is written in hexadecimal + * format, one single page writes BLCKSZ * 2 bytes to the capture file. + * Hexadecimal format makes it easier to spot differences between captures + * done among nodes. Be aware that this has a large impact on I/O and that + * it is aimed only for buildfarm and development purposes. + * + * One single page entry has the following format: + * LSN: %08X/08X page: PAGE_IN_HEXA + * + * The LSN corresponds to the log sequence number to which the page image + * applies to, then the content of the image is added as-is. This format + * is chosen to facilitate comparisons between each capture entry + * particularly in cases where LSN increases its digit number. + * + *------------------------------------------------------------------------- + */ +#ifndef BUFCAPT_H +#define BUFCAPT_H + +#ifdef BUFFER_CAPTURE + +#include "postgres.h" + +#include "storage/block.h" +#include "storage/buf.h" +#include "storage/bufpage.h" +#include "storage/relfilenode.h" + +/* + * Output file where buffer captures are stored + */ +#define BUFFER_CAPTURE_FILE "buffer_captures" + +void buffer_capture_init(void); +void buffer_capture_reset(void); +void buffer_capture_write(char *newcontent, + uint32 blkno); + +void buffer_capture_remember(Buffer buffer); +void buffer_capture_forget(Buffer buffer); +void buffer_capture_scan(void); + +#endif /* BUFFER_CAPTURE */ + +#endif -- 2.0.0