>From 1752751903a8d51b7b3b618072b6b0687f9f141c Mon Sep 17 00:00:00 2001
From: Jim Nasby
Date: Thu, 6 Nov 2014 14:42:52 -0600
Subject: [PATCH] Vacuum cleanup lock retry

This patch will retry failed attempts to obtain the cleanup lock on a
buffer. It remembers failed block numbers in an array and retries after
vacuuming the relation. The array is currently fixed at 512 entries;
additional lock failures will not be re-attempted.

This patch also adds counters to report on failures, as well as
refactoring the guts of page vacuum scans into its own function.
---
 src/backend/commands/vacuumlazy.c | 964 +++++++++++++++++++++-----------------
 1 file changed, 541 insertions(+), 423 deletions(-)

diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c
index 3778d9d..240113f 100644
--- a/src/backend/commands/vacuumlazy.c
+++ b/src/backend/commands/vacuumlazy.c
@@ -96,6 +96,14 @@
  */
 #define SKIP_PAGES_THRESHOLD	((BlockNumber) 32)
 
+/*
+ * Instead of blindly skipping pages that we can't immediately acquire a
+ * cleanup lock for (assuming we're not freezing), we keep a list of pages we
+ * initially skipped, up to VACUUM_MAX_RETRY_PAGES. We retry those pages at the
+ * end of vacuuming.
+ */
+#define VACUUM_MAX_RETRY_PAGES 512
+
 typedef struct LVRelStats
 {
 	/* hasindex = true means two-pass strategy; false means one-pass */
@@ -143,6 +151,10 @@ static void lazy_vacuum_index(Relation indrel,
 static void lazy_cleanup_index(Relation indrel,
 				   IndexBulkDeleteResult *stats,
 				   LVRelStats *vacrelstats);
+static void lazy_scan_page(Relation onerel, LVRelStats *vacrelstats,
+	BlockNumber blkno, Buffer buf, Buffer vmbuffer, xl_heap_freeze_tuple *frozen,
+	int nindexes, bool all_visible_according_to_vm,
+	BlockNumber *empty_pages, BlockNumber *vacuumed_pages, double *nunused);
 static int lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
 				 int tupindex, LVRelStats *vacrelstats, Buffer *vmbuffer);
 static void lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats);
@@ -422,13 +434,15 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 {
 	BlockNumber nblocks,
 				blkno;
-	HeapTupleData tuple;
 	char	   *relname;
 	BlockNumber empty_pages,
-				vacuumed_pages;
-	double		num_tuples,
-				tups_vacuumed,
-				nkeep,
+				vacuumed_pages,
+				retry_pages[VACUUM_MAX_RETRY_PAGES];
+	int			retry_pages_insert_ptr;
+	double		retry_page_count,
+				retry_fail_count,
+				retry_pages_skipped,
+				cleanup_lock_waits,
 				nunused;
 	IndexBulkDeleteResult **indstats;
 	int			i;
@@ -446,8 +460,8 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 					get_namespace_name(RelationGetNamespace(onerel)),
 					relname)));
 
-	empty_pages = vacuumed_pages = 0;
-	num_tuples = tups_vacuumed = nkeep = nunused = 0;
+	empty_pages = vacuumed_pages = retry_pages_insert_ptr = retry_page_count =
+		retry_fail_count = retry_pages_skipped = cleanup_lock_waits = nunused = 0;
 
 	indstats = (IndexBulkDeleteResult **)
 		palloc0(nindexes * sizeof(IndexBulkDeleteResult *));
@@ -508,18 +522,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 	for (blkno = 0; blkno < nblocks; blkno++)
 	{
 		Buffer		buf;
-		Page		page;
-		OffsetNumber offnum,
-					maxoff;
-		bool		tupgone,
-					hastup;
-		int			prev_dead_count;
-		int			nfrozen;
-		Size		freespace;
 		bool		all_visible_according_to_vm;
-		bool		all_visible;
-		bool		has_dead_tuples;
-		TransactionId visibility_cutoff_xid = InvalidTransactionId;
 
 		if (blkno == next_not_all_visible_block)
 		{
@@ -617,6 +620,19 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 			 */
 			if (!scan_all)
 			{
+				/*
+				 * Remember the page that we're skipping, but only
if there's + * still room. + * + * XXX it would be even better if we retried as soon as we + * filled retry_pages, but we should get very few retry pages + * anyway so lets not go overboard. + */ + if (retry_pages_insert_ptrscanned_pages++; - - page = BufferGetPage(buf); - - if (PageIsNew(page)) - { - /* - * An all-zeroes page could be left over if a backend extends the - * relation but crashes before initializing the page. Reclaim such - * pages for use. - * - * We have to be careful here because we could be looking at a - * page that someone has just added to the relation and not yet - * been able to initialize (see RelationGetBufferForTuple). To - * protect against that, release the buffer lock, grab the - * relation extension lock momentarily, and re-lock the buffer. If - * the page is still uninitialized by then, it must be left over - * from a crashed backend, and we can initialize it. - * - * We don't really need the relation lock when this is a new or - * temp relation, but it's probably not worth the code space to - * check that, since this surely isn't a critical path. - * - * Note: the comparable code in vacuum.c need not worry because - * it's got exclusive lock on the whole relation. - */ - LockBuffer(buf, BUFFER_LOCK_UNLOCK); - LockRelationForExtension(onerel, ExclusiveLock); - UnlockRelationForExtension(onerel, ExclusiveLock); - LockBufferForCleanup(buf); - if (PageIsNew(page)) - { - ereport(WARNING, - (errmsg("relation \"%s\" page %u is uninitialized --- fixing", - relname, blkno))); - PageInit(page, BufferGetPageSize(buf), 0); - empty_pages++; - } - freespace = PageGetHeapFreeSpace(page); - MarkBufferDirty(buf); - UnlockReleaseBuffer(buf); - - RecordPageWithFreeSpace(onerel, blkno, freespace); - continue; - } - - if (PageIsEmpty(page)) - { - empty_pages++; - freespace = PageGetHeapFreeSpace(page); - - /* empty pages are always all-visible */ - if (!PageIsAllVisible(page)) - { - START_CRIT_SECTION(); - - /* mark buffer dirty before writing a WAL record */ - MarkBufferDirty(buf); - - /* - * It's possible that another backend has extended the heap, - * initialized the page, and then failed to WAL-log the page - * due to an ERROR. Since heap extension is not WAL-logged, - * recovery might try to replay our record setting the page - * all-visible and find that the page isn't initialized, which - * will cause a PANIC. To prevent that, check whether the - * page has been previously WAL-logged, and if not, do that - * now. - */ - if (RelationNeedsWAL(onerel) && - PageGetLSN(page) == InvalidXLogRecPtr) - log_newpage_buffer(buf, true); - - PageSetAllVisible(page); - visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr, - vmbuffer, InvalidTransactionId); - END_CRIT_SECTION(); - } - - UnlockReleaseBuffer(buf); - RecordPageWithFreeSpace(onerel, blkno, freespace); - continue; - } - - /* - * Prune all HOT-update chains in this page. - * - * We count tuples removed by the pruning step as removed by VACUUM. - */ - tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, false, - &vacrelstats->latestRemovedXid); + lazy_scan_page(onerel, vacrelstats, blkno, buf, vmbuffer, frozen, + nindexes, all_visible_according_to_vm, + &empty_pages, &vacuumed_pages, &nunused); + } - /* - * Now scan the page to collect vacuumable items and check for tuples - * requiring freezing. 
- */ - all_visible = true; - has_dead_tuples = false; - nfrozen = 0; - hastup = false; - prev_dead_count = vacrelstats->num_dead_tuples; - maxoff = PageGetMaxOffsetNumber(page); + /* + * Make a second attempt to acquire the cleanup lock on pages we skipped. + * Note that we don't have to worry about !scan_all here. + */ - /* - * Note: If you change anything in the loop below, also look at - * heap_page_is_all_visible to see if that needs to be changed. - */ - for (offnum = FirstOffsetNumber; - offnum <= maxoff; - offnum = OffsetNumberNext(offnum)) + if (retry_pages_insert_ptr) + { + for (i = 0; i < retry_pages_insert_ptr; i++) { - ItemId itemid; - - itemid = PageGetItemId(page, offnum); - - /* Unused items require no processing, but we count 'em */ - if (!ItemIdIsUsed(itemid)) - { - nunused += 1; - continue; - } - - /* Redirect items mustn't be touched */ - if (ItemIdIsRedirected(itemid)) - { - hastup = true; /* this page won't be truncatable */ - continue; - } - - ItemPointerSet(&(tuple.t_self), blkno, offnum); - - /* - * DEAD item pointers are to be vacuumed normally; but we don't - * count them in tups_vacuumed, else we'd be double-counting (at - * least in the common case where heap_page_prune() just freed up - * a non-HOT tuple). - */ - if (ItemIdIsDead(itemid)) - { - lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); - all_visible = false; - continue; - } - - Assert(ItemIdIsNormal(itemid)); - - tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); - tuple.t_len = ItemIdGetLength(itemid); - tuple.t_tableOid = RelationGetRelid(onerel); - - tupgone = false; - - switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf)) - { - case HEAPTUPLE_DEAD: - - /* - * Ordinarily, DEAD tuples would have been removed by - * heap_page_prune(), but it's possible that the tuple - * state changed since heap_page_prune() looked. In - * particular an INSERT_IN_PROGRESS tuple could have - * changed to DEAD if the inserter aborted. So this - * cannot be considered an error condition. - * - * If the tuple is HOT-updated then it must only be - * removed by a prune operation; so we keep it just as if - * it were RECENTLY_DEAD. Also, if it's a heap-only - * tuple, we choose to keep it, because it'll be a lot - * cheaper to get rid of it in the next pruning pass than - * to treat it like an indexed tuple. - */ - if (HeapTupleIsHotUpdated(&tuple) || - HeapTupleIsHeapOnly(&tuple)) - nkeep += 1; - else - tupgone = true; /* we can delete the tuple */ - all_visible = false; - break; - case HEAPTUPLE_LIVE: - /* Tuple is good --- but let's do some validity checks */ - if (onerel->rd_rel->relhasoids && - !OidIsValid(HeapTupleGetOid(&tuple))) - elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid", - relname, blkno, offnum); - - /* - * Is the tuple definitely visible to all transactions? - * - * NB: Like with per-tuple hint bits, we can't set the - * PD_ALL_VISIBLE flag if the inserter committed - * asynchronously. See SetHintBits for more info. Check - * that the tuple is hinted xmin-committed because of - * that. - */ - if (all_visible) - { - TransactionId xmin; - - if (!HeapTupleHeaderXminCommitted(tuple.t_data)) - { - all_visible = false; - break; - } - - /* - * The inserter definitely committed. But is it old - * enough that everyone sees it as committed? - */ - xmin = HeapTupleHeaderGetXmin(tuple.t_data); - if (!TransactionIdPrecedes(xmin, OldestXmin)) - { - all_visible = false; - break; - } - - /* Track newest xmin on page. 
*/ - if (TransactionIdFollows(xmin, visibility_cutoff_xid)) - visibility_cutoff_xid = xmin; - } - break; - case HEAPTUPLE_RECENTLY_DEAD: + Buffer buf; + blkno = retry_pages[i]; - /* - * If tuple is recently deleted then we must not remove it - * from relation. - */ - nkeep += 1; - all_visible = false; - break; - case HEAPTUPLE_INSERT_IN_PROGRESS: - /* This is an expected case during concurrent vacuum */ - all_visible = false; - break; - case HEAPTUPLE_DELETE_IN_PROGRESS: - /* This is an expected case during concurrent vacuum */ - all_visible = false; - break; - default: - elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); - break; - } - - if (tupgone) - { - lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); - HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data, - &vacrelstats->latestRemovedXid); - tups_vacuumed += 1; - has_dead_tuples = true; - } - else - { - num_tuples += 1; - hastup = true; - - /* - * Each non-removable tuple must be checked to see if it needs - * freezing. Note we already have exclusive buffer lock. - */ - if (heap_prepare_freeze_tuple(tuple.t_data, FreezeLimit, - MultiXactCutoff, &frozen[nfrozen])) - frozen[nfrozen++].offset = offnum; - } - } /* scan along page */ + visibilitymap_pin(onerel, blkno, &vmbuffer); - /* - * If we froze any tuples, mark the buffer dirty, and write a WAL - * record recording the changes. We must log the changes to be - * crash-safe against future truncation of CLOG. - */ - if (nfrozen > 0) - { - START_CRIT_SECTION(); - - MarkBufferDirty(buf); + buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno, + RBM_NORMAL, vac_strategy); - /* execute collected freezes */ - for (i = 0; i < nfrozen; i++) + /* We need buffer cleanup lock so that we can prune HOT chains. */ + if (ConditionalLockBufferForCleanup(buf)) { - ItemId itemid; - HeapTupleHeader htup; + retry_page_count++; - itemid = PageGetItemId(page, frozen[i].offset); - htup = (HeapTupleHeader) PageGetItem(page, itemid); - - heap_execute_freeze_tuple(htup, &frozen[i]); - } - - /* Now WAL-log freezing if neccessary */ - if (RelationNeedsWAL(onerel)) + lazy_scan_page(onerel, vacrelstats, blkno, buf, vmbuffer, frozen, + nindexes, visibilitymap_test(onerel, blkno, &vmbuffer), + &empty_pages, &vacuumed_pages, &nunused); + } else { - XLogRecPtr recptr; - - recptr = log_heap_freeze(onerel, buf, FreezeLimit, - frozen, nfrozen); - PageSetLSN(page, recptr); + retry_fail_count++; } - - END_CRIT_SECTION(); - } - - /* - * If there are no indexes then we can vacuum the page right now - * instead of doing a second scan. - */ - if (nindexes == 0 && - vacrelstats->num_dead_tuples > 0) - { - /* Remove tuples from heap */ - lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats, &vmbuffer); - has_dead_tuples = false; - - /* - * Forget the now-vacuumed tuples, and press on, but be careful - * not to reset latestRemovedXid since we want that value to be - * valid. - */ - vacrelstats->num_dead_tuples = 0; - vacuumed_pages++; - } - - freespace = PageGetHeapFreeSpace(page); - - /* mark page all-visible, if appropriate */ - if (all_visible && !all_visible_according_to_vm) - { - /* - * It should never be the case that the visibility map page is set - * while the page-level bit is clear, but the reverse is allowed - * (if checksums are not enabled). Regardless, set the both bits - * so that we get back in sync. - * - * NB: If the heap page is all-visible but the VM bit is not set, - * we don't need to dirty the heap page. 
However, if checksums - * are enabled, we do need to make sure that the heap page is - * dirtied before passing it to visibilitymap_set(), because it - * may be logged. Given that this situation should only happen in - * rare cases after a crash, it is not worth optimizing. - */ - PageSetAllVisible(page); - MarkBufferDirty(buf); - visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr, - vmbuffer, visibility_cutoff_xid); - } - - /* - * As of PostgreSQL 9.2, the visibility map bit should never be set if - * the page-level bit is clear. However, it's possible that the bit - * got cleared after we checked it and before we took the buffer - * content lock, so we must recheck before jumping to the conclusion - * that something bad has happened. - */ - else if (all_visible_according_to_vm && !PageIsAllVisible(page) - && visibilitymap_test(onerel, blkno, &vmbuffer)) - { - elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", - relname, blkno); - visibilitymap_clear(onerel, blkno, vmbuffer); } - - /* - * It's possible for the value returned by GetOldestXmin() to move - * backwards, so it's not wrong for us to see tuples that appear to - * not be visible to everyone yet, while PD_ALL_VISIBLE is already - * set. The real safe xmin value never moves backwards, but - * GetOldestXmin() is conservative and sometimes returns a value - * that's unnecessarily small, so if we see that contradiction it just - * means that the tuples that we think are not visible to everyone yet - * actually are, and the PD_ALL_VISIBLE flag is correct. - * - * There should never be dead tuples on a page with PD_ALL_VISIBLE - * set, however. - */ - else if (PageIsAllVisible(page) && has_dead_tuples) - { - elog(WARNING, "page containing dead tuples is marked as all-visible in relation \"%s\" page %u", - relname, blkno); - PageClearAllVisible(page); - MarkBufferDirty(buf); - visibilitymap_clear(onerel, blkno, vmbuffer); - } - - UnlockReleaseBuffer(buf); - - /* Remember the location of the last page with nonremovable tuples */ - if (hastup) - vacrelstats->nonempty_pages = blkno + 1; - - /* - * If we remembered any tuples for deletion, then the page will be - * visited again by lazy_vacuum_heap, which will compute and record - * its post-compaction free space. If not, then we're done with this - * page, so remember its free space as-is. (This path will always be - * taken if there are no indexes.) - */ - if (vacrelstats->num_dead_tuples == prev_dead_count) - RecordPageWithFreeSpace(onerel, blkno, freespace); } pfree(frozen); - /* save stats for use later */ - vacrelstats->scanned_tuples = num_tuples; - vacrelstats->tuples_deleted = tups_vacuumed; - vacrelstats->new_dead_tuples = nkeep; /* now we can compute the new value for pg_class.reltuples */ vacrelstats->new_rel_tuples = vac_estimate_reltuples(onerel, false, nblocks, vacrelstats->scanned_pages, - num_tuples); + vacrelstats->scanned_tuples); /* * Release any remaining pin on visibility map page. 
@@ -1077,6 +728,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, lazy_vacuum_index(Irel[i], &indstats[i], vacrelstats); + /* Remove tuples from heap */ lazy_vacuum_heap(onerel, vacrelstats); vacrelstats->num_index_scans++; @@ -1091,21 +743,57 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, ereport(elevel, (errmsg("\"%s\": removed %.0f row versions in %u pages", RelationGetRelationName(onerel), - tups_vacuumed, vacuumed_pages))); + vacrelstats->tuples_deleted, vacuumed_pages))); - ereport(elevel, - (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u out of %u pages", - RelationGetRelationName(onerel), - tups_vacuumed, num_tuples, - vacrelstats->scanned_pages, nblocks), - errdetail("%.0f dead row versions cannot be removed yet.\n" - "There were %.0f unused item pointers.\n" - "%u pages are entirely empty.\n" - "%s.", - nkeep, - nunused, - empty_pages, - pg_rusage_show(&ru0)))); + if (retry_page_count || retry_fail_count || retry_pages_skipped) + ereport(elevel, + (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u out of %u pages", + RelationGetRelationName(onerel), + vacrelstats->tuples_deleted, vacrelstats->scanned_tuples, + vacrelstats->scanned_pages, nblocks), + errdetail("%.0f dead row versions cannot be removed yet.\n" + "There were %.0f unused item pointers.\n" + "%u pages are entirely empty.\n" + "Retried cleanup lock on %.0f pages, retry failed on %.0f, skipped retry on %.0f.\n" + "%s.", + vacrelstats->new_dead_tuples, + nunused, + empty_pages, + retry_page_count, retry_fail_count, retry_pages_skipped, + pg_rusage_show(&ru0)) + )); + else if (cleanup_lock_waits) + ereport(elevel, + (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u out of %u pages", + RelationGetRelationName(onerel), + vacrelstats->tuples_deleted, vacrelstats->scanned_tuples, + vacrelstats->scanned_pages, nblocks), + errdetail("%.0f dead row versions cannot be removed yet.\n" + "There were %.0f unused item pointers.\n" + "%u pages are entirely empty.\n" + "Waited for cleanup lock on %.0f pages.\n" + "%s.", + vacrelstats->new_dead_tuples, + nunused, + empty_pages, + cleanup_lock_waits, + pg_rusage_show(&ru0)) + )); + else + ereport(elevel, + (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u out of %u pages", + RelationGetRelationName(onerel), + vacrelstats->tuples_deleted, vacrelstats->scanned_tuples, + vacrelstats->scanned_pages, nblocks), + errdetail("%.0f dead row versions cannot be removed yet.\n" + "There were %.0f unused item pointers.\n" + "%u pages are entirely empty.\n" + "%s.", + vacrelstats->new_dead_tuples, + nunused, + empty_pages, + pg_rusage_show(&ru0)) + )); } @@ -1175,6 +863,436 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats) errdetail("%s.", pg_rusage_show(&ru0)))); } +/* + * lazy_scan_page() - scan a single page for dead tuples + * + * This is broken out from lazy_scan_heap() so that we can retry cleaning pages + * that we couldn't get the cleanup lock on. Caller must have a cleanup lock on + * the heap buffer (buf), and have the appropriate visibility map buffer + * (vmbuffer) pinned. 
+ * + */ +static void +lazy_scan_page(Relation onerel, LVRelStats *vacrelstats, + BlockNumber blkno, Buffer buf, Buffer vmbuffer, xl_heap_freeze_tuple *frozen, + int nindexes, bool all_visible_according_to_vm, + BlockNumber *empty_pages, BlockNumber *vacuumed_pages, double *nunused) +{ + int nfrozen = 0; + int i; + Page page; + OffsetNumber offnum, + maxoff; + HeapTupleData tuple; + bool all_visible = true; + bool has_dead_tuples = false; + bool hastup = false; + bool tupgone; + char *relname = RelationGetRelationName(onerel); + Size freespace; + TransactionId visibility_cutoff_xid = InvalidTransactionId; + int prev_dead_count = vacrelstats->num_dead_tuples; + + /* + * I don't see a way to check onerel against buf or vmbuffer without + * BufferGetTag, which seems like overkill. + */ + Assert(BufferGetBlockNumber(buf) == blkno); + Assert(visibilitymap_pin_ok(blkno, vmbuffer)); + + vacrelstats->scanned_pages++; + + page = BufferGetPage(buf); + + if (PageIsNew(page)) + { + /* + * An all-zeroes page could be left over if a backend extends the + * relation but crashes before initializing the page. Reclaim such + * pages for use. + * + * We have to be careful here because we could be looking at a + * page that someone has just added to the relation and not yet + * been able to initialize (see RelationGetBufferForTuple). To + * protect against that, release the buffer lock, grab the + * relation extension lock momentarily, and re-lock the buffer. If + * the page is still uninitialized by then, it must be left over + * from a crashed backend, and we can initialize it. + * + * We don't really need the relation lock when this is a new or + * temp relation, but it's probably not worth the code space to + * check that, since this surely isn't a critical path. + * + * Note: the comparable code in vacuum.c need not worry because + * it's got exclusive lock on the whole relation. + */ + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + LockRelationForExtension(onerel, ExclusiveLock); + UnlockRelationForExtension(onerel, ExclusiveLock); + LockBufferForCleanup(buf); + if (PageIsNew(page)) + { + ereport(WARNING, + (errmsg("relation \"%s\" page %u is uninitialized --- fixing", + relname, blkno))); + PageInit(page, BufferGetPageSize(buf), 0); + empty_pages++; + } + freespace = PageGetHeapFreeSpace(page); + MarkBufferDirty(buf); + UnlockReleaseBuffer(buf); + + RecordPageWithFreeSpace(onerel, blkno, freespace); + return; + } + + if (PageIsEmpty(page)) + { + empty_pages++; + freespace = PageGetHeapFreeSpace(page); + + /* empty pages are always all-visible */ + if (!PageIsAllVisible(page)) + { + START_CRIT_SECTION(); + + /* mark buffer dirty before writing a WAL record */ + MarkBufferDirty(buf); + + /* + * It's possible that another backend has extended the heap, + * initialized the page, and then failed to WAL-log the page + * due to an ERROR. Since heap extension is not WAL-logged, + * recovery might try to replay our record setting the page + * all-visible and find that the page isn't initialized, which + * will cause a PANIC. To prevent that, check whether the + * page has been previously WAL-logged, and if not, do that + * now. 
+ */ + if (RelationNeedsWAL(onerel) && + PageGetLSN(page) == InvalidXLogRecPtr) + log_newpage_buffer(buf, true); + + PageSetAllVisible(page); + visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr, + vmbuffer, InvalidTransactionId); + END_CRIT_SECTION(); + } + + UnlockReleaseBuffer(buf); + RecordPageWithFreeSpace(onerel, blkno, freespace); + return; + } + + /* + * Prune all HOT-update chains in this page. + * + * We count tuples removed by the pruning step as removed by VACUUM. + */ + vacrelstats->tuples_deleted += heap_page_prune(onerel, buf, OldestXmin, false, + &vacrelstats->latestRemovedXid); + + /* + * Now scan the page to collect vacuumable items and check for tuples + * requiring freezing. + */ + + /* + * Note: If you change anything in the loop below, also look at + * heap_page_is_all_visible to see if that needs to be changed. + */ + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid; + + itemid = PageGetItemId(page, offnum); + + /* Unused items require no processing, but we count 'em */ + if (!ItemIdIsUsed(itemid)) + { + nunused += 1; + continue; + } + + /* Redirect items mustn't be touched */ + if (ItemIdIsRedirected(itemid)) + { + hastup = true; /* this page won't be truncatable */ + continue; + } + + ItemPointerSet(&(tuple.t_self), blkno, offnum); + + /* + * DEAD item pointers are to be vacuumed normally; but we don't + * count them in vacrelstats->tuples_deleted, else we'd be double-counting (at + * least in the common case where heap_page_prune() just freed up + * a non-HOT tuple). + */ + if (ItemIdIsDead(itemid)) + { + lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); + all_visible = false; + continue; + } + + Assert(ItemIdIsNormal(itemid)); + + tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); + tuple.t_len = ItemIdGetLength(itemid); + tuple.t_tableOid = RelationGetRelid(onerel); + + tupgone = false; + + switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf)) + { + case HEAPTUPLE_DEAD: + + /* + * Ordinarily, DEAD tuples would have been removed by + * heap_page_prune(), but it's possible that the tuple + * state changed since heap_page_prune() looked. In + * particular an INSERT_IN_PROGRESS tuple could have + * changed to DEAD if the inserter aborted. So this + * cannot be considered an error condition. + * + * If the tuple is HOT-updated then it must only be + * removed by a prune operation; so we keep it just as if + * it were RECENTLY_DEAD. Also, if it's a heap-only + * tuple, we choose to keep it, because it'll be a lot + * cheaper to get rid of it in the next pruning pass than + * to treat it like an indexed tuple. + */ + if (HeapTupleIsHotUpdated(&tuple) || + HeapTupleIsHeapOnly(&tuple)) + vacrelstats->new_dead_tuples += 1; + else + tupgone = true; /* we can delete the tuple */ + all_visible = false; + break; + case HEAPTUPLE_LIVE: + /* Tuple is good --- but let's do some validity checks */ + if (onerel->rd_rel->relhasoids && + !OidIsValid(HeapTupleGetOid(&tuple))) + elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid", + relname, blkno, offnum); + + /* + * Is the tuple definitely visible to all transactions? + * + * NB: Like with per-tuple hint bits, we can't set the + * PD_ALL_VISIBLE flag if the inserter committed + * asynchronously. See SetHintBits for more info. Check + * that the tuple is hinted xmin-committed because of + * that. 
+ */ + if (all_visible) + { + TransactionId xmin; + + if (!HeapTupleHeaderXminCommitted(tuple.t_data)) + { + all_visible = false; + break; + } + + /* + * The inserter definitely committed. But is it old + * enough that everyone sees it as committed? + */ + xmin = HeapTupleHeaderGetXmin(tuple.t_data); + if (!TransactionIdPrecedes(xmin, OldestXmin)) + { + all_visible = false; + break; + } + + /* Track newest xmin on page. */ + if (TransactionIdFollows(xmin, visibility_cutoff_xid)) + visibility_cutoff_xid = xmin; + } + break; + case HEAPTUPLE_RECENTLY_DEAD: + + /* + * If tuple is recently deleted then we must not remove it + * from relation. + */ + vacrelstats->new_dead_tuples += 1; + all_visible = false; + break; + case HEAPTUPLE_INSERT_IN_PROGRESS: + /* This is an expected case during concurrent vacuum */ + all_visible = false; + break; + case HEAPTUPLE_DELETE_IN_PROGRESS: + /* This is an expected case during concurrent vacuum */ + all_visible = false; + break; + default: + elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); + break; + } + + if (tupgone) + { + lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); + HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data, + &vacrelstats->latestRemovedXid); + vacrelstats->tuples_deleted += 1; + has_dead_tuples = true; + } + else + { + vacrelstats->scanned_tuples += 1; + hastup = true; + + /* + * Each non-removable tuple must be checked to see if it needs + * freezing. Note we already have exclusive buffer lock. + */ + if (heap_prepare_freeze_tuple(tuple.t_data, FreezeLimit, + MultiXactCutoff, &frozen[nfrozen])) + frozen[nfrozen++].offset = offnum; + } + } /* scan along page */ + + /* + * If we froze any tuples, mark the buffer dirty, and write a WAL + * record recording the changes. We must log the changes to be + * crash-safe against future truncation of CLOG. + */ + if (nfrozen > 0) + { + START_CRIT_SECTION(); + + MarkBufferDirty(buf); + + /* execute collected freezes */ + for (i = 0; i < nfrozen; i++) + { + ItemId itemid; + HeapTupleHeader htup; + + itemid = PageGetItemId(page, frozen[i].offset); + htup = (HeapTupleHeader) PageGetItem(page, itemid); + + heap_execute_freeze_tuple(htup, &frozen[i]); + } + + /* Now WAL-log freezing if neccessary */ + if (RelationNeedsWAL(onerel)) + { + XLogRecPtr recptr; + + recptr = log_heap_freeze(onerel, buf, FreezeLimit, + frozen, nfrozen); + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + } + + /* + * If there are no indexes then we can vacuum the page right now + * instead of doing a second scan. + */ + if (nindexes == 0 && + vacrelstats->num_dead_tuples > 0) + { + /* Remove tuples from heap */ + lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats, &vmbuffer); + has_dead_tuples = false; + + /* + * Forget the now-vacuumed tuples, and press on, but be careful + * not to reset latestRemovedXid since we want that value to be + * valid. + */ + vacrelstats->num_dead_tuples = 0; + vacuumed_pages++; + } + + freespace = PageGetHeapFreeSpace(page); + + /* mark page all-visible, if appropriate */ + if (all_visible && !all_visible_according_to_vm) + { + /* + * It should never be the case that the visibility map page is set + * while the page-level bit is clear, but the reverse is allowed + * (if checksums are not enabled). Regardless, set the both bits + * so that we get back in sync. + * + * NB: If the heap page is all-visible but the VM bit is not set, + * we don't need to dirty the heap page. 
However, if checksums + * are enabled, we do need to make sure that the heap page is + * dirtied before passing it to visibilitymap_set(), because it + * may be logged. Given that this situation should only happen in + * rare cases after a crash, it is not worth optimizing. + */ + PageSetAllVisible(page); + MarkBufferDirty(buf); + visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr, + vmbuffer, visibility_cutoff_xid); + } + + /* + * As of PostgreSQL 9.2, the visibility map bit should never be set if + * the page-level bit is clear. However, it's possible that the bit + * got cleared after we checked it and before we took the buffer + * content lock, so we must recheck before jumping to the conclusion + * that something bad has happened. + */ + else if (all_visible_according_to_vm && !PageIsAllVisible(page) + && visibilitymap_test(onerel, blkno, &vmbuffer)) + { + elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", + relname, blkno); + visibilitymap_clear(onerel, blkno, vmbuffer); + } + + /* + * It's possible for the value returned by GetOldestXmin() to move + * backwards, so it's not wrong for us to see tuples that appear to + * not be visible to everyone yet, while PD_ALL_VISIBLE is already + * set. The real safe xmin value never moves backwards, but + * GetOldestXmin() is conservative and sometimes returns a value + * that's unnecessarily small, so if we see that contradiction it just + * means that the tuples that we think are not visible to everyone yet + * actually are, and the PD_ALL_VISIBLE flag is correct. + * + * There should never be dead tuples on a page with PD_ALL_VISIBLE + * set, however. + */ + else if (PageIsAllVisible(page) && has_dead_tuples) + { + elog(WARNING, "page containing dead tuples is marked as all-visible in relation \"%s\" page %u", + relname, blkno); + PageClearAllVisible(page); + MarkBufferDirty(buf); + visibilitymap_clear(onerel, blkno, vmbuffer); + } + + UnlockReleaseBuffer(buf); + + /* Remember the location of the last page with nonremovable tuples */ + if (hastup) + vacrelstats->nonempty_pages = blkno + 1; + + /* + * If we remembered any tuples for deletion, then the page will be + * visited again by lazy_vacuum_heap, which will compute and record + * its post-compaction free space. If not, then we're done with this + * page, so remember its free space as-is. (This path will always be + * taken if there are no indexes.) + */ + if (vacrelstats->num_dead_tuples == prev_dead_count) + RecordPageWithFreeSpace(onerel, blkno, freespace); +} /* * lazy_vacuum_page() -- free dead tuples on a page -- 2.1.2
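
For readers skimming the diff, the following is a minimal standalone sketch
(not part of the patch) of the retry-array pattern the commit message
describes: pages whose cleanup lock cannot be taken immediately are remembered
in a fixed-size array and given one more attempt after the main pass.
try_cleanup_lock() and process_page() are hypothetical stand-ins for
ConditionalLockBufferForCleanup() and lazy_scan_page(); the counters mirror
retry_page_count, retry_fail_count, and retry_pages_skipped.

	/* Sketch only -- illustrates the retry-array approach under the
	 * assumptions above; the real patch operates on Buffers, not bare
	 * block numbers. */
	#include <stdbool.h>
	#include <stdint.h>

	#define MAX_RETRY_PAGES 512

	typedef uint32_t BlockNum;

	/* stand-in stubs so the sketch compiles on its own */
	static bool try_cleanup_lock(BlockNum blkno) { (void) blkno; return true; }
	static void process_page(BlockNum blkno)     { (void) blkno; }

	static void
	scan_with_retry(BlockNum nblocks)
	{
		BlockNum	retry_pages[MAX_RETRY_PAGES];
		int			retry_insert_ptr = 0;
		long		retried = 0,
					retry_failed = 0,
					retry_skipped = 0;

		/* First pass: skip pages we cannot lock right away, but remember them. */
		for (BlockNum blkno = 0; blkno < nblocks; blkno++)
		{
			if (!try_cleanup_lock(blkno))
			{
				if (retry_insert_ptr < MAX_RETRY_PAGES)
					retry_pages[retry_insert_ptr++] = blkno;
				else
					retry_skipped++;	/* array full: this page is not retried */
				continue;
			}
			process_page(blkno);
		}

		/* Second pass: one more attempt at each remembered page. */
		for (int i = 0; i < retry_insert_ptr; i++)
		{
			if (try_cleanup_lock(retry_pages[i]))
			{
				retried++;
				process_page(retry_pages[i]);
			}
			else
				retry_failed++;
		}

		(void) retried;
		(void) retry_failed;
		(void) retry_skipped;
	}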