From 49582f6707611a572b441bf692fd925e9d658781 Mon Sep 17 00:00:00 2001 From: amit Date: Wed, 26 Jul 2017 14:42:47 +0900 Subject: [PATCH 3/3] WIP: Defer opening and locking partitions to set_append_rel_size --- src/backend/catalog/partition.c | 20 ++ src/backend/nodes/copyfuncs.c | 17 -- src/backend/nodes/equalfuncs.c | 12 -- src/backend/nodes/outfuncs.c | 57 +++++- src/backend/optimizer/path/allpaths.c | 357 +++++++++++++++++++++++++++++++-- src/backend/optimizer/plan/planner.c | 106 ++++++++-- src/backend/optimizer/prep/prepunion.c | 266 +++++++++++++++--------- src/backend/optimizer/util/plancat.c | 44 ++++ src/backend/optimizer/util/relnode.c | 81 +++++++- src/backend/utils/cache/lsyscache.c | 50 +++++ src/include/catalog/partition.h | 4 + src/include/nodes/nodes.h | 5 +- src/include/nodes/relation.h | 93 +++++++-- src/include/optimizer/plancat.h | 1 + src/include/optimizer/prep.h | 3 + src/include/utils/lsyscache.h | 2 + src/test/regress/expected/insert.out | 4 +- 17 files changed, 938 insertions(+), 184 deletions(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index c972760fe4..41127a584e 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -1161,6 +1161,26 @@ RelationGetPartitionDispatchInfo(Relation rel, Assert((offset + 1) == list_length(*ptinfos)); } +/* + * get_partitions_for_keys + * Returns the list of indexes (from pd->indexes) of the partitions that + * will need to be scanned for the given scan keys. + * + * TODO: add the interface to pass the query scan keys and the logic to look + * up partitions using those keys. 
+ */ +List * +get_partitions_for_keys(PartitionDispatch pd) +{ + int i; + List *result = NIL; + + for (i = 0; i < pd->partdesc->nparts; i++) + result = lappend_int(result, pd->indexes[i]); + + return result; +} + /* Module-local functions */ /* diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 72041693df..8d17d7f52c 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -2249,20 +2249,6 @@ _copyAppendRelInfo(const AppendRelInfo *from) } /* - * _copyPartitionedChildRelInfo - */ -static PartitionedChildRelInfo * -_copyPartitionedChildRelInfo(const PartitionedChildRelInfo *from) -{ - PartitionedChildRelInfo *newnode = makeNode(PartitionedChildRelInfo); - - COPY_SCALAR_FIELD(parent_relid); - COPY_NODE_FIELD(child_rels); - - return newnode; -} - -/* * _copyPlaceHolderInfo */ static PlaceHolderInfo * @@ -4994,9 +4980,6 @@ copyObjectImpl(const void *from) case T_AppendRelInfo: retval = _copyAppendRelInfo(from); break; - case T_PartitionedChildRelInfo: - retval = _copyPartitionedChildRelInfo(from); - break; case T_PlaceHolderInfo: retval = _copyPlaceHolderInfo(from); break; diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index 8d92c03633..fb248f31f3 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -905,15 +905,6 @@ _equalAppendRelInfo(const AppendRelInfo *a, const AppendRelInfo *b) } static bool -_equalPartitionedChildRelInfo(const PartitionedChildRelInfo *a, const PartitionedChildRelInfo *b) -{ - COMPARE_SCALAR_FIELD(parent_relid); - COMPARE_NODE_FIELD(child_rels); - - return true; -} - -static bool _equalPlaceHolderInfo(const PlaceHolderInfo *a, const PlaceHolderInfo *b) { COMPARE_SCALAR_FIELD(phid); @@ -3155,9 +3146,6 @@ equal(const void *a, const void *b) case T_AppendRelInfo: retval = _equalAppendRelInfo(a, b); break; - case T_PartitionedChildRelInfo: - retval = _equalPartitionedChildRelInfo(a, b); - break; case T_PlaceHolderInfo: retval = 
_equalPlaceHolderInfo(a, b); break; diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 5ce3c7c599..1c7caca013 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -2211,7 +2211,7 @@ _outPlannerInfo(StringInfo str, const PlannerInfo *node) WRITE_NODE_FIELD(full_join_clauses); WRITE_NODE_FIELD(join_info_list); WRITE_NODE_FIELD(append_rel_list); - WRITE_NODE_FIELD(pcinfo_list); + WRITE_NODE_FIELD(prinfo_list); WRITE_NODE_FIELD(rowMarks); WRITE_NODE_FIELD(placeholder_list); WRITE_NODE_FIELD(fkey_list); @@ -2285,6 +2285,12 @@ _outRelOptInfo(StringInfo str, const RelOptInfo *node) WRITE_NODE_FIELD(joininfo); WRITE_BOOL_FIELD(has_eclass_joins); WRITE_BITMAPSET_FIELD(top_parent_relids); + WRITE_INT_FIELD(num_parted); + /* don't bother printing partition_infos */ + WRITE_INT_FIELD(num_leaf_parts); + /* don't bother printing leaf_part_infos */ + WRITE_NODE_FIELD(live_partition_painfos); + WRITE_UINT_FIELD(root_parent_relid); } static void @@ -2510,12 +2516,42 @@ _outAppendRelInfo(StringInfo str, const AppendRelInfo *node) } static void -_outPartitionedChildRelInfo(StringInfo str, const PartitionedChildRelInfo *node) +_outPartitionInfo(StringInfo str, const PartitionInfo *node) { - WRITE_NODE_TYPE("PARTITIONEDCHILDRELINFO"); + WRITE_NODE_TYPE("PARTITIONINFO"); + + WRITE_UINT_FIELD(relid); + /* Don't bother writing out the PartitionDispatch object */ +} + +static void +_outLeafPartitionInfo(StringInfo str, const LeafPartitionInfo *node) +{ + WRITE_NODE_TYPE("LEAFPARTITIONINFO"); + + WRITE_OID_FIELD(reloid); + WRITE_UINT_FIELD(relid); +} + +static void +_outPartitionAppendInfo(StringInfo str, const PartitionAppendInfo *node) +{ + WRITE_NODE_TYPE("PARTITIONAPPENDINFO"); + + WRITE_UINT_FIELD(parent_relid); + WRITE_NODE_FIELD(live_partition_relids); +} + +static void +_outPartitionRootInfo(StringInfo str, const PartitionRootInfo *node) +{ + WRITE_NODE_TYPE("PARTITIONROOTINFO"); WRITE_UINT_FIELD(parent_relid); - 
WRITE_NODE_FIELD(child_rels); + WRITE_NODE_FIELD(partition_infos); + WRITE_NODE_FIELD(partitioned_relids); + WRITE_NODE_FIELD(leaf_part_infos); + WRITE_NODE_FIELD(orig_leaf_part_oids); } static void @@ -4043,8 +4079,17 @@ outNode(StringInfo str, const void *obj) case T_AppendRelInfo: _outAppendRelInfo(str, obj); break; - case T_PartitionedChildRelInfo: - _outPartitionedChildRelInfo(str, obj); + case T_PartitionInfo: + _outPartitionInfo(str, obj); + break; + case T_LeafPartitionInfo: + _outLeafPartitionInfo(str, obj); + break; + case T_PartitionAppendInfo: + _outPartitionAppendInfo(str, obj); + break; + case T_PartitionRootInfo: + _outPartitionRootInfo(str, obj); break; case T_PlaceHolderInfo: _outPlaceHolderInfo(str, obj); diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 2d7e1d84d0..c9c0b85cd9 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -20,6 +20,7 @@ #include "access/sysattr.h" #include "access/tsmapi.h" +#include "catalog/partition.h" #include "catalog/pg_class.h" #include "catalog/pg_operator.h" #include "catalog/pg_proc.h" @@ -43,6 +44,8 @@ #include "parser/parse_clause.h" #include "parser/parsetree.h" #include "rewrite/rewriteManip.h" +#include "storage/lmgr.h" +#include "utils/builtins.h" #include "utils/lsyscache.h" @@ -334,7 +337,7 @@ set_rel_size(PlannerInfo *root, RelOptInfo *rel, */ set_dummy_rel_pathlist(rel); } - else if (rte->inh) + else if (rte->inh || rte->relkind == RELKIND_PARTITIONED_TABLE) { /* It's an "append relation", process accordingly */ set_append_rel_size(root, rel, rti, rte); @@ -425,7 +428,7 @@ set_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, { /* We already proved the relation empty, so nothing more to do */ } - else if (rte->inh) + else if (rte->inh || rte->relkind == RELKIND_PARTITIONED_TABLE) { /* It's an "append relation", process accordingly */ set_append_rel_pathlist(root, rel, rti, rte); @@ -845,6 +848,166 @@ 
set_foreign_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) } /* + * get_rel_partitions_recurse + * Find partitions of the partitioned table described in partinfo, + * recursing for those partitions that are themselves partitioned tables + * + * rootrel is the root of the partition tree of which this table is a part. + * We create a PartitionAppendInfo for this partitioned table and append it to + * rootrel->live_partition_painfos. + * + * Returns the list of OIDs of the leaf partitions of this table. + */ +static List * +get_rel_partitions_recurse(RelOptInfo *rootrel, + PartitionInfo *partinfo, + PartitionInfo **all_partinfos, + LeafPartitionInfo **leaf_part_infos) +{ + PartitionAppendInfo *painfo; + List *indexes; + List *result = NIL, + *my_live_partitions = NIL; + ListCell *l; + + /* + * Create a PartitionAppendInfo to map this table to the child tables + * that will be its Append children. + */ + painfo = makeNode(PartitionAppendInfo); + painfo->parent_relid = partinfo->relid; + + /* They will all be under the root table's Append node. */ + rootrel->live_partition_painfos = lappend(rootrel->live_partition_painfos, + painfo); + + /* + * TODO: collect the keys by looking at the clauses in + * rootrel->baserestrictinfo considering this table's partition keys. + */ + + /* Ask partition.c which partitions it thinks match the keys. */ + indexes = get_partitions_for_keys(partinfo->pd); + + /* Collect leaf partitions in the result list and recurse for others.
*/ + foreach(l, indexes) + { + int index = lfirst_int(l); + + if (index >= 0) + { + LeafPartitionInfo *lpinfo = leaf_part_infos[index]; + + result = lappend_oid(result, lpinfo->reloid); + my_live_partitions = lappend_int(my_live_partitions, + lpinfo->relid); + } + else + { + PartitionInfo *recurse_partinfo = all_partinfos[-index]; + List *my_leaf_partitions; + + my_live_partitions = lappend_int(my_live_partitions, + recurse_partinfo->relid); + my_leaf_partitions = get_rel_partitions_recurse(rootrel, + recurse_partinfo, + all_partinfos, + leaf_part_infos); + result = list_concat(result, my_leaf_partitions); + } + } + + painfo->live_partition_relids = my_live_partitions; + + return result; +} + +/* + * get_rel_partitions + * Recursively find partitions of rel + */ +static List * +get_rel_partitions(RelOptInfo *rel) +{ + return get_rel_partitions_recurse(rel, + rel->partition_infos[0], + rel->partition_infos, + rel->leaf_part_infos); +} + +/* + * find_partitions_for_query + * Find and lock partitions of rel relevant to this query + * + * Note that we only ever need to lock the leaf partitions, because the + * partitioned tables in the partition tree have already been locked. + */ +static void +find_partitions_for_query(PlannerInfo *root, RelOptInfo *rel) +{ + List *leaf_part_oids = NIL; + ListCell *l; + PlanRowMark *rc = NULL; + int lockmode; + int num_leaf_parts, + i; + Oid *leaf_part_oids_array; + PartitionRootInfo *prinfo = NULL; + + /* Find partitions. */ + Assert(rel->partition_infos != NULL); + leaf_part_oids = get_rel_partitions(rel); + + /* Convert the list to an array and sort for binary searching later. */ + num_leaf_parts = list_length(leaf_part_oids); + leaf_part_oids_array = (Oid *) palloc(num_leaf_parts * sizeof(Oid)); + i = 0; + foreach(l, leaf_part_oids) + { + leaf_part_oids_array[i++] = lfirst_oid(l); + } + qsort(leaf_part_oids_array, num_leaf_parts, sizeof(Oid), oid_cmp); + + /* + * Now lock partitions.
Note that rel cannot be a result relation or we + * wouldn't be here (inheritance_planner is where result relations go). + */ + rc = get_plan_rowmark(root->rowMarks, rel->relid); + if (rc && RowMarkRequiresRowShareLock(rc->markType)) + lockmode = RowShareLock; + else + lockmode = AccessShareLock; + + /* + * We lock leaf partitions in the order in which find_all_inheritors + * found them in expand_inherited_rtentry(). Find that list by locating + * the PartitionRootInfo for this table. + */ + foreach(l, root->prinfo_list) + { + prinfo = lfirst(l); + + if (rel->relid == prinfo->parent_relid) + break; + } + Assert(prinfo != NULL && rel->relid == prinfo->parent_relid); + foreach(l, prinfo->orig_leaf_part_oids) + { + Oid relid = lfirst_oid(l); + Oid *test; + + /* Will this leaf partition be scanned? */ + test = (Oid *) bsearch(&relid, + leaf_part_oids_array, + num_leaf_parts, + sizeof(Oid), oid_cmp); + /* Yep, so lock. */ + if (test != NULL) + LockRelationOid(relid, lockmode); + } +} + +/* * set_append_rel_size * Set size estimates for a simple "append relation" * @@ -866,6 +1029,134 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, double *parent_attrsizes; int nattrs; ListCell *l; + List *rel_appinfos = NIL; + + /* + * Collect a list of child AppendRelInfos, which in the non-partitioned + * case will be found in root->append_rel_list. In the partitioned + * table's case, we didn't build any AppendRelInfos yet. We will + * do so after figuring out which of the table's child tables + * (aka partitions) will need to be scanned for this query.
+ */ + if (rte->relkind != RELKIND_PARTITIONED_TABLE) + { + foreach(l, root->append_rel_list) + { + AppendRelInfo *appinfo = lfirst(l); + + /* append_rel_list contains all append rels; ignore others */ + if (appinfo->parent_relid == parentRTindex) + rel_appinfos = lappend(rel_appinfos, appinfo); + } + } + else + { + List *live_partitions = NIL; + Relation parent; + List *parent_vars; + RelOptInfo *rootrel; + + /* + * If this is a partitioned table root, we will determine all the + * partitions in this partition tree that we need to scan for this + * query. Among those, partitions that have not yet been locked (viz. + * the leaf partitions), will be. + */ + if (rel->partition_infos != NULL) + { + PartitionAppendInfo *painfo; + + rootrel = rel; + find_partitions_for_query(root, rel); + painfo = linitial(rel->live_partition_painfos); + Assert(rti == painfo->parent_relid); + live_partitions = painfo->live_partition_relids; + } + else + { + /* + * Just need to get hold of the PartitionAppendInfo via the root + * parent's RelOptInfo. If no entry matches, live_partitions stays NIL. + */ + rootrel = root->simple_rel_array[rel->root_parent_relid]; + foreach(l, rootrel->live_partition_painfos) + { + PartitionAppendInfo *painfo = lfirst(l); + + if (rti == painfo->parent_relid) + { + live_partitions = painfo->live_partition_relids; + break; + } + } + } + + /* + * Create an AppendRelInfo and a RelOptInfo for every candidate + * partition. + */ + parent = heap_open(rte->relid, NoLock); + parent_vars = build_rel_vars(rte, rti); + foreach(l, live_partitions) + { + Index childRTindex = lfirst_int(l); + RangeTblEntry *childrte = planner_rt_fetch(childRTindex, root); + Relation child; + AppendRelInfo *appinfo; + RelOptInfo *childrel; + + child = heap_open(childrte->relid, NoLock); /* already locked!
*/ + appinfo = makeNode(AppendRelInfo); + appinfo->parent_relid = rti; + appinfo->child_relid = childRTindex; + appinfo->parent_reltype = parent->rd_rel->reltype; + appinfo->child_reltype = child->rd_rel->reltype; + appinfo->translated_vars = map_partition_varattnos(parent_vars, + rti, + child, parent, + NULL); + ChangeVarNodes((Node *) appinfo->translated_vars, + rti, childRTindex, 0); + appinfo->parent_reloid = rte->relid; + rel_appinfos = lappend(rel_appinfos, appinfo); + root->append_rel_list = lappend(root->append_rel_list, appinfo); + + /* + * Translate the column permissions bitmaps to the child's attnums + * (we have to build the translated_vars list before we can do + * this). But if this is the parent table, leave copyObject's + * result alone. + * + * Note: we need to do this even though the executor won't run any + * permissions checks on the child RTE. The + * insertedCols/updatedCols bitmaps may be examined for + * trigger-firing purposes. + */ + childrte->selectedCols = translate_col_privs(rte->selectedCols, + appinfo->translated_vars); + childrte->insertedCols = translate_col_privs(rte->insertedCols, + appinfo->translated_vars); + childrte->updatedCols = translate_col_privs(rte->updatedCols, + appinfo->translated_vars); + + childrel = build_simple_rel(root, childRTindex, rel); + childrel->root_parent_relid = rootrel->relid; + Assert(childrel->reloptkind == RELOPT_OTHER_MEMBER_REL); + + /* Copy the data that create_lateral_join_info() created */ + Assert(childrel->direct_lateral_relids == NULL); + childrel->direct_lateral_relids = rel->direct_lateral_relids; + Assert(childrel->lateral_relids == NULL); + childrel->lateral_relids = rel->lateral_relids; + Assert(childrel->lateral_referencers == NULL); + childrel->lateral_referencers = rel->lateral_referencers; + + root->total_table_pages += childrel->pages; + + heap_close(child, NoLock); + } + heap_close(parent, NoLock); + } Assert(IS_SIMPLE_REL(rel)); @@ -889,7 +1180,7 @@ 
set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, nattrs = rel->max_attr - rel->min_attr + 1; parent_attrsizes = (double *) palloc0(nattrs * sizeof(double)); - foreach(l, root->append_rel_list) + foreach(l, rel_appinfos) { AppendRelInfo *appinfo = (AppendRelInfo *) lfirst(l); int childRTindex; @@ -902,10 +1193,6 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, ListCell *childvars; ListCell *lc; - /* append_rel_list contains all append rels; ignore others */ - if (appinfo->parent_relid != parentRTindex) - continue; - childRTindex = appinfo->child_relid; childRTE = root->simple_rte_array[childRTindex]; @@ -1211,24 +1498,61 @@ set_append_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, int parentRTindex = rti; List *live_childrels = NIL; ListCell *l; + List *append_rel_children = NIL; + + if (rte->relkind != RELKIND_PARTITIONED_TABLE) + { + foreach(l, root->append_rel_list) + { + AppendRelInfo *appinfo = lfirst(l); + + /* append_rel_list contains all append rels; ignore others */ + if (appinfo->parent_relid == parentRTindex) + append_rel_children = lappend_int(append_rel_children, + appinfo->child_relid); + } + } + else + { + /* For a partitioned table, first find its PartitionAppendInfo */ + if (rel->live_partition_painfos != NIL) + { + PartitionAppendInfo *painfo; + + /* This is the root partitioned rel. */ + painfo = linitial(rel->live_partition_painfos); + append_rel_children = painfo->live_partition_relids; + } + else + { + RelOptInfo *rootrel; + + /* Non-root partitioned table. Get it from the root rel. */ + rootrel = root->simple_rel_array[rel->root_parent_relid]; + foreach(l, rootrel->live_partition_painfos) + { + PartitionAppendInfo *painfo = lfirst(l); + + if (rti == painfo->parent_relid) + { + append_rel_children = painfo->live_partition_relids; + break; + } + } + } + } /* * Generate access paths for each member relation, and remember the * non-dummy children. 
*/ - foreach(l, root->append_rel_list) + foreach(l, append_rel_children) { - AppendRelInfo *appinfo = (AppendRelInfo *) lfirst(l); - int childRTindex; + int childRTindex = lfirst_int(l); RangeTblEntry *childRTE; RelOptInfo *childrel; - /* append_rel_list contains all append rels; ignore others */ - if (appinfo->parent_relid != parentRTindex) - continue; - /* Re-locate the child RTE and RelOptInfo */ - childRTindex = appinfo->child_relid; childRTE = root->simple_rte_array[childRTindex]; childrel = root->simple_rel_array[childRTindex]; @@ -1289,7 +1613,8 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte; rte = planner_rt_fetch(rel->relid, root); - if (rte->relkind == RELKIND_PARTITIONED_TABLE) + /* Note that only a root partitioned table would have inh flag set. */ + if (rte->relkind == RELKIND_PARTITIONED_TABLE && rte->inh) { partitioned_rels = get_partitioned_child_rels(root, rel->relid); /* The root partitioned table is included as a child rel */ diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index fdef00ab39..09dd32de79 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -514,7 +514,7 @@ subquery_planner(PlannerGlobal *glob, Query *parse, root->multiexpr_params = NIL; root->eq_classes = NIL; root->append_rel_list = NIL; - root->pcinfo_list = NIL; + root->prinfo_list = NIL; root->rowMarks = NIL; memset(root->upper_rels, 0, sizeof(root->upper_rels)); memset(root->upper_targets, 0, sizeof(root->upper_targets)); @@ -1050,6 +1050,93 @@ inheritance_planner(PlannerInfo *root) Index rti; RangeTblEntry *parent_rte; List *partitioned_rels = NIL; + List *rel_appinfos = NIL; + ListCell *l; + + parent_rte = rt_fetch(parentRTindex, root->parse->rtable); + if (parent_rte->relkind != RELKIND_PARTITIONED_TABLE) + { + foreach(l, root->append_rel_list) + { + AppendRelInfo *appinfo = lfirst(l); + + /* append_rel_list contains all append rels; ignore others */ + if 
(appinfo->parent_relid == parentRTindex) + rel_appinfos = lappend(rel_appinfos, appinfo); + } + } + else + { + PartitionRootInfo *prinfo = NULL; + Relation parent; + List *parent_vars = build_rel_vars(parent_rte, parentRTindex); + + /* Find the PartitionRootInfo for this rel */ + foreach(l, root->prinfo_list) + { + prinfo = lfirst(l); + + if (prinfo->parent_relid == parentRTindex) + break; + } + Assert(prinfo != NULL && prinfo->parent_relid == parentRTindex); + + parent = heap_open(parent_rte->relid, NoLock); + foreach(l, prinfo->leaf_part_infos) + { + LeafPartitionInfo *lpinfo = lfirst(l); + Index childRTindex = lpinfo->relid; + RangeTblEntry *childrte = planner_rt_fetch(childRTindex, root); + Relation child; + AppendRelInfo *appinfo; + + if (childrte->relkind == RELKIND_PARTITIONED_TABLE) + continue; + + /* + * We'll need RowExclusiveLock, because just like the parent, each + * child is a result relation. + */ + child = heap_open(childrte->relid, RowExclusiveLock); + appinfo = makeNode(AppendRelInfo); + appinfo->parent_relid = parentRTindex; + appinfo->child_relid = childRTindex; + appinfo->parent_reltype = parent->rd_rel->reltype; + appinfo->child_reltype = child->rd_rel->reltype; + appinfo->translated_vars = map_partition_varattnos(parent_vars, + parentRTindex, + child, parent, + NULL); + ChangeVarNodes((Node *) appinfo->translated_vars, + parentRTindex, childRTindex, 0); + appinfo->parent_reloid = RelationGetRelid(parent); + rel_appinfos = lappend(rel_appinfos, appinfo); + root->append_rel_list = lappend(root->append_rel_list, appinfo); + + /* + * Translate the column permissions bitmaps to the child's attnums + * (we have to build the translated_vars list before we can do + * this). But if this is the parent table, leave copyObject's + * result alone. + * + * Note: we need to do this even though the executor won't run any + * permissions checks on the child RTE. The + * insertedCols/updatedCols bitmaps may be examined for + * trigger-firing purposes.
+ */ + childrte->selectedCols = + translate_col_privs(parent_rte->selectedCols, + appinfo->translated_vars); + childrte->insertedCols = + translate_col_privs(parent_rte->insertedCols, + appinfo->translated_vars); + childrte->updatedCols = + translate_col_privs(parent_rte->updatedCols, + appinfo->translated_vars); + heap_close(child, NoLock); + } + heap_close(parent, NoLock); + } Assert(parse->commandType != CMD_INSERT); @@ -1115,14 +1202,13 @@ inheritance_planner(PlannerInfo *root) * opposite in the case of non-partitioned inheritance parent as described * below. */ - parent_rte = rt_fetch(parentRTindex, root->parse->rtable); if (parent_rte->relkind == RELKIND_PARTITIONED_TABLE) nominalRelation = parentRTindex; /* * And now we can get on with generating a plan for each child table. */ - foreach(lc, root->append_rel_list) + foreach(lc, rel_appinfos) { AppendRelInfo *appinfo = (AppendRelInfo *) lfirst(lc); PlannerInfo *subroot; @@ -1130,10 +1216,6 @@ inheritance_planner(PlannerInfo *root) RelOptInfo *sub_final_rel; Path *subpath; - /* append_rel_list contains all append rels; ignore others */ - if (appinfo->parent_relid != parentRTindex) - continue; - /* * We need a working copy of the PlannerInfo so that we can control * propagation of information back to the main copy. @@ -6070,7 +6152,7 @@ plan_cluster_use_sort(Oid tableOid, Oid indexOid) * Returns a list of the RT indexes of the partitioned child relations * with rti as the root parent RT index. * - * Note: Only call this function on RTEs known to be partitioned tables. + * Note: Only call this function on RTEs known to be a root partitioned table. 
*/ List * get_partitioned_child_rels(PlannerInfo *root, Index rti) @@ -6078,13 +6160,13 @@ get_partitioned_child_rels(PlannerInfo *root, Index rti) List *result = NIL; ListCell *l; - foreach(l, root->pcinfo_list) + foreach(l, root->prinfo_list) { - PartitionedChildRelInfo *pc = lfirst(l); + PartitionRootInfo *prinfo = lfirst(l); - if (pc->parent_relid == rti) + if (prinfo->parent_relid == rti) { - result = pc->child_rels; + result = prinfo->partitioned_relids; break; } } diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c index ee2e066263..4b4d95eb63 100644 --- a/src/backend/optimizer/prep/prepunion.c +++ b/src/backend/optimizer/prep/prepunion.c @@ -105,8 +105,6 @@ static void make_inh_translation_list(Relation oldrelation, Relation newrelation, Index newvarno, List **translated_vars); -static Bitmapset *translate_col_privs(const Bitmapset *parent_privs, - List *translated_vars); static Node *adjust_appendrel_attrs_mutator(Node *node, adjust_appendrel_attrs_context *context); static Relids adjust_child_relids(Relids relids, int nappinfos, @@ -1352,11 +1350,19 @@ expand_inherited_tables(PlannerInfo *root) /* * expand_inherited_rtentry - * Check whether a rangetable entry represents an inheritance set. - * If so, add entries for all the child tables to the query's - * rangetable, and build AppendRelInfo nodes for all the child tables - * and add them to root->append_rel_list. If not, clear the entry's - * "inh" flag to prevent later code from looking for AppendRelInfos. + * Perform actions necessary for applying this query to an inheritance + * set if the rte represents one + * + * That includes adding entries for all the child tables to the query's + * rangetable. Also, if this query requires a PlanRowMark, generate the same + * for each child table and append them to the planner's global list + * (root->rowMarks). If the inheritance set is really a partitioned table, + * our work here is done. 
If not, we also create AppendRelInfo nodes for + * all the child tables and add them to root->append_rel_list. + * + * If it turns out that the rte is not (or no longer) an inheritance set, + * clear the entry's "inh" flag to prevent later code from looking for + * AppendRelInfos. * * Note that the original RTE is considered to represent the whole * inheritance set. The first of the generated RTEs is an RTE for the same @@ -1381,9 +1387,13 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) List *inhOIDs; List *appinfos; ListCell *l; - bool has_child; - PartitionedChildRelInfo *pcinfo; List *partitioned_child_rels = NIL; + List *partition_infos = NIL; + List *leaf_part_infos = NIL; + List *orig_leaf_part_oids; + int num_partitioned_children; + PartitionedTableInfo *ptinfo; + PartitionInfo *pinfo; /* Does RT entry allow inheritance? */ if (!rte->inh) @@ -1408,6 +1418,11 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) * relation named in the query. However, for each child relation we add * to the query, we must obtain an appropriate lock, because this will be * the first use of those relations in the parse/rewrite/plan pipeline. + * For a partitioned table, we defer locking non-partitioned child tables + * to when we actually know that it will be scanned (see below that we + * use RelationGetPartitionDispatchInfo() to get the list of child tables + * of partitioned tables, not find_all_inheritors() which would lock the + * child tables.) * * If the parent relation is the query's result relation, then we need * RowExclusiveLock. 
Otherwise, if it's accessed FOR UPDATE/SHARE, we @@ -1425,7 +1440,8 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) lockmode = AccessShareLock; /* Scan for all members of inheritance set, acquire needed locks */ - inhOIDs = find_all_inheritors(parentOID, lockmode, false, NULL, NULL); + inhOIDs = find_all_inheritors(parentOID, lockmode, true, NULL, + &num_partitioned_children); /* * Check that there's at least one descendant, else treat as no-child @@ -1461,9 +1477,17 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) { List *leaf_part_oids, *ptinfos; + int rtable_length = list_length(parse->rtable), + i; + + /* + * Keep leaf partition OIDs around so that we can lock them in this + * order when we eventually do it. + */ + orig_leaf_part_oids = list_copy_tail(inhOIDs, + num_partitioned_children + 1); - /* Discard the original list. */ - list_free(inhOIDs); + /* Discard the original inhOIDs list. */ inhOIDs = NIL; /* Request partitioning information. */ @@ -1471,14 +1495,37 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) &leaf_part_oids); /* - * First collect the partitioned child table OIDs, which includes the - * root parent at the head. + * We make a PartitionInfo object for every partitioned table in the + * tree, including the root table. We create the root table's + * PartitionInfo outside the loop, because we'd like to use its + * original RT index, whereas for the child partitioned tables, we'll + * use their to-be RT indexes. */ + ptinfo = linitial(ptinfos); + pinfo = makeNode(PartitionInfo); + pinfo->relid = rti; + pinfo->pd = ptinfo->pd; + partition_infos = list_make1(pinfo); + + /* Let there remain only the child tables' PartitionedTableInfo's */ + ptinfos = list_delete_first(ptinfos); + + /* + * First collect the partitioned child table OIDs. Note that the list + * won't contain the root table's OID because we removed its ptinfo + * from the list above. 
+ */ + i = 1; foreach(l, ptinfos) { PartitionedTableInfo *ptinfo = lfirst(l); + PartitionInfo *pinfo = makeNode(PartitionInfo); inhOIDs = lappend_oid(inhOIDs, ptinfo->relid); + pinfo->relid = rtable_length + i; + pinfo->pd = ptinfo->pd; + partition_infos = lappend(partition_infos, pinfo); + i++; } /* Concatenate the leaf partition OIDs. */ @@ -1487,7 +1534,6 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) /* Scan the inheritance set and expand it */ appinfos = NIL; - has_child = false; foreach(l, inhOIDs) { Oid childOID = lfirst_oid(l); @@ -1496,23 +1542,14 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) Index childRTindex; AppendRelInfo *appinfo; - /* Open rel if needed; we already have required locks */ - if (childOID != parentOID) - newrelation = heap_open(childOID, NoLock); - else - newrelation = oldrelation; - /* * It is possible that the parent table has children that are temp * tables of other backends. We cannot safely access such tables * (because of buffering issues), and the best thing to do seems to be * to silently ignore them. */ - if (childOID != parentOID && RELATION_IS_OTHER_TEMP(newrelation)) - { - heap_close(newrelation, lockmode); + if (childOID != parentOID && rel_is_other_temp(childOID)) continue; - } /* * Build an RTE for the child, and attach to query's rangetable list. @@ -1528,7 +1565,7 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) */ childrte = copyObject(rte); childrte->relid = childOID; - childrte->relkind = newrelation->rd_rel->relkind; + childrte->relkind = get_rel_relkind(childOID); childrte->inh = false; childrte->requiredPerms = 0; childrte->securityQuals = NIL; @@ -1536,51 +1573,6 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) childRTindex = list_length(parse->rtable); /* - * Build an AppendRelInfo for this parent and child, unless the child - * is a partitioned table. 
- */ - if (childrte->relkind != RELKIND_PARTITIONED_TABLE) - { - /* Remember if we saw a real child. */ - if (childOID != parentOID) - has_child = true; - - appinfo = makeNode(AppendRelInfo); - appinfo->parent_relid = rti; - appinfo->child_relid = childRTindex; - appinfo->parent_reltype = oldrelation->rd_rel->reltype; - appinfo->child_reltype = newrelation->rd_rel->reltype; - make_inh_translation_list(oldrelation, newrelation, childRTindex, - &appinfo->translated_vars); - appinfo->parent_reloid = parentOID; - appinfos = lappend(appinfos, appinfo); - - /* - * Translate the column permissions bitmaps to the child's attnums - * (we have to build the translated_vars list before we can do - * this). But if this is the parent table, leave copyObject's - * result alone. - * - * Note: we need to do this even though the executor won't run any - * permissions checks on the child RTE. The - * insertedCols/updatedCols bitmaps may be examined for - * trigger-firing purposes. - */ - if (childOID != parentOID) - { - childrte->selectedCols = translate_col_privs(rte->selectedCols, - appinfo->translated_vars); - childrte->insertedCols = translate_col_privs(rte->insertedCols, - appinfo->translated_vars); - childrte->updatedCols = translate_col_privs(rte->updatedCols, - appinfo->translated_vars); - } - } - else - partitioned_child_rels = lappend_int(partitioned_child_rels, - childRTindex); - - /* * Build a PlanRowMark if parent is marked FOR UPDATE/SHARE. */ if (oldrc) @@ -1604,12 +1596,78 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) */ newrc->isParent = (childrte->relkind == RELKIND_PARTITIONED_TABLE); - /* Include child's rowmark type in parent's allMarkTypes */ - oldrc->allMarkTypes |= newrc->allMarkTypes; root->rowMarks = lappend(root->rowMarks, newrc); } + /* + * No need to create AppendRelInfo for partitions at this point, + * because we don't know yet if it will actually be scanned by this + * query. 
The fact that this is a partition of the parent table + * will be recorded in the PartitionInfo created for the parent + * table. + */ + if (rel_is_partition(childOID) && + childrte->relkind != RELKIND_PARTITIONED_TABLE) + { + LeafPartitionInfo *lpinfo = makeNode(LeafPartitionInfo); + + lpinfo->reloid = childOID; + lpinfo->relid = childRTindex; + leaf_part_infos = lappend(leaf_part_infos, lpinfo); + continue; + } + + if (childrte->relkind == RELKIND_PARTITIONED_TABLE) + { + partitioned_child_rels = lappend_int(partitioned_child_rels, + childRTindex); + continue; + } + + /* + * This must be a non-partitioned child table that is not a partition. + * Build an AppendRelInfo for the same to remember the parent-child + * relationship. + */ + + /* Open rel if needed, we already have required locks */ + if (childOID != parentOID) + newrelation = heap_open(childOID, NoLock); + else + newrelation = oldrelation; + + appinfo = makeNode(AppendRelInfo); + appinfo->parent_relid = rti; + appinfo->child_relid = childRTindex; + appinfo->parent_reltype = oldrelation->rd_rel->reltype; + appinfo->child_reltype = newrelation->rd_rel->reltype; + make_inh_translation_list(oldrelation, newrelation, childRTindex, + &appinfo->translated_vars); + appinfo->parent_reloid = parentOID; + appinfos = lappend(appinfos, appinfo); + + /* + * Translate the column permissions bitmaps to the child's attnums + * (we have to build the translated_vars list before we can do + * this). But if this is the parent table, leave copyObject's + * result alone. + * + * Note: we need to do this even though the executor won't run any + * permissions checks on the child RTE. The + * insertedCols/updatedCols bitmaps may be examined for + * trigger-firing purposes. 
+ */ + if (childOID != parentOID) + { + childrte->selectedCols = translate_col_privs(rte->selectedCols, + appinfo->translated_vars); + childrte->insertedCols = translate_col_privs(rte->insertedCols, + appinfo->translated_vars); + childrte->updatedCols = translate_col_privs(rte->updatedCols, + appinfo->translated_vars); + } + /* Close child relations, but keep locks */ if (childOID != parentOID) heap_close(newrelation, NoLock); @@ -1618,35 +1676,53 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) heap_close(oldrelation, NoLock); /* - * If all the children were temp tables or a partitioned parent did not - * have any leaf partitions, pretend it's a non-inheritance situation; we - * don't need Append node in that case. The duplicate RTE we added for - * the parent table is harmless, so we don't bother to get rid of it; - * ditto for the useless PlanRowMark node. + * We keep a list of objects in root, each of which maps a partitioned + * parent RT index to a bunch of information about the partition tree + * rooted at that parent. The information includes a list of RT indexes + * of partitioned tables appearing in the tree, a list of PartitionInfo + * objects for each such partitioned table, a list of LeafPartitionInfo + * objects for each leaf partition in tree, and finally a list containing + * leaf partition OIDs in an order in which find_all_inheritors() returned + * them. The first of these is used when creating an Append or a + * ModifyTable path for the parent to be copied verbatim into the path + * (and subsequently the plan) so that it could be carried over to the + * executor. That list is the only place where the executor could find + * partitioned child tables to lock them. 
*/ - if (!has_child) + if (rte->relkind == RELKIND_PARTITIONED_TABLE) { - /* Clear flag before returning */ - rte->inh = false; + PartitionRootInfo *prinfo = makeNode(PartitionRootInfo); + + Assert(list_length(partition_infos) >= 1); + prinfo->parent_relid = rti; + /* + * Be sure to include the parent's RT index, because the above code + * didn't. + */ + prinfo->partitioned_relids = lcons_int(rti, partitioned_child_rels); + prinfo->partition_infos = partition_infos; + prinfo->leaf_part_infos = leaf_part_infos; + prinfo->orig_leaf_part_oids = orig_leaf_part_oids; + + root->prinfo_list = lappend(root->prinfo_list, prinfo); + + /* + * Our job here is done, because we didn't create any AppendRelInfos. + */ return; } /* - * We keep a list of objects in root, each of which maps a partitioned - * parent RT index to the list of RT indexes of its partitioned child - * tables. When creating an Append or a ModifyTable path for the parent, - * we copy the child RT index list verbatim to the path so that it could - * be carried over to the executor so that the latter could identify the - * partitioned child tables. + * If all the children were temp tables, pretend it's a non-inheritance + * situation; we don't need Append node in that case. The duplicate + * RTE we added for the parent table is harmless, so we don't bother to + * get rid of it; ditto for the useless PlanRowMark node. */ - if (partitioned_child_rels != NIL) + if (list_length(appinfos) < 2) { - pcinfo = makeNode(PartitionedChildRelInfo); - - Assert(rte->relkind == RELKIND_PARTITIONED_TABLE); - pcinfo->parent_relid = rti; - pcinfo->child_rels = partitioned_child_rels; - root->pcinfo_list = lappend(root->pcinfo_list, pcinfo); + /* Clear flag before returning */ + rte->inh = false; + return; } /* Otherwise, OK to add to root->append_rel_list */ @@ -1767,7 +1843,7 @@ make_inh_translation_list(Relation oldrelation, Relation newrelation, * query is really only going to reference the inherited columns. 
Instead * we set the per-column bits for all inherited columns. */ -static Bitmapset * +Bitmapset * translate_col_privs(const Bitmapset *parent_privs, List *translated_vars) { diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index a1ebd4acc8..5607a4e4e0 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -1577,6 +1577,50 @@ build_physical_tlist(PlannerInfo *root, RelOptInfo *rel) } /* + * build_rel_vars + * + * Returns a list containing Var expressions corresponding to a relation's + * non-dropped attributes, each referencing RT index 'relid'. Since the caller + * may already have the RangeTblEntry, we take it directly instead of + * PlannerInfo to avoid looking it up in the range table all over again. + */ +List * +build_rel_vars(RangeTblEntry *rte, Index relid) +{ + Relation relation; + AttrNumber attrno; + int numattrs; + List *result = NIL; + + Assert(rte->rtekind == RTE_RELATION); + + /* Assume we already have adequate lock */ + relation = heap_open(rte->relid, NoLock); + + numattrs = RelationGetNumberOfAttributes(relation); + for (attrno = 1; attrno <= numattrs; attrno++) + { + Form_pg_attribute att_tup = TupleDescAttr(relation->rd_att, + attrno - 1); + + if (att_tup->attisdropped) + continue; + + result = lappend(result, + makeVar(relid, + attrno, + att_tup->atttypid, + att_tup->atttypmod, + att_tup->attcollation, + 0)); + + } + + heap_close(relation, NoLock); + return result; +} + +/* * build_index_tlist * * Build a targetlist representing the columns of the specified index.
diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c index 8ad0b4a669..4cc32dea8d 100644 --- a/src/backend/optimizer/util/relnode.c +++ b/src/backend/optimizer/util/relnode.c @@ -16,7 +16,9 @@ #include +#include "catalog/pg_class.h" #include "miscadmin.h" +#include "nodes/relation.h" #include "optimizer/clauses.h" #include "optimizer/cost.h" #include "optimizer/pathnode.h" @@ -146,6 +148,15 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptInfo *parent) rel->baserestrict_min_security = UINT_MAX; rel->joininfo = NIL; rel->has_eclass_joins = false; + /* Set in build_simple_rel if rel is root partitioned table */ + rel->num_parted = 0; + rel->partition_infos = NULL; + rel->num_leaf_parts = 0; + rel->leaf_part_infos = NULL; + /* Set in get_rel_partitions_recurse */ + rel->live_partition_painfos = NIL; + /* Set in set_append_rel_size if rel is a partition. */ + rel->root_parent_relid = 0; /* * Pass top parent's relids down the inheritance hierarchy. If the parent @@ -210,25 +221,73 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptInfo *parent) list_length(rte->securityQuals)); /* - * If this rel is an appendrel parent, recurse to build "other rel" - * RelOptInfos for its children. They are "other rels" because they are - * not in the main join tree, but we will need RelOptInfos to plan access - * to them. + * If this rel is an appendrel parent, generate additional information + * based on whether the parent is a partitioned table or not. For + * regular parent tables, recurse to build "other rel" RelOptInfos for its + * children. They are "other rels" because they are not in the main join + * tree, but we will need RelOptInfos to plan access to them. For + * partitioned parent tables, we do not yet create "other rel" RelOptInfos + * for the children. Instead, we set up some information that will be + * used in set_append_rel_size() to look up its partitions.
*/ if (rte->inh) { ListCell *l; - foreach(l, root->append_rel_list) + if (rte->relkind == RELKIND_PARTITIONED_TABLE) { - AppendRelInfo *appinfo = (AppendRelInfo *) lfirst(l); + PartitionRootInfo *prinfo = NULL; + LeafPartitionInfo **lpinfos; + int i; + + foreach(l, root->prinfo_list) + { + prinfo = lfirst(l); + if (prinfo->parent_relid == relid) + break; + } + Assert(prinfo != NULL && prinfo->parent_relid == relid); + + rel->num_parted = list_length(prinfo->partition_infos); + rel->num_leaf_parts = list_length(prinfo->leaf_part_infos); + rel->partition_infos = (PartitionInfo **) + palloc0(rel->num_parted * + sizeof(PartitionInfo *)); + lpinfos = (LeafPartitionInfo **) palloc0(rel->num_leaf_parts * + sizeof(LeafPartitionInfo *)); + i = 0; + foreach(l, prinfo->partition_infos) + { + rel->partition_infos[i++] = lfirst(l); + } + i = 0; + foreach(l, prinfo->leaf_part_infos) + { + lpinfos[i++] = lfirst(l); + } + rel->leaf_part_infos = lpinfos; - /* append_rel_list contains all append rels; ignore others */ - if (appinfo->parent_relid != relid) - continue; + /* + * Don't build RelOptInfo for partitions yet; we don't know which + * ones we'll need. We did create RangeTblEntry's though, so we + * have an empty slot in root->simple_rel_array that will be + * filled eventually if the respective partition is chosen to be + * scanned after all. 
+ */ + } + else + { + foreach(l, root->append_rel_list) + { + AppendRelInfo *appinfo = (AppendRelInfo *) lfirst(l); + + /* append_rel_list contains all append rels; ignore others */ + if (appinfo->parent_relid != relid) + continue; - (void) build_simple_rel(root, appinfo->child_relid, - rel); + (void) build_simple_rel(root, appinfo->child_relid, + rel); + } } } diff --git a/src/backend/utils/cache/lsyscache.c b/src/backend/utils/cache/lsyscache.c index 82763f8013..ebbc3da985 100644 --- a/src/backend/utils/cache/lsyscache.c +++ b/src/backend/utils/cache/lsyscache.c @@ -1817,6 +1817,28 @@ get_rel_relkind(Oid relid) } /* + * rel_is_partition + * + * Returns true if the given relation is a partition (pg_class.relispartition); raises an error if the pg_class cache lookup fails. + */ +bool +rel_is_partition(Oid relid) +{ + HeapTuple tp; + Form_pg_class reltup; + bool result; + + tp = SearchSysCache1(RELOID, ObjectIdGetDatum(relid)); + if (!HeapTupleIsValid(tp)) + elog(ERROR, "cache lookup failed for relation %u", relid); + reltup = (Form_pg_class) GETSTRUCT(tp); + result = reltup->relispartition; + ReleaseSysCache(tp); + + return result; +} + +/* * get_rel_tablespace * * Returns the pg_tablespace OID associated with a given relation.
@@ -1865,6 +1887,34 @@ get_rel_persistence(Oid relid) return result; } +/* + * rel_is_other_temp + * + * Returns true if the relation has relpersistence RELPERSISTENCE_TEMP and its namespace fails isTempOrTempToastNamespace(), i.e. it is a temp table from another session; raises an error if the pg_class cache lookup fails. + */ +bool +rel_is_other_temp(Oid relid) +{ + HeapTuple tp; + Form_pg_class reltup; + bool result = false; + + tp = SearchSysCache1(RELOID, ObjectIdGetDatum(relid)); + if (!HeapTupleIsValid(tp)) + elog(ERROR, "cache lookup failed for relation %u", relid); + reltup = (Form_pg_class) GETSTRUCT(tp); + + if (reltup->relpersistence == RELPERSISTENCE_TEMP && + !isTempOrTempToastNamespace(reltup->relnamespace)) + { + result = true; + } + + ReleaseSysCache(tp); + + return result; +} + /* ---------- TRANSFORM CACHE ---------- */ diff --git a/src/include/catalog/partition.h b/src/include/catalog/partition.h index 7b53baf847..b5dcb22688 100644 --- a/src/include/catalog/partition.h +++ b/src/include/catalog/partition.h @@ -16,6 +16,7 @@ #include "fmgr.h" #include "executor/tuptable.h" #include "nodes/execnodes.h" +#include "nodes/relation.h" #include "parser/parse_node.h" #include "utils/rel.h" @@ -87,4 +88,7 @@ extern int get_partition_for_tuple(PartitionTupleRoutingInfo **ptrinfos, EState *estate, PartitionTupleRoutingInfo **failed_at, TupleTableSlot **failed_slot); + +/* Planner support stuff.
*/ +extern List *get_partitions_for_keys(PartitionDispatch pd); #endif /* PARTITION_H */ diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index 27bd4f3363..e957615ac6 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -260,7 +260,10 @@ typedef enum NodeTag T_PlaceHolderVar, T_SpecialJoinInfo, T_AppendRelInfo, - T_PartitionedChildRelInfo, + T_PartitionInfo, + T_LeafPartitionInfo, + T_PartitionAppendInfo, + T_PartitionRootInfo, T_PlaceHolderInfo, T_MinMaxAggInfo, T_PlannerParamItem, diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index 3ccc9d1b03..71c494a7c2 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -251,7 +251,7 @@ typedef struct PlannerInfo List *append_rel_list; /* list of AppendRelInfos */ - List *pcinfo_list; /* list of PartitionedChildRelInfos */ + List *prinfo_list; /* list of PartitionRootInfos */ List *rowMarks; /* list of PlanRowMarks */ @@ -515,6 +515,9 @@ typedef enum RelOptKind /* Is the given relation an "other" relation? */ #define IS_OTHER_REL(rel) ((rel)->reloptkind == RELOPT_OTHER_MEMBER_REL) +typedef struct PartitionInfo PartitionInfo; +typedef struct LeafPartitionInfo LeafPartitionInfo; + typedef struct RelOptInfo { NodeTag type; @@ -592,6 +595,23 @@ typedef struct RelOptInfo /* used by "other" relations */ Relids top_parent_relids; /* Relids of topmost parents */ + + /* Fields set for "root" partitioned relations */ + int num_parted; /* Number of entries in partition_infos */ + PartitionInfo **partition_infos; /* Array of PartitionInfo pointers */ + int num_leaf_parts; /* Number of entries in leaf_part_infos */ + LeafPartitionInfo **leaf_part_infos; /* LeafPartitionInfos */ + + /* Fields set for partitioned relations (list of PartitionAppendInfo's) */ + List *live_partition_painfos; + + /* Fields set for partition otherrels */ + + /* + * RT index of the root partitioned table in the partition tree of + * which this rel is a member.
+ */ + Index root_parent_relid; } RelOptInfo; /* @@ -2012,24 +2032,73 @@ typedef struct AppendRelInfo Oid parent_reloid; /* OID of parent relation */ } AppendRelInfo; +/* Forward declarations, to avoid including other headers */ +typedef struct PartitionDispatchData *PartitionDispatch; + +/* + * PartitionInfo - information about partitioning of one partitioned table in + * a given partition tree + */ +typedef struct PartitionInfo +{ + NodeTag type; + + Index relid; /* Ordinal position in the rangetable */ + PartitionDispatch pd; /* Information about partitions */ +} PartitionInfo; + +/* + * LeafPartitionInfo - (OID, RT index) pair for one leaf partition + * + * Created when a leaf partition's RT entry is created in + * expand_inherited_rtentry(). + */ +typedef struct LeafPartitionInfo +{ + NodeTag type; + + Oid reloid; /* OID of the leaf partition */ + Index relid; /* RT index of the leaf partition's RT entry */ +} LeafPartitionInfo; + /* - * For a partitioned table, this maps its RT index to the list of RT indexes - * of the partitioned child tables in the partition tree. We need to - * separately store this information, because we do not create AppendRelInfos - * for the partitioned child tables of a parent table, since AppendRelInfos - * contain information that is unnecessary for the partitioned child tables. - * The child_rels list must contain at least one element, because the parent - * partitioned table is itself counted as a child. + * PartitionAppendInfo - list of child RT indexes for one partitioned table + * in a given partition tree + */ +typedef struct PartitionAppendInfo +{ + NodeTag type; + + Index parent_relid; /* presumably RT index of the partitioned table; TODO confirm */ + List *live_partition_relids; /* List of RT indexes */ +} PartitionAppendInfo; + +/* + * For a partitioned table, this maps its RT index to the information about + * the partition tree collected in expand_inherited_rtentry(). + * + * That information includes a list of PartitionInfo nodes, one for each + * partitioned table in the partition tree, including for the table itself.
+ * Also included is a list of RT indexes of the entries for leaf partitions + * that are created at the same time by expand_inherited_rtentry(). + * + * orig_leaf_part_oids contains the list of leaf partition OIDs as it was + * generated by find_all_inheritors(). We keep it around so that we can + * lock leaf partitions in that order when we actually do it. * - * These structs are kept in the PlannerInfo node's pcinfo_list. + * PartitionRootInfo's for different partitioned tables in a query are placed + * in root->prinfo_list. */ -typedef struct PartitionedChildRelInfo +typedef struct PartitionRootInfo { NodeTag type; Index parent_relid; - List *child_rels; -} PartitionedChildRelInfo; + List *partition_infos; /* list of PartitionInfo nodes */ + List *partitioned_relids; /* integer list of RT indexes of partitioned tables in the tree, parent first */ + List *leaf_part_infos; /* list of LeafPartitionInfo nodes */ + List *orig_leaf_part_oids; /* leaf partition OIDs, in find_all_inheritors() order */ +} PartitionRootInfo; /* * For each distinct placeholder expression generated during planning, we diff --git a/src/include/optimizer/plancat.h b/src/include/optimizer/plancat.h index 71f0faf938..1e18f609b1 100644 --- a/src/include/optimizer/plancat.h +++ b/src/include/optimizer/plancat.h @@ -39,6 +39,7 @@ extern bool relation_excluded_by_constraints(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte); extern List *build_physical_tlist(PlannerInfo *root, RelOptInfo *rel); +extern List *build_rel_vars(RangeTblEntry *rte, Index relid); extern bool has_unique_index(RelOptInfo *rel, AttrNumber attno); diff --git a/src/include/optimizer/prep.h b/src/include/optimizer/prep.h index 4be0afd566..d0af8dc7bc 100644 --- a/src/include/optimizer/prep.h +++ b/src/include/optimizer/prep.h @@ -16,6 +16,7 @@ #include "nodes/plannodes.h" #include "nodes/relation.h" +#include "utils/rel.h" /* @@ -51,6 +52,8 @@ extern PlanRowMark *get_plan_rowmark(List *rowmarks, Index rtindex); extern RelOptInfo *plan_set_operations(PlannerInfo *root); extern void expand_inherited_tables(PlannerInfo *root); +extern Bitmapset *translate_col_privs(const Bitmapset *parent_privs, + List *translated_vars); extern
Node *adjust_appendrel_attrs(PlannerInfo *root, Node *node, int nappinfos, AppendRelInfo **appinfos); diff --git a/src/include/utils/lsyscache.h b/src/include/utils/lsyscache.h index 07208b56ce..b5b615a6fa 100644 --- a/src/include/utils/lsyscache.h +++ b/src/include/utils/lsyscache.h @@ -126,8 +126,10 @@ extern char *get_rel_name(Oid relid); extern Oid get_rel_namespace(Oid relid); extern Oid get_rel_type_id(Oid relid); extern char get_rel_relkind(Oid relid); +extern bool rel_is_partition(Oid relid); extern Oid get_rel_tablespace(Oid relid); extern char get_rel_persistence(Oid relid); +extern bool rel_is_other_temp(Oid relid); extern Oid get_transform_fromsql(Oid typid, Oid langid, List *trftypes); extern Oid get_transform_tosql(Oid typid, Oid langid, List *trftypes); extern bool get_typisdefined(Oid typid); diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out index a2d9469592..e159d62b66 100644 --- a/src/test/regress/expected/insert.out +++ b/src/test/regress/expected/insert.out @@ -278,12 +278,12 @@ select tableoid::regclass, * from list_parted; -------------+----+---- part_aa_bb | aA | part_cc_dd | cC | 1 - part_null | | 0 - part_null | | 1 part_ee_ff1 | ff | 1 part_ee_ff1 | EE | 1 part_ee_ff2 | ff | 11 part_ee_ff2 | EE | 10 + part_null | | 0 + part_null | | 1 (8 rows) -- some more tests to exercise tuple-routing with multi-level partitioning -- 2.11.0