Skip site navigation (1) Skip section navigation (2)

Peripheral Links

Header And Logo

PostgreSQL
| The world's most advanced open source database.

Site Navigation

Search archives
  Advanced Search

Re: gsoc, oprrest function for text search take 2


  • From: Jan Urbański <j(dot)urbanski(at)students(dot)mimuw(dot)edu(dot)pl>
  • To: Heikki Linnakangas <heikki(at)enterprisedb(dot)com>
  • Cc: Postgres - Hackers <pgsql-hackers(at)postgresql(dot)org>
  • Subject: Re: gsoc, oprrest function for text search take 2
  • Date: Wed, 30 Jul 2008 00:17:09 +0200
  • Message-id: <488F96E5.4020308@students.mimuw.edu.pl> <text/plain>

Heikki Linnakangas wrote:
Jan Urbański wrote:
Another thing are cstring_to_text_with_len calls. I'm doing them so I can use bttextcmp in bsearch(). I think I could come up with a dedicated function to return text Datums and WordEntries (read: non-NULL terminated strings with a given length).

Just keep them as cstrings and use strcmp. We're only keeping the array sorted so that we can binary search it, so we don't need proper locale-dependent collation. Note that we already assume that two strings ('text's) are equal if and only if their binary representations are equal (texteq() uses strcmp).

OK, I got rid of cstring->text calls and memory contexts as I went through it. The only tiny ugliness is that there's one function used for qsort() and another for bsearch(), because I'm sorting an array of texts (from pg_statistic) and I'm binary searching for a lexeme (non-NULL terminated string with length).
My medicore gprof skills got me:
                0.00    0.22       5/5           OidFunctionCall4 [37]
[38]    28.4    0.00    0.22       5         tssel [38]
0.00 0.17 5/5 get_restriction_variable [40]
                0.03    0.01       5/10          pg_qsort [60]
                0.00    0.00       5/5           get_attstatsslot [139]

Hopefully that says that the qsort() overhead is small compared to munging through the planner Node.
Revised version of the patch attached.

Cheers,
Jan

--
Jan Urbanski
GPG key ID: E583D7D2

ouden estin
diff --git a/src/backend/tsearch/Makefile b/src/backend/tsearch/Makefile
index e20a4a2..ba728eb 100644
*** a/src/backend/tsearch/Makefile
--- b/src/backend/tsearch/Makefile
***************
*** 19,25 ****
  OBJS = ts_locale.o ts_parse.o wparser.o wparser_def.o dict.o \
  	dict_simple.o dict_synonym.o dict_thesaurus.o \
  	dict_ispell.o regis.o spell.o \
! 	to_tsany.o ts_typanalyze.o ts_utils.o
  
  include $(top_srcdir)/src/backend/common.mk
  
--- 19,25 ----
  OBJS = ts_locale.o ts_parse.o wparser.o wparser_def.o dict.o \
  	dict_simple.o dict_synonym.o dict_thesaurus.o \
  	dict_ispell.o regis.o spell.o \
! 	to_tsany.o ts_typanalyze.o ts_selfuncs.o ts_utils.o
  
  include $(top_srcdir)/src/backend/common.mk
  
diff --git a/src/backend/tsearch/ts_selfuncs.c b/src/backend/tsearch/ts_selfuncs.c
new file mode 100644
index 0000000..4deb507
*** /dev/null
--- b/src/backend/tsearch/ts_selfuncs.c
***************
*** 0,-1 ****
--- 1,334 ----
+ /*-------------------------------------------------------------------------
+  *
+  * ts_selfuncs.c
+  *	  Selectivity functions for text search types.
+  *
+  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
+  *
+  *
+  * IDENTIFICATION
+  *	  $PostgreSQL$
+  *
+  *-------------------------------------------------------------------------
+  */
+ #include "postgres.h"
+ 
+ #include "miscadmin.h" /* for check_stack_depth() */
+ #include "utils/memutils.h"
+ #include "utils/builtins.h"
+ #include "utils/syscache.h"
+ #include "utils/lsyscache.h"
+ #include "utils/selfuncs.h"
+ #include "catalog/pg_type.h"
+ #include "catalog/pg_statistic.h"
+ #include "nodes/nodes.h"
+ #include "tsearch/ts_type.h"
+ 
+ /* lookup table type for binary searching through MCELEMs */
+ typedef struct
+ {
+ 	Datum	element;
+ 	float4	frequency;
+ } TextFreq;
+ 
+ /* type of keys for bsearch()ing through an array of TextFreqs  */
+ typedef struct
+ {
+ 	char	*lexeme;
+ 	int		length;
+ } LexemeKey;
+ 
+ static int
+ compare_two_textfreqs(const void *e1, const void *e2);
+ static int
+ compare_lexeme_textfreq(const void *e1, const void *e2);
+ 
+ static Selectivity
+ tsquery_opr_selec(QueryItem *item, char *operand, TextFreq *lookup,
+ 				  int length, float4 minfreq);
+ static Selectivity
+ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
+ 						   float4 *numbers, int nnumbers);
+ static double
+ tsquerysel(VariableStatData *vardata, Datum constval);
+ 
+ 
+ /* TSQuery traversal function */
+ static Selectivity
+ tsquery_opr_selec(QueryItem *item, char *operand, TextFreq *lookup,
+ 				  int length, float4 minfreq)
+ {
+ 	LexemeKey	key;
+ 	TextFreq	*searchres;
+ 	Selectivity	s1, s2;
+ 
+ 	/* since this function recurses, it could be driven to stack overflow */
+ 	check_stack_depth();
+ 
+ 	if (item->type == QI_VAL)
+ 	{
+ 		QueryOperand *oper = (QueryOperand *) item;
+ 
+ 		/*
+ 		 * Prepare the key for bsearch().
+ 		 */
+ 		key.lexeme = operand + oper->distance;
+ 		key.length = oper->length;
+ 
+ 		searchres = (TextFreq *) bsearch(&key, lookup, length,
+ 										 sizeof(TextFreq), compare_lexeme_textfreq);
+ 		if (searchres)
+ 		{
+ 			/*
+ 			 * The element is in MCELEM. Return precise selectivity (or at
+ 			 * least as precise, as ANALYZE could find out).
+ 			 */
+ 			return (Selectivity) searchres->frequency;
+ 		}
+ 		else
+ 		{
+ 			/*
+ 			 * The element is not in MCELEM. Punt, but  assure that the
+ 			 * selectivity cannot be more than minfreq / 2.
+ 			 */
+ 			return (Selectivity) Min(DEFAULT_TS_SEL, minfreq / 2);
+ 		}
+ 	}
+ 
+ 	/* Current TSQuery node is an operator */
+ 	switch (item->operator.oper)
+ 	{
+ 		case OP_NOT:
+ 			return 1.0 - tsquery_opr_selec(item + 1, operand, lookup,
+ 										   length, minfreq);
+ 
+ 		case OP_AND:
+ 			return
+ 				tsquery_opr_selec(item + 1, operand, lookup, length, minfreq) *
+ 				tsquery_opr_selec(item + item->operator.left, operand, lookup,
+ 								  length, minfreq);
+ 
+ 		case OP_OR:
+ 			s1 = tsquery_opr_selec(item + 1, operand, lookup, length, minfreq);
+ 			s2 = tsquery_opr_selec(item + item->operator.left, operand, lookup,
+ 								   length, minfreq);
+ 			return s1 + s2 - s1 * s2;
+ 
+ 		default:
+ 			elog(ERROR, "unrecognized operator: %d", item->operator.oper);
+ 	}
+ 
+ 	/* never reached, keep compiler happy */
+ 	return (Selectivity) DEFAULT_TS_SEL;
+ }
+ 
+ /*
+  * Traverse the tsquery preorder, calculating selectivity as:
+  *
+  *   selec(left_oper) * selec(right_oper) in AND nodes,
+  *
+  *   selec(left_oper) + selec(right_oper) -
+  *      selec(left_oper) * selec(right_oper) in OR nodes,
+  *
+  *   1 - select(oper) in NOT nodes
+  *
+  *   freq[val] in VAL nodes, if the value is in MCELEM
+  *   min(freq[MCELEM]) / 2 in VAL nodes, if it is not
+  *
+  *
+  * Implementation-wise, we sort the MCELEM array to use binary
+  * search on it.
+  */
+ static Selectivity
+ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
+ 						   float4 *numbers, int nnumbers)
+ {
+ 	float4			minfreq;
+ 	TextFreq		*lookup;
+ 	Selectivity		selec;
+ 	int				i;
+ 
+ 	/* Grab the lowest frequency */
+ 	minfreq = numbers[nnumbers - 1];
+ 
+ 	/* Construct the array to be qsort()'ed */
+ 	Assert(nmcelem == nnumbers);
+ 	lookup = (TextFreq *) palloc(sizeof(TextFreq) * nmcelem);
+ 	for (i = 0; i < nmcelem; i++)
+ 	{
+ 		lookup[i].element = mcelem[i];
+ 		lookup[i].frequency = numbers[i];
+ 	}
+ 
+ 	qsort(lookup, nmcelem, sizeof(TextFreq), compare_two_textfreqs);
+ 
+ 	selec = tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), lookup,
+ 							  nmcelem, minfreq);
+ 
+ 	pfree(lookup);
+ 
+ 	return selec;
+ }
+ 
+ /*
+  * qsort() comparator for TextFreqs array. Sorts on the text datum.
+  * The comparision is byte-by-byte, because we already consider text datums
+  * equal only if their binary representations are equal and we don't really
+  * care about the result ordering, as long as it's the same as the one
+  * assumed by the bsearch() comparator.
+  */
+ static int
+ compare_two_textfreqs(const void *e1, const void *e2)
+ {
+ 	const TextFreq *t1 = (const TextFreq *) e1;
+ 	const TextFreq *t2 = (const TextFreq *) e2;
+ 
+ 	return DatumGetInt32(DirectFunctionCall2(bttext_pattern_cmp,
+ 											 t1->element, t2->element));
+ }
+ 
+ /*
+  * bsearch() comparator for a lexeme (non-NULL terminated string with length)
+  * and a TextFreq. Use byte-for-byte comparision, because that's the one we
+  * qsort()ed on.
+  */
+ static int
+ compare_lexeme_textfreq(const void *e1, const void *e2)
+ {
+ 	const LexemeKey	*key;
+ 	const TextFreq	*t;
+ 	text			*element;
+ 	int				result;
+ 	int				len1,
+ 					len2;
+ 
+ 	key = (const LexemeKey *) e1;
+ 	t = (const TextFreq *) e2;
+ 	element = DatumGetTextP(t->element);
+ 
+ 	len1 = key->length;
+ 	len2 = VARSIZE_ANY_EXHDR(element);
+ 
+ 	result = strncmp(key->lexeme, VARDATA_ANY(element), Min(len1, len2));
+ 
+ 	if (result != 0)
+ 		return result;
+ 	else if (len1 < len2)
+ 		return -1;
+ 	else if (len1 > len2)
+ 		return 1;
+ 	else
+ 		return 0;
+ }
+ 
+ /*
+  *	tssel -- Selectivity of "@@"
+  */
+ Datum
+ tssel(PG_FUNCTION_ARGS)
+ {
+ 	PlannerInfo		*root = (PlannerInfo *) PG_GETARG_POINTER(0);
+ 	/* We skip arg #2, which is the operator OID - it's gonna be "@@" */
+ 	List			*args = (List *) PG_GETARG_POINTER(2);
+ 	int				varRelid = PG_GETARG_INT32(3);
+ 	VariableStatData vardata;
+ 	Node			*other;
+ 	bool			varonleft;
+ 	Selectivity		selec;
+ 
+ 	/*
+ 	 * If expression is not variable op something or something op variable,
+ 	 * then punt and return a default estimate.
+ 	 */
+ 	if (!get_restriction_variable(root, args, varRelid,
+ 								  &vardata, &other, &varonleft))
+ 	{
+ 		PG_RETURN_FLOAT8(DEFAULT_TS_SEL);
+ 	}
+ 
+ 	/*
+ 	 * Can't do anything useful if the something is not a constant, either.
+ 	 */
+ 	if (!IsA(other, Const))
+ 	{
+ 		ReleaseVariableStats(vardata);
+ 		PG_RETURN_FLOAT8(DEFAULT_TS_SEL);
+ 	}
+ 
+ 	/* The "@@" operator is strict, so might cope with NULL right away */
+ 	if (((Const *) other)->constisnull) {
+ 		ReleaseVariableStats(vardata);
+ 		PG_RETURN_FLOAT8((float8) 0.0);
+ 	}
+ 
+ 	/*
+ 	 * OK, there's a Var and a Const we're dealing with here. We need the Var
+ 	 * to be a TSVector (or else we don't have any useful statistic for it).
+ 	 */
+ 
+ 	if (vardata.vartype == TSVECTOROID)
+ 	{
+ 		/* tsvector @@ tsquery or the other way around */
+ 		Assert(((Const *) other)->consttype == TSQUERYOID);
+ 
+ 		selec = tsquerysel(&vardata, ((Const *) other)->constvalue);
+ 	}
+ 	else
+ 	{
+ 		/* The Var is something we don't have useful statistic for */
+ 		selec = DEFAULT_TS_SEL;
+ 	}
+ 
+ 	ReleaseVariableStats(vardata);
+ 
+ 	PG_RETURN_FLOAT8((float8) selec);
+ }
+ 
+ static Selectivity
+ tsquerysel(VariableStatData *vardata, Datum constval)
+ {
+ 	Selectivity			selec;
+ 
+ 	if (HeapTupleIsValid(vardata->statsTuple))
+ 	{
+ 		TSQuery				query;
+ 		Form_pg_statistic	stats;
+ 		Datum				*values;
+ 		int					nvalues;
+ 		float4				*numbers;
+ 		int					nnumbers;
+ 
+ 		/* The caller made sure the const is a TSQuery, so get it now */
+ 		query = DatumGetTSQuery(constval);
+ 
+ 		stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
+ 
+ 		/* MCELEM will be an array of Text elements for a tsvector column */
+ 		if (get_attstatsslot(vardata->statsTuple,
+ 							 TEXTOID, -1,
+ 							 STATISTIC_KIND_MCELEM, InvalidOid,
+ 							 &values, &nvalues,
+ 							 &numbers, &nnumbers))
+ 		{
+ 			/*
+ 			 * There is a most-common-elements slot for the tsvector Var, so
+ 			 * use that.
+ 			 */
+ 
+ 			selec = mcelem_tsquery_selec(query, values, nvalues,
+ 										 numbers, nnumbers);
+ 			free_attstatsslot(TEXTOID, values, nvalues, numbers, nnumbers);
+ 		}
+ 		else
+ 		{
+ 			/* No most-common-elements slot */
+ 			selec = (Selectivity) DEFAULT_TS_SEL;
+ 		}
+ 	}
+ 	else
+ 	{
+ 		selec = (Selectivity) DEFAULT_TS_SEL;
+ 	}
+ 
+ 	return selec;
+ }
diff --git a/src/include/catalog/pg_operator.h b/src/include/catalog/pg_operator.h
index 1c7f37d..451805d 100644
*** a/src/include/catalog/pg_operator.h
--- b/src/include/catalog/pg_operator.h
***************
*** 915,924 ****
  DATA(insert OID = 3631 (  ">="	   PGNSP PGUID b f f 3614	 3614	 16 3628 3627	 tsvector_ge scalargtsel scalargtjoinsel ));
  DATA(insert OID = 3632 (  ">"	   PGNSP PGUID b f f 3614	 3614	 16 3627 3628	 tsvector_gt scalargtsel scalargtjoinsel ));
  DATA(insert OID = 3633 (  "||"	   PGNSP PGUID b f f 3614	 3614	 3614  0	0	 tsvector_concat   -	-	  ));
! DATA(insert OID = 3636 (  "@@"	   PGNSP PGUID b f f 3614	 3615	 16 3637	0	 ts_match_vq   contsel	   contjoinsel	 ));
! DATA(insert OID = 3637 (  "@@"	   PGNSP PGUID b f f 3615	 3614	 16 3636	0	 ts_match_qv   contsel	   contjoinsel	 ));
! DATA(insert OID = 3660 (  "@@@"    PGNSP PGUID b f f 3614	 3615	 16 3661	0	 ts_match_vq   contsel	   contjoinsel	 ));
! DATA(insert OID = 3661 (  "@@@"    PGNSP PGUID b f f 3615	 3614	 16 3660	0	 ts_match_qv   contsel	   contjoinsel	 ));
  DATA(insert OID = 3674 (  "<"	   PGNSP PGUID b f f 3615	 3615	 16 3679 3678	 tsquery_lt scalarltsel scalarltjoinsel ));
  DATA(insert OID = 3675 (  "<="	   PGNSP PGUID b f f 3615	 3615	 16 3678 3679	 tsquery_le scalarltsel scalarltjoinsel ));
  DATA(insert OID = 3676 (  "="	   PGNSP PGUID b t f 3615	 3615	 16 3676 3677	 tsquery_eq eqsel eqjoinsel ));
--- 915,924 ----
  DATA(insert OID = 3631 (  ">="	   PGNSP PGUID b f f 3614	 3614	 16 3628 3627	 tsvector_ge scalargtsel scalargtjoinsel ));
  DATA(insert OID = 3632 (  ">"	   PGNSP PGUID b f f 3614	 3614	 16 3627 3628	 tsvector_gt scalargtsel scalargtjoinsel ));
  DATA(insert OID = 3633 (  "||"	   PGNSP PGUID b f f 3614	 3614	 3614  0	0	 tsvector_concat   -	-	  ));
! DATA(insert OID = 3636 (  "@@"	   PGNSP PGUID b f f 3614	 3615	 16 3637	0	 ts_match_vq   tssel	   contjoinsel	 ));
! DATA(insert OID = 3637 (  "@@"	   PGNSP PGUID b f f 3615	 3614	 16 3636	0	 ts_match_qv   tssel	   contjoinsel	 ));
! DATA(insert OID = 3660 (  "@@@"    PGNSP PGUID b f f 3614	 3615	 16 3661	0	 ts_match_vq   tssel	   contjoinsel	 ));
! DATA(insert OID = 3661 (  "@@@"    PGNSP PGUID b f f 3615	 3614	 16 3660	0	 ts_match_qv   tssel	   contjoinsel	 ));
  DATA(insert OID = 3674 (  "<"	   PGNSP PGUID b f f 3615	 3615	 16 3679 3678	 tsquery_lt scalarltsel scalarltjoinsel ));
  DATA(insert OID = 3675 (  "<="	   PGNSP PGUID b f f 3615	 3615	 16 3678 3679	 tsquery_le scalarltsel scalarltjoinsel ));
  DATA(insert OID = 3676 (  "="	   PGNSP PGUID b t f 3615	 3615	 16 3676 3677	 tsquery_eq eqsel eqjoinsel ));
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index 16ccb55..ef24f40 100644
*** a/src/include/catalog/pg_proc.h
--- b/src/include/catalog/pg_proc.h
***************
*** 4321,4326 ****
--- 4321,4328 ----
  DATA(insert OID = 3701 (  gtsquery_consistent			PGNSP PGUID 12 1 0 0 f f t f i 5 16 "2281 2281 23 26 2281" _null_ _null_ _null_ gtsquery_consistent _null_ _null_ _null_ ));
  DESCR("GiST tsquery support");
  
+ DATA(insert OID = 3687 (  tssel	PGNSP PGUID 12 1 0 0 f f t f s 4 701 "2281 26 2281 23" _null_ _null_ _null_ tssel _null_ _null_ _null_ ));
+ DESCR("restriction selectivity of tsvector @@ tsquery");
  DATA(insert OID = 3688 (  ts_typanalyze	PGNSP PGUID 12 1 0 0 f f t f s 1 16 "2281" _null_ _null_ _null_ ts_typanalyze _null_ _null_ _null_ ));
  DESCR("tsvector typanalyze");
  
diff --git a/src/include/tsearch/ts_type.h b/src/include/tsearch/ts_type.h
index 06ca1f9..b253327 100644
*** a/src/include/tsearch/ts_type.h
--- b/src/include/tsearch/ts_type.h
***************
*** 155,160 ****
--- 155,162 ----
  
  extern Datum ts_typanalyze(PG_FUNCTION_ARGS);
  
+ extern Datum tssel(PG_FUNCTION_ARGS);
+ 
  
  /*
   * TSQuery
***************
*** 291,294 ****
--- 293,309 ----
  extern Datum tsq_mcontains(PG_FUNCTION_ARGS);
  extern Datum tsq_mcontained(PG_FUNCTION_ARGS);
  
+ /*
+  * The default text search selectivity is chosen to be smll enough to
+  * encourage indexscans for typical table densities. See selfuncs.h and
+  * DEFAULT_EQ_SEL for details.
+  */
+ #define DEFAULT_TS_SEL 0.005
+ 
+ /*
+  * operator restriction function for tsvector @@ tsquery and
+  * tsquery @@ tsvector
+  */
+ extern Datum tssel(PG_FUNCTION_ARGS);
+ 
  #endif   /* _PG_TSTYPE_H_ */


Home | Main Index | Thread Index

Privacy Policy | About PostgreSQL
Copyright © 1996 – 2012 PostgreSQL Global Development Group