Per-document document statistics

From: Berend Tober <btober(at)computer(dot)org>
To: Postgres General <pgsql-general(at)postgresql(dot)org>
Subject: Per-document document statistics
Date: 2018-05-23 03:14:20
Message-ID: dbd69022-c4b8-c4d8-8602-5de5bcb18d17@computer.org
Views: Raw Message | Whole Thread | Download mbox | Resend email
Thread:
Lists: pgsql-general


I'm trying to figure out how to compile text search statistics on a per-document basis.
While I successfully compute text search statistics for the entire corpus with a call
to ts_stat after having inserted all documents, what I also want is to run ts_stat on
the tsvector for each row so as to get the term frequency per document.

Sample code and comments follow.

-- Dumped from database version 9.5.7

CREATE DATABASE nlp;

\connect nlp

CREATE EXTENSION IF NOT EXISTS plpgsql WITH SCHEMA pg_catalog;

SET search_path = public, pg_catalog;

-- This table stores one document per row

CREATE TABLE document (
document_id serial primary key,
content text NOT NULL,
document_vector tsvector
);

-- This is the table I need help with how to to populate
-- with term frequency per document

CREATE TABLE document_statistics (
document_id integer primary key,
word text,
ndoc bigint, /* this will be one, since there is only one document */
nentry bigint /* this is the number of interest */
);

ALTER TABLE ONLY document_statistics
ADD CONSTRAINT document_statistics_document_id_fkey
FOREIGN KEY (document_id)
REFERENCES document(document_id);

CREATE FUNCTION document_bit() RETURNS trigger
LANGUAGE plpgsql
AS $$
BEGIN
-- Compile document statistics for each document upon insert

SELECT to_tsvector('simple', new.content) INTO new.document_vector;
RETURN new;
END;
$$;

CREATE TRIGGER document_bit
BEFORE INSERT OR UPDATE ON document
FOR EACH ROW EXECUTE PROCEDURE document_bit();

-- Sample data

INSERT INTO document (content) VALUES ('Hello World!');
INSERT INTO document (content) VALUES ('The quick brown dog jumped over the lazy dog');
INSERT INTO document (content) VALUES ('One flew over the coo coo''s nest',);

-- Once all the individual documents are inserted, then
-- calculate overall corpus statistics

insert into corpus_statistics select * from ts_stat('select document_vector from document');

-- I'm thinking something like this proposed after insert trigger
-- is where I want to compute document statistics, but can't
-- figure out how to make it work

CREATE FUNCTION document_ait() RETURNS trigger
LANGUAGE plpgsql
AS $$
DECLARE
word_stat record;
BEGIN
/*
--Here's one pathetic try

FOR word_stat IN
select * from ts_stat('select * from ' || (new.document_vector))
LOOP
RAISE NOTICE '%' , word_stat;
INSERT INTO public.document_statistics(
document_id, word, ndoc, nentry)
VALUES (new.document_id, word_stat.word, word_stat.ndoc, word_stat.nentry);
END LOOP;
*/
RETURN new;
END;
$$;

CREATE TRIGGER document_ait AFTER INSERT ON document FOR EACH ROW EXECUTE PROCEDURE document_ait();

Browse pgsql-general by date

  From Date Subject
Next Message Jayadevan M 2018-05-23 05:15:26 PostgreSQL backup issue
Previous Message Kyotaro HORIGUCHI 2018-05-23 01:34:09 Re: Errors with physical replication