Tuesday, August 26, 2014

TF-IDF Text Search in Postgres


/* see http://blog.databasepatterns.com/2014/07/postgresql-install-smlar-extension.html */
create extension if not exists smlar;

/* your basic table to search against: */
create table documents (
  document_id int primary key,
  body text not null
);


/* 
   I created 100,000 "lorem ipsum" documents here http://www.mockaroo.com/c5418bd0
   In retrospect, not a great choice due to the small number of unique words used to generate the dataset
*/
copy documents 
from program 'curl "http://www.mockaroo.com/c5418bd0/download?count=100000&key=5b15a410"' 
with (format csv, header true);


/* this table holds document frequencies (# of docs in which a term appears) for the documents.body column: */
create table documents_body_stats (
  value text unique,
  ndoc int not null
);


/* used ts_stat for convenience, not ideal, but good for quick n dirty: */
insert into documents_body_stats
  select
    word,
    ndoc
  from 
    ts_stat( 'select to_tsvector(''simple'', body) from documents' );


/* the smlar tfdif table needs the total document count as well. It's added as a row with null in the value column: */
insert into documents_body_stats values 
  (null, (select count(*) from documents) );


/* turn documents into array of words. you could also use tsvector2textarray( to_tsvector(...) ) : */
create or replace function tokenize(text) returns text[] language sql strict immutable as $$
 select regexp_split_to_array( lower($1), '[^[:alnum:]]' );
$$;


/* use smlar's text array opclass. gist is a little more flexible than gin in this case (allows 'n' tf_method): */
create index on documents using gist ( tokenize(body) _text_sml_ops ); --24 seconds


/* setup smlar: */
set smlar.type = 'tfidf';
set smlar.stattable = 'documents_body_stats';
set smlar.tf_method = 'n';
set smlar.threshold = 0.4;


/* the query */
select
  *,
  smlar( tokenize(body), '{fringilla,malesuada,euismod}'::text[] )
from
  documents
where
  tokenize(body) % '{fringilla,malesuada,euismod}'::text[] --where TFIDF similarity >= smlar.threshold
order by
  smlar( tokenize(body), '{fringilla,malesuada,euismod}'::text[] ) desc
limit 10;

5 comments:

Rodolphe Quiédeville said...
This comment has been removed by the author.
aliyaa said...

We must know how to write company profilewhen we have different profiles to write.This is really incredible to have.

Zlatko Matić said...

Thank you very much for this post!
smlar is, unfortunately, poorly documented and your post is very valuable.

Zlatko Matić said...

In your example you put an index on the documents table, but no index on the stats table.
Would adding an index on the stats table make any sense?

Neil McGuigan said...

@Zltako, yes you're right. An index on the "value" column might speed things up. Thanks