Database Patterns: TF-IDF Text Search in Postgres

Tuesday, August 26, 2014

TF-IDF Text Search in Postgres


/* see http://blog.databasepatterns.com/2014/07/postgresql-install-smlar-extension.html */
create extension if not exists smlar;

/* your basic table to search against: */
create table documents (
  document_id int primary key,
  body text not null
);


/* 
   I created 100,000 "lorem ipsum" documents here http://www.mockaroo.com/c5418bd0
   In retrospect, not a great choice due to the small number of unique words used to generate the dataset
*/
copy documents 
from program 'curl "http://www.mockaroo.com/c5418bd0/download?count=100000&key=5b15a410"' 
with (format csv, header true);


/* this table holds document frequencies (# of docs in which a term appears) for the documents.body column: */
create table documents_body_stats (
  value text unique,
  ndoc int not null
);


/* used ts_stat for convenience, not ideal, but good for quick n dirty: */
insert into documents_body_stats
  select
    word,
    ndoc
  from 
    ts_stat( 'select to_tsvector(''simple'', body) from documents' );


/* the smlar tfdif table needs the total document count as well. It's added as a row with null in the value column: */
insert into documents_body_stats values 
  (null, (select count(*) from documents) );


/* turn documents into array of words. you could also use tsvector2textarray( to_tsvector(...) ) : */
create or replace function tokenize(text) returns text[] language sql strict immutable as $$
 select regexp_split_to_array( lower($1), '[^[:alnum:]]' );
$$;


/* use smlar's text array opclass. gist is a little more flexible than gin in this case (allows 'n' tf_method): */
create index on documents using gist ( tokenize(body) _text_sml_ops ); --24 seconds


/* setup smlar: */
set smlar.type = 'tfidf';
set smlar.stattable = 'documents_body_stats';
set smlar.tf_method = 'n';
set smlar.threshold = 0.4;


/* the query */
select
  *,
  smlar( tokenize(body), '{fringilla,malesuada,euismod}'::text[] )
from
  documents
where
  tokenize(body) % '{fringilla,malesuada,euismod}'::text[] --where TFIDF similarity >= smlar.threshold
order by
  smlar( tokenize(body), '{fringilla,malesuada,euismod}'::text[] ) desc
limit 10;

5 comments:

Rodolphe Quiédeville said...: This comment has been removed by the author.; January 16, 2016 at 8:21 AM
aliyaa said...: We must know how to write company profilewhen we have different profiles to write.This is really incredible to have.; January 8, 2017 at 2:26 AM
Zlatko Matić said...: Thank you very much for this post!
smlar is, unfortunately, poorly documented and your post is very valuable.; April 13, 2017 at 2:42 AM
Zlatko Matić said...: In your example you put an index on the documents table, but no index on the stats table.
Would adding an index on the stats table make any sense?; April 13, 2017 at 4:44 AM
Neil McGuigan said...: @Zltako, yes you're right. An index on the "value" column might speed things up. Thanks; April 13, 2017 at 10:57 AM

Database Patterns

Pages

Tuesday, August 26, 2014

TF-IDF Text Search in Postgres

5 comments:

Blog Archive