/* see http://blog.databasepatterns.com/2014/07/postgresql-install-smlar-extension.html */ create extension if not exists smlar; /* your basic table to search against: */ create table documents ( document_id int primary key, body text not null ); /* I created 100,000 "lorem ipsum" documents here http://www.mockaroo.com/c5418bd0 In retrospect, not a great choice due to the small number of unique words used to generate the dataset */ copy documents from program 'curl "http://www.mockaroo.com/c5418bd0/download?count=100000&key=5b15a410"' with (format csv, header true); /* this table holds document frequencies (# of docs in which a term appears) for the documents.body column: */ create table documents_body_stats ( value text unique, ndoc int not null ); /* used ts_stat for convenience, not ideal, but good for quick n dirty: */ insert into documents_body_stats select word, ndoc from ts_stat( 'select to_tsvector(''simple'', body) from documents' ); /* the smlar tfdif table needs the total document count as well. It's added as a row with null in the value column: */ insert into documents_body_stats values (null, (select count(*) from documents) ); /* turn documents into array of words. you could also use tsvector2textarray( to_tsvector(...) ) : */ create or replace function tokenize(text) returns text[] language sql strict immutable as $$ select regexp_split_to_array( lower($1), '[^[:alnum:]]' ); $$; /* use smlar's text array opclass. gist is a little more flexible than gin in this case (allows 'n' tf_method): */ create index on documents using gist ( tokenize(body) _text_sml_ops ); --24 seconds /* setup smlar: */ set smlar.type = 'tfidf'; set smlar.stattable = 'documents_body_stats'; set smlar.tf_method = 'n'; set smlar.threshold = 0.4; /* the query */ select *, smlar( tokenize(body), '{fringilla,malesuada,euismod}'::text[] ) from documents where tokenize(body) % '{fringilla,malesuada,euismod}'::text[] --where TFIDF similarity >= smlar.threshold order by smlar( tokenize(body), '{fringilla,malesuada,euismod}'::text[] ) desc limit 10;
Tuesday, August 26, 2014
TF-IDF Text Search in Postgres
Subscribe to:
Post Comments (Atom)
5 comments:
We must know how to write company profilewhen we have different profiles to write.This is really incredible to have.
Thank you very much for this post!
smlar is, unfortunately, poorly documented and your post is very valuable.
In your example you put an index on the documents table, but no index on the stats table.
Would adding an index on the stats table make any sense?
@Zltako, yes you're right. An index on the "value" column might speed things up. Thanks
Post a Comment