Tuesday, August 26, 2014

TF-IDF Text Search in Postgres


/* see http://blog.databasepatterns.com/2014/07/postgresql-install-smlar-extension.html */
create extension if not exists smlar;

/* your basic table to search against: */
create table documents (
  document_id int primary key,
  body text not null
);


/* 
   I created 100,000 "lorem ipsum" documents here http://www.mockaroo.com/c5418bd0
   In retrospect, not a great choice due to the small number of unique words used to generate the dataset
*/
copy documents 
from program 'curl "http://www.mockaroo.com/c5418bd0/download?count=100000&key=5b15a410"' 
with (format csv, header true);


/* this table holds document frequencies (# of docs in which a term appears) for the documents.body column: */
create table documents_body_stats (
  value text unique,
  ndoc int not null
);


/* ts_stat is used here for convenience; not ideal, but good for quick-and-dirty work: */
insert into documents_body_stats
  select
    word,
    ndoc
  from 
    ts_stat( 'select to_tsvector(''simple'', body) from documents' );


/* the smlar tfidf stats table needs the total document count as well. It's added as a row with null in the value column: */
insert into documents_body_stats values 
  (null, (select count(*) from documents) );


/* turn documents into arrays of words; the '+' collapses runs of non-alphanumerics so no empty tokens are produced. You could also use tsvector2textarray( to_tsvector(...) ): */
create or replace function tokenize(text) returns text[] language sql strict immutable as $$
 select regexp_split_to_array( lower($1), '[^[:alnum:]]+' );
$$;
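
/* quick sanity check of tokenize(): */
select tokenize('Lorem ipsum, dolor'); --{lorem,ipsum,dolor}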


/* use smlar's text array opclass. gist is a little more flexible than gin in this case (allows 'n' tf_method): */
create index on documents using gist ( tokenize(body) _text_sml_ops ); --24 seconds


/* setup smlar: */
set smlar.type = 'tfidf';
set smlar.stattable = 'documents_body_stats';
set smlar.tf_method = 'n';
set smlar.threshold = 0.4;


/* the query */
select
  *,
  smlar( tokenize(body), '{fringilla,malesuada,euismod}'::text[] )
from
  documents
where
  tokenize(body) % '{fringilla,malesuada,euismod}'::text[] --where TFIDF similarity >= smlar.threshold
order by
  smlar( tokenize(body), '{fringilla,malesuada,euismod}'::text[] ) desc
limit 10;
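
/* those SETs are session-local; to make them stick, something like this should work (superuser required; 'mydb' is a placeholder for your database name): */
alter database mydb set smlar.type = 'tfidf';
alter database mydb set smlar.stattable = 'documents_body_stats';
alter database mydb set smlar.tf_method = 'n';
alter database mydb set smlar.threshold = 0.4;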

Wednesday, August 20, 2014

PL/V8 vs PL/pgSQL

A not-so-scientific test comparing PL/V8 and PL/pgSQL.

Actual database access seems about the same, but from everything I have seen, PL/V8 always beats PL/pgSQL in terms of performance for logic.

create extension if not exists plv8;

create or replace function fizz_buzz_plv8() returns text volatile language plv8 as $$
 //note: result is never reset between iterations, so the numeric branch never fires;
 //this really benchmarks looping and string concatenation (same logic in both versions)
 var result = "";
 for(var i=0; i <= 100000; i++) {
  if(i % 3 == 0) result += "Fizz";
  if(i % 5 == 0) result += "Buzz";
  if(result == "") result += i;
 }
 return result;
$$;

create or replace function fizz_buzz_plpgsql() returns text volatile language plpgsql as $$
declare
 result text = '';
begin
 for i in 0 .. 100000 loop
  if i % 3 = 0 then result = result || 'Fizz'; end if;
  if i % 5 = 0 then result = result || 'Buzz'; end if;
  if result = '' then result = result || i; end if;
 end loop;
 return result;
end
$$;

select fizz_buzz_plv8(); -- 52 ms

select fizz_buzz_plpgsql(); -- 1292 ms

Saturday, August 16, 2014

pgAgent wishlist


  1. Be able to LISTEN for notifications from a Postgres database, and run a job depending on which notification fires
  2. Allow sending email after a job finishes, configurable to send only on error. You don't need to build your own MTA; just drop a message in localhost:25
  3. A yum package
  4. Use RFC 5545 RRULEs, as they're the standard for scheduling
  5. Fix the bug that shows failure when a Windows batch file returns success

Send email on error with pgAgent

pgAgent is a scheduler for Postgres. It's not bad, but doesn't have the ability to email an admin when there's an error.

Here's a hackish work-around to get that going.

Install pgAgent


On Windows, this is most easily done with Stack Builder from EnterpriseDB. Make sure the pgAgent service is running after you install pgAgent.

(Optional) Install and set up a localhost Mail Transfer Agent (MTA)

Configuring your SMTP server settings in multiple programs isn't DRY, so it's better to set up a localhost MTA and have it relay to your SMTP server. It's also dumb to have your program wait for a response from the SMTP server. A localhost MTA gives you a free queue where you can drop a message in localhost:25 and not wait.

For developing/testing on Windows, you can use smtp4dev, which will act as a localhost MTA, but open incoming messages in your default email client, such as Outlook.

(Windows only) Get a command line SMTP client


I recommend blat. It's a simple command line SMTP client.

You can send an email like this:

blat -t to@example.com -f from@example.com -subject "The Subject" -body "The body" -server host:port

Add blat to the PATH.

Create Some Views in the postgres database


Get the last time that the email job was run successfully, or -infinity:


create view email_job_last_success as 
  SELECT 
    COALESCE(
    ( SELECT 
      a.jlgstart
    FROM 
      pgagent.pga_joblog a
    JOIN 
      pgagent.pga_job b 
    ON 
      a.jlgjobid = b.jobid
    WHERE 
      b.jobname = 'email_job' 
      AND a.jlgstatus = 's' --success
    ORDER BY 
      a.jlgstart DESC
    LIMIT 1 ), 
    '-infinity'::timestamptz) AS jlgstart;

Get the list of steps that failed since the email job was last run successfully:

create view failed_steps AS 
  SELECT  
    *
  FROM 
    pgagent.pga_jobsteplog a
  JOIN 
    pgagent.pga_jobstep b 
  ON 
    a.jsljstid = b.jstid
  JOIN 
    pgagent.pga_job c 
  ON 
    b.jstjobid = c.jobid
  WHERE 
    jslstatus = 'f' --failure
    AND jobname <> 'email_job'
    AND jslstart > ( SELECT jlgstart FROM email_job_last_success)
  ORDER BY 
    jslstart DESC;

Create the Email Job


  1. Create a job called email_job (if you use a different name, you need to change the views above)
  2. Create a schedule for the job. Have it run, say, every five minutes.
  3. Add a step
    1. Name: send_email, or whatever
    2. Connection type: local
    3. Database: postgres 
    4. Kind: SQL
    5. On error: Fail
    6. Definition:

do $$
begin

  /* if no failed steps, then don't send email: */
  if (select count(*) = 0 from failed_steps) then return; end if;

  copy (
    select
      jstdbname as "Database",
      jobname as "Job",
      jstname as "Step",
      jslstart as "Step Started At",
      jslresult as "Result Code",
      jsloutput as "Output"
    from
      failed_steps
  )
  to program 
    'blat -t dba@example.com -f pgAgent@example.com -server localhost:25 -subject "pgAgent job failure"' 
  with (
    format csv, 
    header true, 
    delimiter E'\t'
  );

end $$;

Change the line after "to program" above to customize your email message. Use sendmail instead of blat on Linux. The email job will run every X minutes. It will see if there are any steps (other than its own) that failed since the email job was last run successfully.

If there are failed steps, then it will COPY them (via stdin) to the email program as CSV. The CSV will appear as the email's body. The email program will send one email with all of the failed steps.
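
To check the plumbing before relying on the schedule, you can query the two views directly:

select * from email_job_last_success; --returns -infinity until the job has succeeded once
select * from failed_steps;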

Friday, August 15, 2014

My wish list for Postgres 9.5


  1. Upsert, as always
  2. Replication with SQLite over WebSocket, with full Row Security support. Then every mobile company on the planet would want to use Postgres to sync with Android and iPhone
  3. Automatic HATEOAS RESTful API for data and schemas: JSON and CSV. CouchDB killer.
  4. Incremental / Differential backup & restore over HTTPS (like to S3)
  5. Access O/S environment variables in PL/pgSQL. Use case: so I can keep COPY paths the same on Windows and *nix
  6. bcrypt or scrypt role passwords (instead of MD5)
  7. skip_lines (default 0) and skip_comments (default '#') options for COPY so I can skip more than the first line in a CSV. Popular use case: geonames
  8. Make it dead easy for extension developers to compile on both *Nix and Windows. There are too many extensions that don't work on Windows
  9. SQL:2011 Temporal features
  10. Make Text Search a little more like Solr, basically just integrate the smlar extension

Monday, August 11, 2014

Getting geocoder.ca Canadian Postal Codes into Postgres


--download and unzip http://geocoder.ca/onetimedownload/Canada.csv.gz (9MB) to /srv/geocoder.ca/zipcodeset.txt


create table canadian_postal_codes (
  postal_code text primary key,
  latitude numeric(8,6) not null,
  longitude numeric(9,6) not null,
  place_name text,
  province_code char(2) not null
);

copy canadian_postal_codes from '/srv/geocoder.ca/zipcodeset.txt' with ( format csv, null '', encoding 'latin1' );

--there are two postal codes that are too long:
delete from canadian_postal_codes where length(postal_code) > 6;

--there are some that are lower case:
update canadian_postal_codes set postal_code = upper(postal_code) where postal_code ~ '[a-z]';

--some are "impossible", i.e. start with wrong letter for province, will update here later. 

Tuesday, August 5, 2014

geonames continents


create table continents (
 continent_code char(2) primary key,
 name text not null unique,
 geoname_id int not null unique
);

insert into continents values
('AF', 'Africa', 6255146),
('AS', 'Asia', 6255147),
('EU', 'Europe', 6255148),
('NA', 'North America', 6255149),
('OC', 'Oceania', 6255151),
('SA', 'South America', 6255150),
('AN', 'Antarctica', 6255152);

Sunday, August 3, 2014

Norvig-Style Spelling Correction in Postgres. It's reasonably fast.

I wanted to do some data matching but the input data had errors.

I decided to try Peter Norvig's spelling corrector directly in the database.

I got pretty good results. The same accuracy as Norvig, and only marginally slower (13.5 Hz vs 17 Hz). It started out about 10x slower, but I was able to speed it up after a few hours of tweaking.

Response time for a single correctly spelled word is < 10ms. For a word with edit distance 1, the response time is also < 10ms. For edit distance 2, the response time is around 100ms.

I used PL/V8 for most of it, as it seems to be the fastest of the PL languages, for calculations at least.

Here's the code:


This function finds all words within edit distance 1 of the target word. There will be 54*length+25 of them. For example, if the target word is "united" (length 6), 349 words are returned. Some will be nonsense words, so we match them against our corpus later.

/* inspired by http://blog.astithas.com/2009/08/spell-checking-in-javascript.html */

CREATE OR REPLACE FUNCTION edits1(text) RETURNS text[] LANGUAGE plv8 IMMUTABLE STRICT AS '
 var i, results = [], letters = "abcdefghijklmnopqrstuvwxyz".split("");

 // deletion
 for (i=0; i < $1.length; i++)
     results.push($1.slice(0, i) + $1.slice(i+1));

 // transposition
 for (i=0; i < $1.length-1; i++)
     results.push($1.slice(0, i) + $1.slice(i+1, i+2) + $1.slice(i, i+1) + $1.slice(i+2));
  
 // alteration
 for (i=0; i < $1.length; i++)
     letters.forEach(function (l) {
         results.push($1.slice(0, i) + l + $1.slice(i+1));
 });

 // insertion
 for (i=0; i <= $1.length; i++)
     letters.forEach(function (l) {
         results.push($1.slice(0, i) + l + $1.slice(i));
 });

 return results;
';
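
A quick sanity check of the 54*length+25 count:

select array_length( edits1('united'), 1 ); --349 = 54*6+25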


This one finds all the words at edit distance 2, by applying edits1 to each of the 1-distance words; about 100,000 unique words come back. Most are nonsense, so we join them against the corpus later.

CREATE OR REPLACE FUNCTION edits2(text[]) RETURNS text[] LANGUAGE plv8 IMMUTABLE STRICT AS '
 
 var edits1 = plv8.find_function("edits1");

 var words = {}; //using object keys avoids duplicates, dropping edits2 from ~200k to 100k

 for(var i = 0; i < $1.length; i++) {

  var e2 = edits1($1[i]); 

  for(var j = 0; j < e2.length; j++) {
   words[e2[j]] = null;
  }
 }

 return Object.keys(words);

';
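
And a rough check of the distance-2 candidate count:

select array_length( edits2( edits1('united') ), 1 ); --on the order of 100,000, as noted in the comment above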


And this one does the actual correcting. It has to be volatile due to the temp table creation. Create the temp table outside the function and you can mark it as stable, which will let Postgres cache results in the same transaction.

"big" is a table with unique terms and their counts, based on the large corpus here http://norvig.com/big.txt (6M chars, ~30,000 unique words).

CREATE OR REPLACE FUNCTION correct(text) RETURNS text LANGUAGE plpgsql VOLATILE STRICT AS $$
declare
 tmp text;
 e1 text[];
begin

  --if the word is in the corpus, return it:
  if exists( select 1 from big where word = $1 ) then 
    return $1; 
  end if;

  --get all the 1-distance words:
  e1 = edits1($1);
 
  select 
    word 
  from big 
  where 
    word = any ( select unnest( e1 ) ) 
  order by 
    nentry desc 
  limit 1 
  into tmp;
 
  --if there are 1-distance words that match, return the most common one:
  if found then 
    return tmp; 
  end if;

  --using a session temp table is much faster than comparing to an array
  create temp table if not exists edits2temp ( word text );
  truncate table edits2temp;

  --get all the 2-distance edits (~100k of them) and put in temp table:
  insert into edits2temp select unnest( edits2(e1) );

  select
    big.word
  from 
    big
    inner join edits2temp edits on big.word = edits.word
  order by big.nentry desc
  limit 1
  into tmp;

  --if there are 2-distance words that match, return the most common one:
  if found then 
    return tmp; 
  end if;

  truncate table edits2temp; 

  --nothing found, return the original word:
 return $1;

end $$;
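
Norvig's canonical example should come back via the distance-1 branch:

select correct('speling'); --spelling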
  

Saturday, August 2, 2014

postgresql where short circuit

Postgres doesn't promise an evaluation order for AND/OR operands, but a CASE expression evaluates its WHEN conditions in order and stops at the first true one. That makes it a handy way to guarantee a cheap test runs before an expensive one:

select
  'whatever'
where
  case
    when cheap_function() then true
    when expensive_function() then true
  end;
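
To see the short circuit in action, stub out the two functions (hypothetical bodies, purely for demonstration) and watch which notices fire:

create function cheap_function() returns boolean language plpgsql as $$
begin raise notice 'cheap_function called'; return true; end $$;

create function expensive_function() returns boolean language plpgsql as $$
begin raise notice 'expensive_function called'; return true; end $$;

Running the query above prints only 'cheap_function called', because CASE stops evaluating WHEN branches at the first one that's true.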