I have built a search engine using Postgres that is working pretty well. I have used hunspell dictionaries for the main languages I support, this is how I set them up:
CREATE EXTENSION IF NOT EXISTS unaccent WITH SCHEMA public;
ALTER TEXT SEARCH CONFIGURATION english_unaccent_hunspell
ALTER MAPPING FOR asciiword, asciihword, hword_asciipart, word, hword, hword_part WITH unaccent,
english_hunspell,
english_stem;
CREATE TEXT SEARCH CONFIGURATION portuguese_brazil_unaccent_hunspell (
COPY = portuguese_brazil_hunspell
);
ALTER TEXT SEARCH CONFIGURATION portuguese_brazil_unaccent_hunspell
ALTER MAPPING FOR asciiword, asciihword, hword_asciipart, word, hword, hword_part WITH unaccent,
portuguese_brazil_hunspell,
portuguese_stem;
CREATE TEXT SEARCH CONFIGURATION spanish_unaccent_hunspell (
COPY = spanish_hunspell
);
ALTER TEXT SEARCH CONFIGURATION spanish_unaccent_hunspell
ALTER MAPPING FOR asciiword, asciihword, hword_asciipart, word, hword, hword_part WITH unaccent,
spanish_hunspell,
spanish_stem;
CREATE TEXT SEARCH CONFIGURATION italian_unaccent_hunspell (
COPY = italian_hunspell
);
ALTER TEXT SEARCH CONFIGURATION italian_unaccent_hunspell
ALTER MAPPING FOR asciiword, asciihword, hword_asciipart, word, hword, hword_part WITH unaccent,
italian_hunspell,
italian_stem;
CREATE TEXT SEARCH CONFIGURATION russian_unaccent_hunspell (
COPY = russian_hunspell
);
ALTER TEXT SEARCH CONFIGURATION russian_unaccent_hunspell
ALTER MAPPING FOR asciiword, asciihword, hword_asciipart, word, hword, hword_part WITH unaccent,
russian_hunspell,
russian_stem;
CREATE TEXT SEARCH CONFIGURATION french_unaccent_hunspell (
COPY = french_hunspell
);
ALTER TEXT SEARCH CONFIGURATION french_unaccent_hunspell
ALTER MAPPING FOR asciiword, asciihword, hword_asciipart, word, hword, hword_part WITH unaccent,
french_hunspell,
french_stem;
CREATE TEXT SEARCH CONFIGURATION german_unaccent_hunspell (
COPY = german_hunspell
);
ALTER TEXT SEARCH CONFIGURATION german_unaccent_hunspell
ALTER MAPPING FOR asciiword, asciihword, hword_asciipart, word, hword, hword_part WITH unaccent,
german_hunspell,
german_stem;
ALTER TABLE "earliest_search_indices"
ADD COLUMN "documentFts" tsvector;
ALTER TABLE "latest_search_indices"
ADD COLUMN "documentFts" tsvector;
UPDATE
"earliest_search_indices"
SET
"documentFts" = (setweight(to_tsvector('english_unaccent_hunspell', coalesce(title,'')), 'A') || setweight(to_tsvector('english_unaccent_hunspell', coalesce("directoryDescription",'')), 'B') || setweight(to_tsvector('english_unaccent_hunspell', coalesce(body,'')), 'C'))
WHERE
"language" = 'english';
UPDATE
"earliest_search_indices"
SET
"documentFts" = (setweight(to_tsvector('portuguese_brazil_unaccent_hunspell', coalesce(title,'')), 'A') || setweight(to_tsvector('portuguese_brazil_unaccent_hunspell', coalesce("directoryDescription",'')), 'B') || setweight(to_tsvector('portuguese_brazil_unaccent_hunspell', coalesce(body,'')), 'C'))
WHERE
"language" = 'portuguese';
UPDATE
"earliest_search_indices"
SET
"documentFts" = (setweight(to_tsvector('spanish_unaccent_hunspell', coalesce(title,'')), 'A') || setweight(to_tsvector('spanish_unaccent_hunspell', coalesce("directoryDescription",'')), 'B') || setweight(to_tsvector('spanish_unaccent_hunspell', coalesce(body,'')), 'C'))
WHERE
"language" = 'spanish';
UPDATE
"earliest_search_indices"
SET
"documentFts" = (setweight(to_tsvector('french_unaccent_hunspell', coalesce(title,'')), 'A') || setweight(to_tsvector('french_unaccent_hunspell', coalesce("directoryDescription",'')), 'B') || setweight(to_tsvector('french_unaccent_hunspell', coalesce(body,'')), 'C'))
WHERE
"language" = 'french';
UPDATE
"earliest_search_indices"
SET
"documentFts" = (setweight(to_tsvector('italian_unaccent_hunspell', coalesce(title,'')), 'A') || setweight(to_tsvector('italian_unaccent_hunspell', coalesce("directoryDescription",'')), 'B') || setweight(to_tsvector('italian_unaccent_hunspell', coalesce(body,'')), 'C'))
WHERE
"language" = 'italian';
UPDATE
"earliest_search_indices"
SET
"documentFts" = (setweight(to_tsvector('german_unaccent_hunspell', coalesce(title,'')), 'A') || setweight(to_tsvector('german_unaccent_hunspell', coalesce("directoryDescription",'')), 'B') || setweight(to_tsvector('german_unaccent_hunspell', coalesce(body,'')), 'C'))
WHERE
"language" = 'german';
UPDATE
"earliest_search_indices"
SET
"documentFts" = (setweight(to_tsvector('russian_unaccent_hunspell', coalesce(title,'')), 'A') || setweight(to_tsvector('russian_unaccent_hunspell', coalesce("directoryDescription",'')), 'B') || setweight(to_tsvector('russian_unaccent_hunspell', coalesce(body,'')), 'C'))
WHERE
"language" = 'russian';
CREATE INDEX entries_document_fts ON "earliest_search_indices" USING GIN ("documentFts");
The dictionaries I use live here:
https://github.com/ericmackrodt/hunspell_dicts
That's all good, and it behaves exactly how I want, but there are some issues due to the stop word elimination. For the most part it works great, but there are some exceptions where keeping the stop words would be super relevant. Here are some examples:
- The Sims - This results in a search for "sims" as the word "the" is eliminated.
- Doctor Who - This results in a search for "doctor" as the word "who" is eliminated.
- The Who - This results in a search for "" as both "the" and "who" are eliminated.
So, my question is, how could I add those kinds of exceptions to my dictionaries? Like, if the word "who" is preceded by "doctor", then index them together.
I don't mind having to add those exceptions by hand.
Thanks in advance.
You can change the list of stop words by configuring the needed dictionaries:
http://www.postgresql.org/docs/current/static/textsearch-dictionaries.html
as answered in this SO answer