Include stop words on pre-defined phrases in Postgres tsvector

Question

Include stop words on pre-defined phrases in Postgres tsvector

341 Views Asked by Eric.M At 04 August 2022 at 04:21

I have built a search engine using Postgres that is working pretty well. I have used hunspell dictionaries for the main languages I support, this is how I set them up:

CREATE EXTENSION IF NOT EXISTS unaccent WITH SCHEMA public;

ALTER TEXT SEARCH CONFIGURATION english_unaccent_hunspell
  ALTER MAPPING FOR asciiword, asciihword, hword_asciipart, word, hword, hword_part WITH unaccent,
  english_hunspell,
  english_stem;

CREATE TEXT SEARCH CONFIGURATION portuguese_brazil_unaccent_hunspell (
  COPY = portuguese_brazil_hunspell
);

ALTER TEXT SEARCH CONFIGURATION portuguese_brazil_unaccent_hunspell
  ALTER MAPPING FOR asciiword, asciihword, hword_asciipart, word, hword, hword_part WITH unaccent,
  portuguese_brazil_hunspell,
  portuguese_stem;

CREATE TEXT SEARCH CONFIGURATION spanish_unaccent_hunspell (
  COPY = spanish_hunspell
);

ALTER TEXT SEARCH CONFIGURATION spanish_unaccent_hunspell
  ALTER MAPPING FOR asciiword, asciihword, hword_asciipart, word, hword, hword_part WITH unaccent,
  spanish_hunspell,
  spanish_stem;

CREATE TEXT SEARCH CONFIGURATION italian_unaccent_hunspell (
  COPY = italian_hunspell
);

ALTER TEXT SEARCH CONFIGURATION italian_unaccent_hunspell
  ALTER MAPPING FOR asciiword, asciihword, hword_asciipart, word, hword, hword_part WITH unaccent,
  italian_hunspell,
  italian_stem;

CREATE TEXT SEARCH CONFIGURATION russian_unaccent_hunspell (
  COPY = russian_hunspell
);

ALTER TEXT SEARCH CONFIGURATION russian_unaccent_hunspell
  ALTER MAPPING FOR asciiword, asciihword, hword_asciipart, word, hword, hword_part WITH unaccent,
  russian_hunspell,
  russian_stem;

CREATE TEXT SEARCH CONFIGURATION french_unaccent_hunspell (
  COPY = french_hunspell
);

ALTER TEXT SEARCH CONFIGURATION french_unaccent_hunspell
  ALTER MAPPING FOR asciiword, asciihword, hword_asciipart, word, hword, hword_part WITH unaccent,
  french_hunspell,
  french_stem;

CREATE TEXT SEARCH CONFIGURATION german_unaccent_hunspell (
  COPY = german_hunspell
);

ALTER TEXT SEARCH CONFIGURATION german_unaccent_hunspell
  ALTER MAPPING FOR asciiword, asciihword, hword_asciipart, word, hword, hword_part WITH unaccent,
  german_hunspell,
  german_stem;

ALTER TABLE "earliest_search_indices"
  ADD COLUMN "documentFts" tsvector;

ALTER TABLE "latest_search_indices"
  ADD COLUMN "documentFts" tsvector;

UPDATE
  "earliest_search_indices"
SET
  "documentFts" = (setweight(to_tsvector('english_unaccent_hunspell', coalesce(title,'')), 'A') || setweight(to_tsvector('english_unaccent_hunspell', coalesce("directoryDescription",'')), 'B') || setweight(to_tsvector('english_unaccent_hunspell', coalesce(body,'')), 'C'))
WHERE
  "language" = 'english';

UPDATE
  "earliest_search_indices"
SET
  "documentFts" = (setweight(to_tsvector('portuguese_brazil_unaccent_hunspell', coalesce(title,'')), 'A') || setweight(to_tsvector('portuguese_brazil_unaccent_hunspell', coalesce("directoryDescription",'')), 'B') || setweight(to_tsvector('portuguese_brazil_unaccent_hunspell', coalesce(body,'')), 'C'))
WHERE
  "language" = 'portuguese';

UPDATE
  "earliest_search_indices"
SET
  "documentFts" = (setweight(to_tsvector('spanish_unaccent_hunspell', coalesce(title,'')), 'A') || setweight(to_tsvector('spanish_unaccent_hunspell', coalesce("directoryDescription",'')), 'B') || setweight(to_tsvector('spanish_unaccent_hunspell', coalesce(body,'')), 'C'))
WHERE
  "language" = 'spanish';

UPDATE
  "earliest_search_indices"
SET
  "documentFts" = (setweight(to_tsvector('french_unaccent_hunspell', coalesce(title,'')), 'A') || setweight(to_tsvector('french_unaccent_hunspell', coalesce("directoryDescription",'')), 'B') || setweight(to_tsvector('french_unaccent_hunspell', coalesce(body,'')), 'C'))
WHERE
  "language" = 'french';

UPDATE
  "earliest_search_indices"
SET
  "documentFts" = (setweight(to_tsvector('italian_unaccent_hunspell', coalesce(title,'')), 'A') || setweight(to_tsvector('italian_unaccent_hunspell', coalesce("directoryDescription",'')), 'B') || setweight(to_tsvector('italian_unaccent_hunspell', coalesce(body,'')), 'C'))
WHERE
  "language" = 'italian';

UPDATE
  "earliest_search_indices"
SET
  "documentFts" = (setweight(to_tsvector('german_unaccent_hunspell', coalesce(title,'')), 'A') || setweight(to_tsvector('german_unaccent_hunspell', coalesce("directoryDescription",'')), 'B') || setweight(to_tsvector('german_unaccent_hunspell', coalesce(body,'')), 'C'))
WHERE
  "language" = 'german';

UPDATE
  "earliest_search_indices"
SET
  "documentFts" = (setweight(to_tsvector('russian_unaccent_hunspell', coalesce(title,'')), 'A') || setweight(to_tsvector('russian_unaccent_hunspell', coalesce("directoryDescription",'')), 'B') || setweight(to_tsvector('russian_unaccent_hunspell', coalesce(body,'')), 'C'))
WHERE
  "language" = 'russian';

CREATE INDEX entries_document_fts ON "earliest_search_indices" USING GIN ("documentFts");

The dictionaries I use live here:

https://github.com/ericmackrodt/hunspell_dicts

That's all good, and it behaves exactly how I want, but there are some issues due to the stop word elimination. For the most part it works great, but there are some exceptions where keeping the stop words would be super relevant. Here are some examples:

The Sims - This results in a search for "sims" as the word "the" is eliminated.
Doctor Who - This results in a search for "doctor" as the word "who" is eliminated.
The Who - This results in a search for "" as both "the" and "who" are eliminated.

So, my question is, how could I add those kinds of exceptions to my dictionaries? Like, if the word "who" is preceded by "doctor", then index them together.

I don't mind having to add those exceptions by hand.

Thanks in advance.

Original Q&A

There are 1 best solutions below

**RSNboim** · Answer 1 · 2022-11-23T15:57:00.137000

RSNboim On 23 November 2022 at 15:57

You can change the list of stop words by configuring the needed dictionaries:

http://www.postgresql.org/docs/current/static/textsearch-dictionaries.html

as answered in this SO answer

Include stop words on pre-defined phrases in Postgres tsvector

There are 1 best solutions below

Related Questions in POSTGRESQL

Related Questions in DICTIONARY

Related Questions in FULL-TEXT-SEARCH

Related Questions in STOP-WORDS

Related Questions in TSVECTOR

Trending Questions

Popular # Hahtags

Popular Questions