I'm learning Elasticsearch using Python client and have managed to build an index and query function.
Problem: Even if I have used stemmer and stop words in the below settings, when I query a text with all the stop words to test the working, it returns some result when it shouldn't as these are all stop words and should be removed actually. What am I doing wrong here?
# Crete Own Stemmer
STEMMER_FILTER = {
"type":"stemmer",
"language": "english",
}
# create own Stopwords
STOPWORD_FILTER = {
"type":"stop",
"stopwords":"_english_",
"ignore_case": True,
}
# Create your own Synonyms and mappings
SYNONYM_FILTER = {
"type":"synonym",
"synonyms":[
"i-pod, ipod",
"universe, cosmos"
]
}
custom_synonyms = {
"type": "synonym_graph",
"synonyms": [
"mind, brain",
"brain storm, brainstorm, envisage"
]
}
custom_index_analyzer = {
"tokenizer": "whitespace", #"standard",
"filter": [
"lowercase",
"asciifolding",
"stemmer", # how to use custom stemmer?
"stop"]} # how to use custom stop words?
custom_search_analyzer = {
"tokenizer": "whitespace", #"standard"
"filter": [
"lowercase",
"asciifolding",
"stemmer",
"stop"]}
INDEX_BODY = {
"settings": {
"index": {
"analysis": {
"analyzer": {
"custom_index_time_analyzer": custom_index_analyzer,
"custom_search_time_analyzer": custom_search_analyzer
},
"filter": {"my_graph_synonyms": custom_synonyms,
"english_stemmer": STEMMER_FILTER,
"english_stop": STOPWORD_FILTER,
"synonym": SYNONYM_FILTER},
}
}
},
"mappings": {
"properties": {
"que_op": {"type": "text", "analyzer": "custom_index_time_analyzer", "search_analyzer": "custom_search_time_analyzer"}
}}}
es.indices.create(index="questions", mappings=INDEX_BODY["mappings"], settings=INDEX_BODY["settings"])
bulk_data = []
for i,row in tqdm(df.iterrows()):
bulk_data.append(
{
"_index": index_name,
"_id": i,
"_source": {
"que_op": row["que_op"]
}
}
)
bulk(es, bulk_data)
Now the problem is that when I search a list of all the stopwords in Elasticsearch , it gives me results.
def full_text_search(index_name:str, query_string:str, search_on_field:str = 'que_op', size:int = 10):
query = {"match": {search_on_field: query_string}}
return es.search(index = index_name, query = query, size = size, pretty = True)
full_text_search("questions", "a an and are as at be but by for if in into is it no not of on or such that the their then there these they this to was will with", size = 3)