Lucene: Way to have case-insensitive MappingCharFilter or apply LowerCaseFilter before it?

72 Views Asked by At

Is there a way to have a LowerCaseFilter before the MappingCharFilter? I realize I could just lowercase the search input before passing it into Lucene, but this Analyzer is only for a specific field, so it would be much preferable to pack the lowercasing into the analyzer. I did come across a regex filter, but I have ~100 terms in my map and didn't think it was prudent to have ~100 regex filters.

    /// <summary>
    /// Analyzer that passes the raw input through a <see cref="MappingCharFilter"/>
    /// and then tokenizes it with <see cref="StandardTokenizer"/> followed by
    /// <see cref="StandardFilter"/>.
    /// </summary>
    public class CustomAnalyzer : Analyzer
    {
        /// <summary>
        /// Wraps the incoming reader in the mapping char filter.
        /// </summary>
        /// <param name="fieldName">Name of the field being analyzed.</param>
        /// <param name="reader">Raw character source supplied by the caller.</param>
        /// <returns>The reader with the character mapping applied.</returns>
        /// <remarks>
        /// The char filter is attached here rather than in CreateComponents because
        /// Lucene caches the components it creates: on reuse it only hands the
        /// tokenizer the reader returned by InitReader, so a filter built inside
        /// CreateComponents would be applied to the first input only.
        /// NOTE(review): this runs for every token stream, so MyNormalizedCharMap()
        /// should return a cached map rather than rebuilding it - confirm.
        /// </remarks>
        protected override System.IO.TextReader InitReader(string fieldName, System.IO.TextReader reader)
        {
            return new MappingCharFilter(MyNormalizedCharMap(), base.InitReader(fieldName, reader));
        }

        /// <summary>
        /// Builds the tokenizer/filter chain for a field.
        /// </summary>
        /// <param name="fieldName">Name of the field being analyzed (not used to vary the chain).</param>
        /// <param name="reader">Character source to tokenize.</param>
        /// <returns>The tokenizer together with the end of its filter chain.</returns>
        protected override TokenStreamComponents CreateComponents(string fieldName, System.IO.TextReader reader)
        {
            Tokenizer tokenizer = new StandardTokenizer(IndexConfig.LUCENE_VERSION, reader);
            TokenStream tokenStream = new StandardFilter(IndexConfig.LUCENE_VERSION, tokenizer);

            return new TokenStreamComponents(tokenizer, tokenStream);
        }
    }

I'm thinking that maybe I need a custom TokenFilter to support this, if you have a good example of how to write one for Lucene.net, please share!

1

There is 1 best solution below

0
azulBonnet On
/// <summary>
/// Analyzer that tokenizes with <see cref="StandardTokenizer"/>, lowercases each
/// token, and then replaces tokens found in a shared lookup dictionary.
/// </summary>
public class NormalizingAnalyzer : Analyzer
{
    // Lookup table shared by every instance; filled once by the static constructor.
    private static Dictionary<string, string> _normalizingDictionary;

    // MyNormalizingDictionary() is defined outside this snippet.
    static NormalizingAnalyzer() => _normalizingDictionary = MyNormalizingDictionary();

    /// <summary>
    /// Builds the chain: StandardTokenizer, then LowerCaseFilter, then DictionaryReplaceFilter.
    /// </summary>
    /// <param name="fieldName">Name of the field being analyzed (not used to vary the chain).</param>
    /// <param name="reader">Character source to tokenize.</param>
    /// <returns>The tokenizer together with the end of its filter chain.</returns>
    protected override TokenStreamComponents CreateComponents(string fieldName, System.IO.TextReader reader)
    {
        Tokenizer source = new StandardTokenizer(IndexConfig.LUCENE_VERSION, reader);

        // Lowercasing happens before the dictionary lookup, so tokens reach the
        // replacement filter already lowercased.
        TokenStream chain = new DictionaryReplaceFilter(
            new LowerCaseFilter(IndexConfig.LUCENE_VERSION, source),
            _normalizingDictionary);

        return new TokenStreamComponents(source, chain);
    }
}

As an alternative to adding the LowerCaseFilter, you can create your dictionary with a case-insensitive comparer: new Dictionary<string,string>(StringComparer.OrdinalIgnoreCase)

/// <summary>
/// Token filter that replaces a whole term with its mapped value when the term is
/// a key of the supplied dictionary; all other terms pass through unchanged.
/// </summary>
public sealed class DictionaryReplaceFilter : TokenFilter
{
    private readonly ICharTermAttribute _termAttribute;
    private readonly Dictionary<string, string> _termReplacements;

    /// <summary>
    /// Creates the filter on top of an existing token stream.
    /// </summary>
    /// <param name="input">Upstream token stream to read from.</param>
    /// <param name="termReplacements">
    /// Map from term text to replacement text. Key matching follows the
    /// dictionary's own comparer.
    /// </param>
    /// <exception cref="System.ArgumentNullException">
    /// <paramref name="termReplacements"/> is null.
    /// </exception>
    public DictionaryReplaceFilter(TokenStream input, Dictionary<string, string> termReplacements) : base(input)
    {
        // Fail at construction instead of with a NullReferenceException on the first token.
        _termReplacements = termReplacements
            ?? throw new System.ArgumentNullException(nameof(termReplacements));

        // AddAttribute returns the shared instance if it already exists and registers
        // it otherwise; GetAttribute would throw when no upstream stream declared it.
        _termAttribute = AddAttribute<ICharTermAttribute>();
    }

    /// <summary>
    /// Advances to the next token and rewrites its term if a replacement is registered.
    /// </summary>
    /// <returns>True if a token is available; false once the upstream stream is exhausted.</returns>
    public override bool IncrementToken()
    {
        if (!m_input.IncrementToken())
        {
            return false;
        }

        // Swap the term text in place when the dictionary has an entry for it.
        if (_termReplacements.TryGetValue(_termAttribute.ToString(), out string replacement))
        {
            _termAttribute.SetEmpty().Append(replacement);
        }

        return true;
    }
}