Python Unicode Normalize in list of words

417 Views Asked by At

I am preprocessing a list of words read from a file. I'm struggling to remove accents because unicodedata.normalize() only accepts a single string, not a list of strings. I am getting the following error:

TypeError: normalize() argument 2 must be str, not list

Is there a way to remove accents from every word in the list?

Many thanks

import string
# The module itself is needed (not just `normalize`): the preprocessing code
# calls both `unicodedata.normalize` and `unicodedata.category`.
import unicodedata
from unicodedata import normalize

import nltk
from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer
from nltk.corpus import stopwords

# Fetch the NLTK resources *before* reading any corpus: on a machine where
# the data has never been downloaded, reading the stop-word corpus first
# fails because there is nothing on disk to load yet.
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

# French stop words used to filter tokens during preprocessing.
french_stopwords = stopwords.words('french')
# Kept for backward compatibility: the original script rebinds the name
# `stopwords` to the French stop-word list.
stopwords = french_stopwords

lemmatizer = FrenchLefffLemmatizer()

def _strip_accents(word):
    """Return *word* with its combining accent marks removed."""
    # Imported locally so this helper does not depend on the module name
    # `unicodedata` being bound at file level.
    import unicodedata

    # NFD splits an accented letter into base letter + combining mark(s);
    # dropping category 'Mn' (nonspacing marks) leaves the base letters.
    decomposed = unicodedata.normalize('NFD', word)
    return ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')


def preprocessing(affaires):
    """Clean every sentence in *affaires* and return the cleaned sentences.

    Each sentence goes through these steps, in order:

    1. characters found in ``string.punctuation`` are dropped and the rest
       is lowercased;
    2. the text is tokenized with ``nltk.tokenize.word_tokenize``;
    3. tokens found in ``french_stopwords`` are dropped;
    4. accents are stripped from each remaining token, one token at a time
       (``unicodedata.normalize`` only accepts a single string);
    5. tokens found in ``parasites`` are dropped;
    6. each token is lemmatized with ``lemmatizer`` and the results are
       joined with single spaces.

    NOTE(review): ``parasites`` is not defined in the visible part of this
    file; it is assumed to be a module-level collection of unwanted words
    defined elsewhere -- confirm before running.

    Args:
        affaires: Iterable of strings, one sentence per item.

    Returns:
        A list holding one cleaned string per input sentence, in input order.
    """
    # Build the lookup sets once instead of scanning lists for every token.
    stop_words = set(french_stopwords)
    punctuation = set(string.punctuation)

    preprocess_list = []
    for sentence in affaires:
        no_punct = "".join(ch.lower() for ch in sentence if ch not in punctuation)
        tokens = nltk.tokenize.word_tokenize(no_punct)
        # Stop words are matched on the original spelling, i.e. before the
        # accents are removed.
        content_words = [tok for tok in tokens if tok not in stop_words]
        # Word-by-word normalization keeps the result a list of words, so
        # the `parasites` filter below compares whole words, not characters.
        unaccented = [_strip_accents(tok) for tok in content_words]
        kept = [tok for tok in unaccented if tok not in parasites]
        lemmas = (lemmatizer.lemmatize(tok) for tok in kept)
        preprocess_list.append(' '.join(lemmas))

    return preprocess_list

# Compute the cleaned text for every row of the "nom_affaire" column.
cleaned_names = preprocessing(df["nom_affaire"])
df["nom_affaire_clean"] = cleaned_names

# Move the new column to position 1: pop it out of the frame, then
# re-insert it at the wanted index.
cln = df.pop("nom_affaire_clean")
df.insert(loc=1, column='nom_affaire_clean', value=cln)

# Bare expression: displays the frame when run as the last line of a
# notebook cell.
df

1

There is 1 best solution below

0
Mark Tolonen On

unicodedata.normalize doesn't work on a list, so enumerate the list and convert each word:

import unicodedata as ud

# Sample French words, many of which differ only by their accents.
words = '''âcre âge âgé arriéré arrière bronzé collé congrès coté côte côté crêpe
           crêpé cure curé dès différent diffèrent entré mémé même pâte pâté péché
           pêche pécher pêcher pécheur pêcheur prête prêté relâche relâché retraité
           sublimé vôtre'''.split()


def strip_accents(text):
    """Return *text* with its combining accent marks removed."""
    # NFD separates each accented letter into a base letter followed by
    # combining marks; category 'Mn' identifies those marks.
    decomposed = ud.normalize('NFD', text)
    return ''.join(ch for ch in decomposed if ud.category(ch) != 'Mn')


# normalize() takes one string at a time, so convert word by word.
# Slice assignment replaces the contents of the existing list in place.
words[:] = [strip_accents(entry) for entry in words]

print(words)

Output:

['acre', 'age', 'age', 'arriere', 'arriere', 'bronze', 'colle', 'congres', 'cote', 'cote', 'cote', 'crepe', 'crepe', 'cure', 'cure', 'des', 'different', 'different', 'entre', 'meme', 'meme', 'pate', 'pate', 'peche', 'peche', 'pecher', 'pecher', 'pecheur', 'pecheur', 'prete', 'prete', 'relache', 'relache', 'retraite', 'sublime', 'votre']