NLP text preprocessing in a DataFrame: what is the correct order?

I’m trying to preprocess a data frame with two columns, "title" and "body". Each cell contains a string.

Based on this article I tried to reproduce the preprocessing. However, there is clearly something I am not getting right: the order in which to apply each step, and the type each function expects. I keep getting errors like 'list' object has no attribute ..., or 'str' object has no attribute ..., and so on.

Here is what I have done:

import re
import string

import contractions
import nltk
import pandas as pd
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from textblob import TextBlob
from unidecode import unidecode

def lemmatize_pos_tagged_text(text, lemmatizer, pos_tag_dict):
    sentences = nltk.sent_tokenize(text)
    new_sentences = []

    for sentence in sentences:
        sentence = sentence.lower()
        new_sentence_words = []    
        pos_tuples = nltk.pos_tag(nltk.word_tokenize(sentence)) 

        for word_idx, word in enumerate(nltk.word_tokenize(sentence)):
            nltk_word_pos = pos_tuples[word_idx][1]
            wordnet_word_pos = pos_tag_dict.get(nltk_word_pos[0].upper(), None)
            if wordnet_word_pos is not None:
                new_word = lemmatizer.lemmatize(word, wordnet_word_pos)
            else:
                new_word = lemmatizer.lemmatize(word)

            new_sentence_words.append(new_word)

        new_sentence = " ".join(new_sentence_words)
        new_sentences.append(new_sentence)

    return " ".join(new_sentences)

def processing_steps(df):
    lemmatizer = WordNetLemmatizer()
    pos_tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
    local_stopwords = set(stopwords.words('english'))
    additional_stopwords = ["http", "u", "get", "like", "let", "nan"]
    words_to_keep = ["i'", " i ", "me", "my", "we", "our", "us"]
    local_stopwords.update(additional_stopwords)
    # Keep these words: drop them from the stopword set (removing items from
    # words_to_keep while iterating over it would skip entries)
    for word in words_to_keep:
        local_stopwords.discard(word)

    for column in df.columns:

        # Tokenization
        df[column] = df[column].apply(lambda x: word_tokenize(x))

        # Lowercasing each word within the list
        df[column] = df[column].apply(lambda x: [word.lower() for word in x])

        # Removing stopwords
        df[column] = df[column].apply(lambda tokens: [word for word in tokens if word.isalpha() and word not in local_stopwords])

        # Replace diacritics
        df[column] = df[column].apply(lambda x: unidecode(x, errors="preserve"))

        # Expand contractions
        df[column] = df[column].apply(lambda x: " ".join([contractions.fix(expanded_word) for expanded_word in x.split()]))

        # Remove numbers
        df[column] = df[column].apply(lambda x: re.sub(r'\d+', '', x))

        # Typos correction
        df[column] = df[column].apply(lambda x: str(TextBlob(x).correct()))

        # Remove punctuation except period
        df[column] = df[column].apply(lambda x: re.sub('[%s]' % re.escape(string.punctuation.replace('.', '')), '' , x))

        # Remove double space
        df[column] = df[column].apply(lambda x: re.sub(' +', ' ', x))

        # Lemmatization
        df[column] = df[column].apply(lambda x: lemmatize_pos_tagged_text(x, lemmatizer, pos_tag_dict))

    return df

As an example, here is the error message I get with the current state of the function. Keep in mind that whenever I change things, like commenting out the splitting part, I get another type or attribute error. So the question really is: what is the proper order? And how do I handle the fact that different functions need different types when processing the same element?

     49 
     50         # Expand contractions
---> 51         df[column] = df[column].apply(lambda x: " ".join([contractions.fix(expanded_word) for expanded_word in x.split()]))
     52 
     53         # Remove numbers

AttributeError: 'list' object has no attribute 'split'

Any conceptual explanation is very welcome!

1 Answer
I got it: the issue was in the first few lines of processing_steps().

I was tokenizing the elements, turning each cell into a list of words, and then passing that list to functions that expect a string, not a list.
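
A minimal sketch of the mismatch (the sentence is just a made-up example):

tokens = word_tokenize("I'm trying to preprocess this text.")
print(tokens)                    # a list: ['I', "'m", 'trying', 'to', ...]
# tokens.split()                 # AttributeError: 'list' object has no attribute 'split'
print(" ".join(tokens).split())  # works: turn the list back into a string first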

So I just had to wrap each step in a list comprehension that iterates through the list in each cell, by adding ... for word in x. Here is the completed function, with some other adjustments as well:

def processing_steps(df):

    # Same setup as before, now defined where the pipeline can see it
    lemmatizer = WordNetLemmatizer()
    pos_tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
    local_stopwords = set(stopwords.words('english'))
    local_stopwords.update(["http", "u", "get", "like", "let", "nan"])

    new_data = {}

    for column in df.columns:

        # Tokenization: each cell becomes a list of words
        results = df[column].apply(word_tokenize)

        # Lowercasing each word within the list
        results = results.apply(lambda x: [word.lower() for word in x])

        # Removing stopwords and non-alphabetic tokens
        results = results.apply(lambda tokens: [word for word in tokens if word.isalpha() and word not in local_stopwords])

        # Replace diacritics
        results = results.apply(lambda x: [unidecode(word, errors="preserve") for word in x])

        # Expand contractions
        results = results.apply(lambda x: [" ".join([contractions.fix(expanded_word) for expanded_word in word.split()]) for word in x])

        # Remove numbers
        results = results.apply(lambda x: [re.sub(r'\d+', '', word) for word in x])

        # Typos correction
        #results = results.apply(lambda x: [str(TextBlob(word).correct()) for word in x])

        # Remove punctuation except period
        results = results.apply(lambda x: [re.sub('[%s]' % re.escape(string.punctuation.replace('.', '')), '', word) for word in x])

        # Remove double spaces
        results = results.apply(lambda x: [re.sub(' +', ' ', word) for word in x])

        # Lemmatization
        results = results.apply(lambda x: [lemmatize_pos_tagged_text(word, lemmatizer, pos_tag_dict) for word in x])

        new_data[column] = results

    new_df = pd.DataFrame(new_data)

    return new_df
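
For completeness, here is a quick way to try it out (the two sample rows are made up):

sample = pd.DataFrame({
    "title": ["Cats are running!", "NLP preprocessing 101"],
    "body": ["I'm testing the pipeline on 2 tiny documents.",
             "It removes stopwords, numbers and punctuation."],
})

print(processing_steps(sample))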