How to lemmatize pos tagged column in dataframe

81 Views Asked by At

I have a Dataframe of some tweets about the Russia-Ukraine conflict and I have pos_tagged the tweets after cleaning and want to lemmatize postagged column. My code returns only the first pos_tagged word as a lemma. How can I correctly lemmatize the pos_tagged column?

Function to pos tag cleaned tweets:

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk import pos_tag
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.corpus import wordnet

# POS tagger dictionary
pos_dict = {'J':wordnet.ADJ, 'V':wordnet.VERB, 'N':wordnet.NOUN, 'R':wordnet.ADV}
def token_stop_pos(text):
    tags = pos_tag(word_tokenize(text))
    newlist = []
    for word, tag in tags:
        if word.lower() not in set(stopwords.words('english')):
          newlist.append(tuple([word, pos_dict.get(tag[0])]))
    return newlist

df['POS_tagged'] = df['cleanedTweets'].apply(token_stop_pos)

Function to lemmatize df['POS_tagged'] column:

from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
def lemmatize(pos_data):
    lemma_rew = " "
    for word, pos in pos_data:
      if not pos:
        lemma = word
        lemma_rew = lemma_rew + " " + lemma
      else:
        lemma = wordnet_lemmatizer.lemmatize(word, pos=pos)
        lemma_rew = lemma_rew + " " + lemma
      return lemma_rew

df['Lemma'] = df['POS_tagged'].apply(lemmatize)

Output

Data after lemmatizing

0

There are 0 best solutions below