Unable to produce visualisations to calculate topic frequency for LSI model


I am trying to create a graph which shows the frequency of the topics for my LSI model. I was able to do this for my LDA model using the same code.

When I try to visualise my LSI topics I get error messages as shown below.

The code to create the models is below:

# Imports
import pandas as pd
import gensim
import spacy
from gensim import corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from nltk.corpus import stopwords  # stop word list; NLTK assumed
stop_words = stopwords.words('english')

# Import CSV
df_train = pd.read_csv("Fold_2.csv", engine='python', encoding='latin-1')
# Convert to list
data = df_train['Post'].values.tolist()


# Change sentences to words
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
data_words = list(sent_to_words(data))

# Create bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=10)  # higher threshold, fewer phrases
trigram = gensim.models.Phrases(bigram[data_words], threshold=10)  

bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out


# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)

nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization, keeping only nouns
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN'])


# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized
corpus = [id2word.doc2bow(text) for text in texts]

#Create LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=10,
                                           update_every=1,
                                           chunksize=100,
                                           passes=5,
                                           alpha='auto',
                                           per_word_topics=True)
# Get the keywords in the topics
doc_lda = lda_model[corpus]
x=lda_model.show_topics(num_topics=20, num_words=5,formatted=False)
topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]
print("LDA Model")

# Print The words
name = 0
words_together_list = []
for topic,words in topics_words:
        words_together= " ".join(words)
        words_together_list.append(words_together)
        name = name + 1
        print("The key word of Topic ", topic, " was: ", words_together)
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print("Coherence: ", coherence_lda)

lsi_model = gensim.models.lsimodel.LsiModel(
   corpus=corpus, id2word=id2word, num_topics=20,chunksize=100
)
   
print("")
print("LSI Model")

# Print the Keywords in the topics
doc_lsi = lsi_model[corpus]

x=lsi_model.show_topics(num_topics=20, num_words=5,formatted=False)
topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]

# Print The words
name = 0
words_together_list = []
for topic,words in topics_words:
        words_together= " ".join(words)
        words_together_list.append(words_together)
        name = name + 1
        print("The key word of Topic ", topic, " was: ", words_together)
coherence_model_lsi = CoherenceModel(model=lsi_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lsi = coherence_model_lsi.get_coherence()
print("Coherence: ", coherence_lsi)

The code for the visualisation of LSI topics is below. The same Python code worked for my LDA model when lda_model was referenced instead of lsi_model.

# Create a function to calculate topics per post
def topics_per_post(model, corpus, start=0, end=1):
    corpus_selected = corpus[start:end]
    dominant_topics = []
    topic_percentages = []
    for i, corp in enumerate(corpus_selected):
        topic_percs, wordid_topics, wordid_phivalues = model[corp]
        dominant_topic = sorted(topic_percs, key = lambda x: x[1], reverse=True)[0][0]
        dominant_topics.append((i, dominant_topic))
        topic_percentages.append(topic_percs)
    return(dominant_topics, topic_percentages)       

dominant_topics, topic_percentages = topics_per_post(model=lsi_model, corpus=corpus, end=-1)            

# create bar graph of topic frequency
df = pd.DataFrame(dominant_topics, columns=['Document_Id', 'Dominant_Topic'])
dominant_topic_in_each_doc = df.groupby('Dominant_Topic').size()
df_dominant_topic_in_each_doc = dominant_topic_in_each_doc.to_frame(name='count').reset_index()

df_dominant_topic_in_each_doc['count'].plot(kind='bar', title='Topic Frequency', ylabel='Frequency',
         xlabel='Topic', figsize=(6, 5))

This is the error message produced:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-42-96b45968c3a6> in <module>()
----> 1 dominant_topics, topic_percentages = topics_per_post(model=lsi_model, corpus=corpus, end=-1)
      2 
      3 # create bar graph of topic frequency
      4 df = pd.DataFrame(dominant_topics, columns=['Document_Id', 'Dominant_Topic'])
      5 dominant_topic_in_each_doc = df.groupby('Dominant_Topic').size()

<ipython-input-26-541251ac2e71> in topics_per_post(model, corpus, start, end)
      5     topic_percentages = []
      6     for i, corp in enumerate(corpus_selected):
----> 7         topic_percs, wordid_topics, wordid_phivalues = model[corp]
      8         dominant_topic = sorted(topic_percs, key = lambda x: x[1], reverse=True)[0][0]
      9         dominant_topics.append((i, dominant_topic))

ValueError: too many values to unpack (expected 3)
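
I think the unpacking fails because lda_model[corp] returns three values when the model is trained with per_word_topics=True, whereas lsi_model[corp] returns a single list of (topic_id, weight) pairs, roughly like this:

# LDA trained with per_word_topics=True yields a 3-tuple per document:
topic_percs, wordid_topics, wordid_phivalues = lda_model[corpus[0]]

# LSI yields only a list of (topic_id, weight) pairs, so there is
# nothing to unpack into three variables:
print(lsi_model[corpus[0]])  # e.g. [(0, 0.42), (1, -0.13), ...]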

I also tried pyLDAvis; however, this also produced an error.

# Import pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

# Visualise the topics for LSI
lsi_viz = gensimvis.prepare(lsi_model, corpus, id2word)
lsi_viz

This produced the following error:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-44-d9de743e0c86> in <module>()
      5 
      6 # Visualise the topics for LSI
----> 7 lsi_viz = gensimvis.prepare(lsi_model, corpus, id2word)
      8 lsi_viz

1 frames
/usr/local/lib/python3.7/dist-packages/pyLDAvis/gensim_models.py in _extract_data(topic_model, corpus, dictionary, doc_topic_dists)
     47             gamma = topic_model.inference(corpus)
     48         else:
---> 49             gamma, _ = topic_model.inference(corpus)
     50         doc_topic_dists = gamma / gamma.sum(axis=1)[:, None]
     51     else:

AttributeError: 'LsiModel' object has no attribute 'inference'
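
From the traceback, pyLDAvis's gensim_models module calls topic_model.inference(corpus), and inference() is only implemented by gensim's LdaModel, so it appears pyLDAvis does not support LsiModel at all.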

I have done some research and cannot find examples of calculating the frequency of topics across all documents for LSI using Gensim. I have also searched for these errors on Stack Overflow and cannot find a solution.

1 Answer

Found the answer :)

sent_topics_rows = []

# Get the main topic in each document
for i, row in enumerate(lsi_model[corpus]):
    # LSI returns (topic_id, weight) pairs; sort by weight, largest first
    row = sorted(row, key=lambda x: x[1], reverse=True)
    # Keep the dominant topic, its contribution and its keywords
    topic_num, prop_topic = row[0]
    wp = lsi_model.show_topic(topic_num)
    topic_keywords = ", ".join([word for word, prop in wp])
    sent_topics_rows.append([int(topic_num), round(prop_topic, 4), topic_keywords])

# Build the DataFrame in one go (DataFrame.append was removed in pandas 2.0)
sent_topics_df = pd.DataFrame(sent_topics_rows,
                              columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])
contents = pd.Series(texts)
sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)

# Format
df_dominant_topic = sent_topics_df.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
dominant_topic_in_each_doc = df_dominant_topic.groupby('Dominant_Topic').size()
df_dominant_topic_in_each_doc = dominant_topic_in_each_doc.to_frame(name='count').reset_index()

df_dominant_topic_in_each_doc['count'].plot(kind='bar', title='Topic Frequency', ylabel='Frequency',
         xlabel='Topic', figsize=(6, 5))
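
One thing to be aware of: LSI topic weights can be negative, so sorting by the raw weight may not always pick the most relevant topic for a document. A small variation (a sketch, not tested on this data) sorts by absolute weight instead:

# Pick the dominant topic by absolute weight, since LSI weights can be negative
dominant_topics = []
for i, row in enumerate(lsi_model[corpus]):
    row = sorted(row, key=lambda x: abs(x[1]), reverse=True)
    dominant_topics.append((i, row[0][0]))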