I am trying to create a graph that shows the frequency of the topics for my LSI model. I was able to do this for my LDA model using the same code. When I try to visualise my LSI topics, I get the error messages shown below.
The code to create the models is below:
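For reference, the snippets below rely on the following imports and stop-word list, which I left out above (stop_words is NLTK's English list in my setup):

import pandas as pd
import spacy
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from nltk.corpus import stopwords

stop_words = stopwords.words('english')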
# Import CSV
df_train = pd.read_csv("Fold_2.csv", engine='python',encoding='latin-1')
# Convert to list
data = df_train['Post'].values.tolist()
# Change sentences to words
def sent_to_words(sentences):
    for sentence in sentences:
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)  # deacc=True removes punctuation
data_words = list(sent_to_words(data))
# Create bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=10) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=10)
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)
# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)
# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load('en', disable=['parser', 'ner'])
# Do lemmatization, keeping only nouns
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN'])
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)
# Create Corpus
texts = data_lemmatized
corpus = [id2word.doc2bow(text) for text in texts]
# Create LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=20,
                                            random_state=10,
                                            update_every=1,
                                            chunksize=100,
                                            passes=5,
                                            alpha='auto',
                                            per_word_topics=True)
# Print the keywords in the topics
doc_lda = lda_model[corpus]
x=lda_model.show_topics(num_topics=20, num_words=5,formatted=False)
topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]
print("LDA Model")
# Print The words
name = 0
words_together_list = []
for topic, words in topics_words:
    words_together = " ".join(words)
    words_together_list.append(words_together)
    name = name + 1
    print("The key words of Topic ", topic, " were: ", words_together)
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print("Coherence: ", coherence_lda)
lsi_model = gensim.models.lsimodel.LsiModel(
    corpus=corpus, id2word=id2word, num_topics=20, chunksize=100
)
print("")
print("LSI Model")
# Print the Keywords in the topics
doc_lsi = lsi_model[corpus]
x=lsi_model.show_topics(num_topics=20, num_words=5,formatted=False)
topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]
# Print The words
name = 0
words_together_list = []
for topic, words in topics_words:
    words_together = " ".join(words)
    words_together_list.append(words_together)
    name = name + 1
    print("The key words of Topic ", topic, " were: ", words_together)
coherence_model_lsi = CoherenceModel(model=lsi_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lsi = coherence_model_lsi.get_coherence()
print("Coherence: ", coherence_lsi)
The code for the visualisation of the LSI topics is below. The same Python code worked for the LDA model when the model variable referenced lda_model instead of lsi_model.
# Create a function to calculate topics per post
def topics_per_post(model, corpus, start=0, end=1):
    corpus_selected = corpus[start:end]
    dominant_topics = []
    topic_percentages = []
    for i, corp in enumerate(corpus_selected):
        topic_percs, wordid_topics, wordid_phivalues = model[corp]
        dominant_topic = sorted(topic_percs, key=lambda x: x[1], reverse=True)[0][0]
        dominant_topics.append((i, dominant_topic))
        topic_percentages.append(topic_percs)
    return (dominant_topics, topic_percentages)

dominant_topics, topic_percentages = topics_per_post(model=lsi_model, corpus=corpus, end=-1)
# create bar graph of topic frequency
df = pd.DataFrame(dominant_topics, columns=['Document_Id', 'Dominant_Topic'])
dominant_topic_in_each_doc = df.groupby('Dominant_Topic').size()
df_dominant_topic_in_each_doc = dominant_topic_in_each_doc.to_frame(name='count').reset_index()
df_dominant_topic_in_each_doc['count'].plot(kind='bar', title='Topic Frequency', ylabel='Frequency',
                                            xlabel='Topic', figsize=(6, 5))
This is the error message produced:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-42-96b45968c3a6> in <module>()
----> 1 dominant_topics, topic_percentages = topics_per_post(model=lsi_model, corpus=corpus, end=-1)
2
3 # create bar graph of topic frequency
4 df = pd.DataFrame(dominant_topics, columns=['Document_Id', 'Dominant_Topic'])
5 dominant_topic_in_each_doc = df.groupby('Dominant_Topic').size()
<ipython-input-26-541251ac2e71> in topics_per_post(model, corpus, start, end)
5 topic_percentages = []
6 for i, corp in enumerate(corpus_selected):
----> 7 topic_percs, wordid_topics, wordid_phivalues = model[corp]
8 dominant_topic = sorted(topic_percs, key = lambda x: x[1], reverse=True)[0][0]
9 dominant_topics.append((i, dominant_topic))
ValueError: too many values to unpack (expected 3)
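From the traceback, the unpacking fails because model[corp] only returns the three-part (topic distribution, per-word topics, per-word phi values) result for an LDA model built with per_word_topics=True; for LSI it returns a single list of (topic_id, weight) pairs. A minimal sketch of how the function could be adapted for LSI (my own workaround rather than anything from gensim's API; it sorts on the absolute weight because LSI weights can be negative):

def topics_per_post_lsi(model, corpus, start=0, end=1):
    corpus_selected = corpus[start:end]
    dominant_topics = []
    topic_percentages = []
    for i, corp in enumerate(corpus_selected):
        topic_weights = model[corp]  # a flat list like [(0, 0.42), (1, -0.13), ...]
        if not topic_weights:        # guard against empty documents
            continue
        # Rank topics by absolute weight, since LSI weights can be negative
        dominant_topic = sorted(topic_weights, key=lambda x: abs(x[1]), reverse=True)[0][0]
        dominant_topics.append((i, dominant_topic))
        topic_percentages.append(topic_weights)
    return (dominant_topics, topic_percentages)

With that change, the same pandas groupby and bar plot above should work unchanged.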
I also tried pyLDAvis; however, this also produced an error.
# Import pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
# Visualise the topics for LSI
lsi_viz = gensimvis.prepare(lsi_model, corpus, id2word)
lsi_viz
This produced the following error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-44-d9de743e0c86> in <module>()
5
6 # Visualise the topics for LSI
----> 7 lsi_viz = gensimvis.prepare(lsi_model, corpus, id2word)
8 lsi_viz
1 frames
/usr/local/lib/python3.7/dist-packages/pyLDAvis/gensim_models.py in _extract_data(topic_model, corpus, dictionary, doc_topic_dists)
47 gamma = topic_model.inference(corpus)
48 else:
---> 49 gamma, _ = topic_model.inference(corpus)
50 doc_topic_dists = gamma / gamma.sum(axis=1)[:, None]
51 else:
AttributeError: 'LsiModel' object has no attribute 'inference'
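This error looks more fundamental: pyLDAvis.gensim_models expects a probabilistic topic model that exposes an inference() method, which LsiModel does not have (its topic weights are not probability distributions), so an LSI model cannot be passed to gensimvis.prepare directly. If all that is needed is the topic-frequency bar chart, one workaround is to build the document-topic matrix by hand. A sketch using gensim's matutils.corpus2dense, under the assumption that the absolute LSI weight is an acceptable proxy for topic strength:

import numpy as np
from gensim import matutils

# Dense (num_topics x num_docs) matrix of per-document LSI weights
doc_topic = matutils.corpus2dense(lsi_model[corpus], num_terms=lsi_model.num_topics)

# Dominant topic per document, ranked by absolute weight
dominant = np.abs(doc_topic).argmax(axis=0)

# Frequency of each dominant topic across all documents
topic_freq = pd.Series(dominant).value_counts().sort_index()
topic_freq.plot(kind='bar', title='Topic Frequency', xlabel='Topic',
                ylabel='Frequency', figsize=(6, 5))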
I have done some research and cannot seem to find examples of calculating the frequency of topics across all documents for LSI using Gensim. I have also searched for these errors on Stack Overflow and cannot find a solution.
Found the answer :)