I am trying to use pyLDAvis to visualize an LDA model created with gensim. Below is the code:
Libraries Imported
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel
import spacy
import pyLDAvis
import pyLDAvis.gensim
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
Lemmatization function:
def lemmatization(texts, allowed_postags=("NOUN", "ADJ", "VERB", "ADV")):
    """Lemmatize each text, keeping only tokens with an allowed part of speech.

    Args:
        texts: Iterable of raw text strings.
        allowed_postags: Container of POS tags compared against
            ``token.pos_``; only matching tokens are kept. The default is
            an immutable tuple so it cannot be mutated between calls.

    Returns:
        A list with one string per input text: the lemmas of the kept
        tokens joined by single spaces (an empty string when no token
        matches).
    """
    # The parser and NER pipeline components are disabled at load time.
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    texts_out = []
    for text in texts:
        lemmas = [
            token.lemma_ for token in nlp(text) if token.pos_ in allowed_postags
        ]
        texts_out.append(" ".join(lemmas))
    return texts_out
# Lemmatize the 'rationale' column, then preview the first 90 characters
# of the first lemmatized text.
lemmatized_texts = lemmatization(sentiment_df["rationale"])
print(lemmatized_texts[0][:90])
Giving correct output: work well partner more solo worker especially early morning
Dictionary creation:
def gen_words(texts):
    """Tokenize every text with gensim's ``simple_preprocess``.

    Args:
        texts: Iterable of text strings.

    Returns:
        A list holding, for each input text, the result of
        ``gensim.utils.simple_preprocess(text, deacc=True)``.
    """
    return [gensim.utils.simple_preprocess(doc, deacc=True) for doc in texts]
# Tokenize the lemmatized texts and drop documents that came out empty.
data_words = [tokens for tokens in gen_words(lemmatized_texts) if tokens]

# Build the gensim dictionary from the tokenized documents.
id2word = corpora.Dictionary(data_words)

# Bag-of-words corpus: one doc2bow result per document.
corpus_new = [id2word.doc2bow(tokens) for tokens in data_words]
print(corpus_new[2][:20])

# Look up the token stored under id 2.
word = id2word[2]
print(word)
Giving correct output: [(5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1)] like
LDA model:
# Train a 30-topic LDA model on the bag-of-words corpus.
lda_model = LdaModel(
    corpus=corpus_new,
    id2word=id2word,
    num_topics=30,
    passes=10,
    chunksize=100,
    update_every=1,
    alpha="auto",
    random_state=100,  # fixed seed
)
Visualization using PyLDAvis:
# Render pyLDAvis output inline in the notebook.
pyLDAvis.enable_notebook()
# Pass the bag-of-words corpus the model was trained on (`corpus_new`).
# The original call passed `corpus`, which is not defined anywhere in this
# snippet. gensim's corpus2csc runs `zip(*doc)` on each document and unpacks
# the result into (indices, counts), so every document must be a list of
# (token_id, count) pairs; anything else raises the "not enough values to
# unpack" ValueError.
vis = pyLDAvis.gensim.prepare(lda_model, corpus_new, id2word, mds="mmds", R=30)
vis
Error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-40-f184c148aa16> in <cell line: 2>()
1 pyLDAvis.enable_notebook()
----> 2 pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
2 frames
/usr/local/lib/python3.10/dist-packages/gensim/matutils.py in corpus2csc(corpus, num_terms, dtype, num_docs, num_nnz, printprogress)
150
151 # zip(*doc) transforms doc to (token_indices, token_counts]
--> 152 doc_indices, doc_data = zip(*doc) if doc else ([], [])
153 indices.extend(doc_indices)
154 data.extend(doc_data)
ValueError: not enough values to unpack (expected 2, got 1)
I am not able to pinpoint why this is happening; any help would be appreciated.
I expected to see the LDA visualization but got the above-mentioned error instead.