I am trying to visualize the result of a topic model (built using a gensim LDA model) with pyLDAvis, following this tutorial, but I keep getting this well-known error:
index 11588 is out of bounds for axis 1 with size 11588
I searched all over Stack Overflow and GitHub and found that many people have had this problem before, but with older versions. I am currently using pyLDAvis version 3.2.2 (I also tried the latest one, but in vain).
I am new to Python and machine learning, so I couldn't debug the problem. Any help or guidance would be much appreciated.
This is my Jupyter notebook code:
# Tokenize, remove stopwords, non-alphabetic words, lowercase (all done by
# `preprocess`, which is defined elsewhere in the notebook).
filename = 'booksummaries.txt'
tokenized_docs_summaries = []
# Open the file in a `with` block so the handle is closed deterministically,
# even if `preprocess` raises part-way through the file.
with open(filename, encoding="utf-8") as summaries_file:
    for line in summaries_file:
        temp = line.split("\t")
        # Field index 6 is the 7th tab-separated column of each record
        # (presumably the summary text -- confirm against the data file).
        tokenized_docs_summaries.append(preprocess(temp[6]))
# Vocabulary built from every tokenized document.
dictionary = Dictionary(tokenized_docs_summaries)
# Alternative setup for a local Mallet install; update the path to match the
# Mallet directory on your own system:
# os.environ.update({'MALLET_HOME':r'C:/mallet-2.0.8/'})
# mallet_path = r'C:/mallet-2.0.8/bin/mallet'
os.environ.update({'MALLET_HOME': '/content/mallet-2.0.8'})
mallet_path = '/content/mallet-2.0.8/bin/mallet' # you should NOT need to change this

# Train a 5-topic Mallet LDA model. `corpus` and `id2word` are defined
# elsewhere in the notebook.
ldamallet = gensim.models.wrappers.LdaMallet(
    mallet_path,
    corpus=corpus,
    num_topics=5,
    id2word=id2word,
)

# Show Topics
pprint(ldamallet.show_topics(formatted=False))

# Compute Coherence Score (c_v) of the model against the tokenized texts.
coherence_model_ldamallet = CoherenceModel(
    model=ldamallet,
    texts=tokenized_docs_summaries,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Train one Mallet LDA model per candidate topic count and compute its c_v coherence.

    The candidate topic counts are ``range(start, limit, step)``, so ``limit``
    itself is never used as a topic count.

    NOTE: the models are built with the module-level ``mallet_path`` and
    ``id2word``; the ``dictionary`` argument is only passed to ``CoherenceModel``.

    Parameters:
    ----------
    dictionary : Gensim dictionary handed to CoherenceModel
    corpus : Gensim corpus each model is trained on
    texts : List of input texts handed to CoherenceModel
    limit : Exclusive upper bound on the number of topics
    start : First number of topics to try
    step : Increment between successive numbers of topics

    Returns:
    -------
    model_list : List of LDA topic models, one per topic count tried
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    trained_models = []
    scores = []
    topic_counts = range(start, limit, step)
    for topic_count in topic_counts:
        candidate = gensim.models.wrappers.LdaMallet(
            mallet_path,
            corpus=corpus,
            num_topics=topic_count,
            id2word=id2word,
        )
        scorer = CoherenceModel(
            model=candidate,
            texts=texts,
            dictionary=dictionary,
            coherence='c_v',
        )
        trained_models.append(candidate)
        scores.append(scorer.get_coherence())
    return trained_models, scores
# Train and score one Mallet model for each topic count in range(2, 40, 4).
model_list, coherence_values = compute_coherence_values(
    dictionary=dictionary,
    corpus=corpus,
    texts=tokenized_docs_summaries,
    limit=40,
    start=2,
    step=4,
)
# `best_result_index` is not assigned in this snippet; it must already exist
# and be a valid position in `model_list`.
optimal_model = model_list[best_result_index]
# Fetch the topics of the selected model (unformatted).
model_topics = optimal_model.show_topics(formatted=False)
def convertldaGenToldaMallet(mallet_model):
    """
    Build a gensim ``LdaModel`` from an already-trained Mallet model.

    Despite the name, the conversion goes from the Mallet wrapper to a gensim
    ``LdaModel``: a new ``LdaModel`` is created with the Mallet model's
    ``id2word``, ``num_topics`` and ``alpha``, its state's ``sstats`` array is
    overwritten in place with ``mallet_model.wordtopics``, and the state is
    then synchronised.

    Parameters:
    ----------
    mallet_model : Trained model exposing id2word, num_topics, alpha and wordtopics

    Returns:
    -------
    The newly created LdaModel
    """
    converted = LdaModel(
        id2word=mallet_model.id2word,
        num_topics=mallet_model.num_topics,
        alpha=mallet_model.alpha,
        eta=0,
    )
    # Slice assignment writes into the existing sstats array instead of rebinding it.
    converted.state.sstats[...] = mallet_model.wordtopics
    converted.sync_state()
    return converted
# Convert the selected Mallet model into a gensim LdaModel for pyLDAvis.
optimal_model = convertldaGenToldaMallet(optimal_model)
#Creating Topic Distance Visualization
pyLDAvis.enable_notebook()
# Pass the dictionary the model was actually trained with. The models in this
# notebook are built with `id2word`, while `dictionary` is created separately
# from the tokenized summaries; if the two vocabularies differ, pyLDAvis can
# index past the model's topic-term matrix ("index N is out of bounds for
# axis 1 with size N"). Using the model's own `id2word` keeps them aligned.
# NOTE(review): assumes `id2word` is a gensim Dictionary (has `token2id`) and
# that `corpus` was built from it -- confirm where they are defined.
p = pyLDAvis.gensim.prepare(optimal_model, corpus, optimal_model.id2word)
p