I am now using NMF to generate topics. My code is shown below. However, I do not know how to get the frequency of each topic. Can anyone help me? Thank you!
def fit_tfidf(documents):
    """Fit a TF-IDF vectorizer on ``documents`` and return the dense result.

    The vectorizer is configured from the module-level ``NGRAM_RANGE`` and
    ``MAX_FEATURES`` settings, lowercases the text and removes English stop
    words.

    Args:
        documents: Corpus to vectorize. Must expose a ``.values`` attribute
            (presumably a pandas Series of strings -- confirm with the caller).

    Returns:
        A tuple ``(tfidf_matrix, tfidf_reverse_lookup, tfidf_feature_names)``:
        the dense TF-IDF matrix, a dict mapping each feature name to its
        position in ``tfidf_feature_names``, and the feature names as a NumPy
        string array.
    """
    tfidf = TfidfVectorizer(
        input='content',
        stop_words='english',
        use_idf=True,
        ngram_range=NGRAM_RANGE,
        lowercase=True,
        max_features=MAX_FEATURES,
        min_df=1,
    )
    # NOTE: .toarray() densifies the sparse result; this can be large for big corpora.
    tfidf_matrix = tfidf.fit_transform(documents.values).toarray()

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer the replacement and fall back only on old releases.
    if hasattr(tfidf, 'get_feature_names_out'):
        feature_names = tfidf.get_feature_names_out()
    else:
        feature_names = tfidf.get_feature_names()
    # dtype=str keeps a unicode array regardless of which API supplied the names.
    tfidf_feature_names = np.array(feature_names, dtype=str)

    tfidf_reverse_lookup = {word: idx for idx, word in enumerate(tfidf_feature_names)}
    return tfidf_matrix, tfidf_reverse_lookup, tfidf_feature_names
def vectorization(documments):
    """Vectorize the corpus with the backend named by the module-level ``VECTORIZER``.

    Args:
        documments: Corpus to vectorize; forwarded unchanged to ``fit_tfidf``
            or ``fit_bow``. The misspelled parameter name is kept so that
            existing keyword callers continue to work.

    Returns:
        The ``(vec_matrix, vec_reverse_lookup, vec_feature_names)`` tuple
        produced by the selected fitting function.

    Raises:
        ValueError: If ``VECTORIZER`` is neither ``'tfidf'`` nor ``'bow'``.
    """
    # The body previously read an undefined name `documents` instead of the
    # parameter, so the argument passed in was never used.
    if VECTORIZER == 'tfidf':
        vec_matrix, vec_reverse_lookup, vec_feature_names = fit_tfidf(documments)
    elif VECTORIZER == 'bow':
        vec_matrix, vec_reverse_lookup, vec_feature_names = fit_bow(documments)
    else:
        # Fail with a clear message instead of an UnboundLocalError at the return.
        raise ValueError(
            "Unsupported VECTORIZER %r; expected 'tfidf' or 'bow'" % (VECTORIZER,)
        )
    return vec_matrix, vec_reverse_lookup, vec_feature_names
def nmf_model(vec_matrix, vec_reverse_lookup, vec_feature_names, NUM_TOPICS):
    """Fit an NMF model and list the highest-weighted terms of every topic.

    Args:
        vec_matrix: Document-term matrix the NMF model is fitted on.
        vec_reverse_lookup: Not used by this function; accepted so the
            signature matches the tuple returned by the vectorizers.
        vec_feature_names: Indexable collection of feature names, addressed by
            the indices of each component's weights.
        NUM_TOPICS: Number of NMF components (topics) to extract.

    Returns:
        A list with one entry per topic; each entry is a list of up to
        ``N_TOPIC_WORDS`` feature names ordered from highest to lowest weight.
    """
    model = NMF(n_components=NUM_TOPICS, random_state=3)
    model.fit(vec_matrix)

    def _top_terms(weights):
        # argsort is ascending, so reverse it to rank the heaviest terms first.
        ranked = np.argsort(weights)[::-1]
        return [vec_feature_names[j] for j in ranked[:N_TOPIC_WORDS]]

    return [_top_terms(component) for component in model.components_]
If you mean the frequency of each topic inside each document, then transform the vectorized documents with the fitted model, e.g. `H = nmf.transform(vec_matrix)`:
H is a matrix of shape (n_documents, n_topics). Each row is a document vector in the topic space. Each entry of that vector is the weight of the corresponding topic in that document, which can be read as the topic's importance.