I'm trying to compute the TextRank scores for each similarity matrix. The summarize function is defined to produce the summary, and it is called for every list of sentences in result (a list of lists of sentences). However, an error is raised while ranking the sentences with the PageRank algorithm. I tried debugging by manually increasing the max_iter value passed to the PageRank function, but the error stays the same.
get_score function
It is called inside the summarize function, and the error originates here:
def get_score(sim_mat):
    import networkx as nx
    # Build an undirected weighted graph from the similarity matrix
    nx_graph = nx.from_numpy_array(sim_mat)
    # Rank the sentences with PageRank; max_iter raised from the default 100
    # to 500 while debugging, but the error persists
    score = nx.pagerank(nx_graph, max_iter=500)
    return score
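For reference, a quick sanity check of what get_score returns (a minimal sketch; the 3x3 values below are made up for illustration): nx.pagerank yields a dict mapping node index to score, which is why scores[i] works inside summarize.

import numpy as np

toy_sim_mat = np.array([[0.0, 0.5, 0.2],
                        [0.5, 0.0, 0.7],
                        [0.2, 0.7, 0.0]])
print(get_score(toy_sim_mat))  # dict like {0: ..., 1: ..., 2: ...}, one score per sentence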
Summarize function
Takes the raw text and returns the summary:
def summarize(text):
    sentences = sent_tokenize(text)

    # Clean each sentence with the custom preprocessing class
    t_clean_sentences = []
    for i in range(len(sentences)):
        obj = text_preprocessing(sentences[i])
        j = obj.text_cleaner()
        t_clean_sentences.append(j)

    # Drop sentences the classifier tags as questions
    clean_sentences = []
    for i in range(len(t_clean_sentences)):
        a = gb.predict(vectorizer.transform([t_clean_sentences[i]]))
        if a[0] != 'whQuestion' and a[0] != 'ynQuestion':
            clean_sentences.append(t_clean_sentences[i])

    # Remove stopwords
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    stop_words = set(stopwords.words('english'))
    filtered_sentences = []
    for i in range(len(clean_sentences)):
        word_tokens = word_tokenize(clean_sentences[i])
        filtered_sentence = [w for w in word_tokens if w not in stop_words]
        filtered_sentences.append(" ".join(filtered_sentence))

    # Sentence vectors: average of the 100-d word embeddings
    import numpy as np
    sentence_vectors = []
    for i in filtered_sentences:
        if len(i) != 0:
            v = sum([word_embeddings.get(w, np.zeros((100,))) for w in i.split()]) / (len(i.split()) + 0.001)
        else:
            v = np.zeros((100,))
        sentence_vectors.append(v)

    # Pairwise cosine similarity matrix
    from sklearn.metrics.pairwise import cosine_similarity
    sim_mat = np.zeros([len(clean_sentences), len(clean_sentences)])
    for i in range(len(clean_sentences)):
        for j in range(len(clean_sentences)):
            if i != j:
                sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1, 100),
                                                  sentence_vectors[j].reshape(1, 100))[0, 0]

    # PageRank scores
    scores = get_score(sim_mat)
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(clean_sentences)), reverse=True)

    # Generate the summary (here every ranked sentence is kept)
    summary = []
    for i in range(len(ranked_sentences)):
        summary.append(ranked_sentences[i][1].capitalize())
    return summary
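Note that summarize assumes the NLTK data used by sent_tokenize, word_tokenize, and stopwords.words('english') has already been downloaded; if not, a one-time setup is needed:

import nltk
nltk.download('punkt')      # sentence and word tokenizers
nltk.download('stopwords')  # English stopword list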
function call
The size of result is 100. When I tried it for the first 50 lists of sentences in result, it worked fine. I then made a system where the loop summarizes only 50 lists of sentences at a time and continues until it reaches the size of result, but it still shows the same error.
# text is the raw text from the TXT file
result = list(filter(lambda x: x != '', text.split(':')))
compiled = []
for r in result:
    compiled.append(summarize(r))
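The chunked variant mentioned above wasn't shown; a minimal sketch of what it could look like (the chunk size and loop structure are assumed):

compiled = []
chunk_size = 50
for start in range(0, len(result), chunk_size):
    # Summarize one batch of 50 lists of sentences at a time
    for r in result[start:start + chunk_size]:
        compiled.append(summarize(r))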
Error
---------------------------------------------------------------------------
PowerIterationFailedConvergence Traceback (most recent call last)
<ipython-input-22-a04a4d4d0dfb> in <module>()
1 compiled = []
2 for r in range(len(result)):
----> 3 compiled.append(summarize(result[r]))
3 frames
<ipython-input-21-c7462482feb4> in summarize(text)
45
46 #pagerank scores
---> 47 scores = get_score(sim_mat)
48 ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(clean_sentences)), reverse=True)
49 # Specify number of sentences to form the summary
<ipython-input-10-798a017cf041> in get_score(sim_mat)
2 import networkx as nx
3 nx_graph = nx.from_numpy_array(sim_mat)
----> 4 score = nx.pagerank(nx_graph)
5 return score
<decorator-gen-431> in pagerank(G, alpha, personalization, max_iter, tol, nstart, weight, dangling)
/usr/local/lib/python3.6/dist-packages/networkx/utils/decorators.py in _not_implemented_for(not_implement_for_func, *args, **kwargs)
80 raise nx.NetworkXNotImplemented(msg)
81 else:
---> 82 return not_implement_for_func(*args, **kwargs)
83 return _not_implemented_for
84
/usr/local/lib/python3.6/dist-packages/networkx/algorithms/link_analysis/pagerank_alg.py in pagerank(G, alpha, personalization, max_iter, tol, nstart, weight, dangling)
156 if err < N * tol:
157 return x
--> 158 raise nx.PowerIterationFailedConvergence(max_iter)
159
160
PowerIterationFailedConvergence: (PowerIterationFailedConvergence(...), 'power iteration failed to converge within 100 iterations')
I found the solution. I just used nx.pagerank_numpy(nx_graph) instead of nx.pagerank(nx_graph). This solved the problem, as the graph I was ranking was built directly from the similarity matrix with nx_graph = nx.from_numpy_array(sim_mat).
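For completeness, the fixed get_score looks like the sketch below. nx.pagerank_numpy computes the dominant eigenvector of the Google matrix directly with NumPy instead of running power iteration, so it cannot raise PowerIterationFailedConvergence. (One caveat: pagerank_numpy was deprecated in networkx 2.6 and removed in 3.0, where nx.pagerank itself uses a SciPy-based solver.)

def get_score(sim_mat):
    import networkx as nx
    nx_graph = nx.from_numpy_array(sim_mat)
    # Exact eigenvector computation - no iterative convergence involved
    score = nx.pagerank_numpy(nx_graph)
    return score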