I'm trying to train a Top2Vec model and keep hitting one of two problems. The first is not having enough documents, which I work around by concatenating the dataframe with itself several times. The second is a TypeError raised during training, and I can't find where the float is coming from in order to investigate further.
The data consists of two columns of string data.
import pandas as pd
from top2vec import Top2Vec

# Increase the number of documents by stacking the dataframe on itself
data = [df, df, df, df, df]
df_extra = pd.concat(data)
print(len(df_extra))

# Train the Top2Vec model
model = Top2Vec(df_extra.values, embedding_model='universal-sentence-encoder')
The model runs until it logs Finding dense areas of documents, then throws the error. The traceback suggests the problem occurs during clustering, when it reaches hdbscan:
TypeError Traceback (most recent call last)
Cell In[84], line 2
1 # Train the top2vec model
----> 2 model = Top2Vec(df_extra.values, embedding_model='universal-sentence-encoder') #, embedding_model='universal-sentence-encoder'
File ~/opt/anaconda3/envs/top2vec/lib/python3.10/site-packages/top2vec/Top2Vec.py:666, in Top2Vec.__init__(self, documents, min_count, topic_merge_delta, ngram_vocab, ngram_vocab_args, embedding_model, embedding_model_path, embedding_batch_size, split_documents, document_chunker, chunk_length, max_num_chunks, chunk_overlap_ratio, chunk_len_coverage_ratio, sentencizer, speed, use_corpus_file, document_ids, keep_documents, workers, tokenizer, use_embedding_model_tokenizer, umap_args, hdbscan_args, verbose)
663 else:
664 raise ValueError(f"{embedding_model} is an invalid embedding model.")
--> 666 self.compute_topics(umap_args=umap_args, hdbscan_args=hdbscan_args, topic_merge_delta=topic_merge_delta)
668 # initialize document indexing variables
669 self.document_index = None
File ~/opt/anaconda3/envs/top2vec/lib/python3.10/site-packages/top2vec/Top2Vec.py:1266, in Top2Vec.compute_topics(self, umap_args, hdbscan_args, topic_merge_delta)
1261 if hdbscan_args is None:
1262 hdbscan_args = {'min_cluster_size': 15,
1263 'metric': 'euclidean',
1264 'cluster_selection_method': 'eom'}
-> 1266 cluster = hdbscan.HDBSCAN(**hdbscan_args).fit(umap_model.embedding_)
1268 # calculate topic vectors from dense areas of documents
1269 logger.info('Finding topics')
File ~/opt/anaconda3/envs/top2vec/lib/python3.10/site-packages/hdbscan/hdbscan_.py:1205, in HDBSCAN.fit(self, X, y)
1195 kwargs.pop("prediction_data", None)
1196 kwargs.update(self._metric_kwargs)
1198 (
1199 self.labels_,
1200 self.probabilities_,
1201 self.cluster_persistence_,
1202 self._condensed_tree,
1203 self._single_linkage_tree,
1204 self._min_spanning_tree,
-> 1205 ) = hdbscan(clean_data, **kwargs)
1207 if self.metric != "precomputed" and not self._all_finite:
1208 # remap indices to align with original data in the case of non-finite entries.
1209 self._condensed_tree = remap_condensed_tree(
1210 self._condensed_tree, internal_to_raw, outliers
1211 )
File ~/opt/anaconda3/envs/top2vec/lib/python3.10/site-packages/hdbscan/hdbscan_.py:884, in hdbscan(X, min_cluster_size, min_samples, alpha, cluster_selection_epsilon, max_cluster_size, metric, p, leaf_size, algorithm, memory, approx_min_span_tree, gen_min_span_tree, core_dist_n_jobs, cluster_selection_method, allow_single_cluster, match_reference_implementation, **kwargs)
867 else:
868 (single_linkage_tree, result_min_span_tree) = memory.cache(
869 _hdbscan_boruvka_balltree
870 )(
(...)
880 **kwargs
881 )
883 return (
--> 884 _tree_to_labels(
885 X,
886 single_linkage_tree,
887 min_cluster_size,
888 cluster_selection_method,
889 allow_single_cluster,
890 match_reference_implementation,
891 cluster_selection_epsilon,
892 max_cluster_size,
893 )
894 + (result_min_span_tree,)
895 )
File ~/opt/anaconda3/envs/top2vec/lib/python3.10/site-packages/hdbscan/hdbscan_.py:80, in _tree_to_labels(X, single_linkage_tree, min_cluster_size, cluster_selection_method, allow_single_cluster, match_reference_implementation, cluster_selection_epsilon, max_cluster_size)
78 condensed_tree = condense_tree(single_linkage_tree, min_cluster_size)
79 stability_dict = compute_stability(condensed_tree)
---> 80 labels, probabilities, stabilities = get_clusters(
81 condensed_tree,
82 stability_dict,
83 cluster_selection_method,
84 allow_single_cluster,
85 match_reference_implementation,
86 cluster_selection_epsilon,
87 max_cluster_size,
88 )
90 return (labels, probabilities, stabilities, condensed_tree, single_linkage_tree)
File hdbscan/_hdbscan_tree.pyx:659, in hdbscan._hdbscan_tree.get_clusters()
File hdbscan/_hdbscan_tree.pyx:733, in hdbscan._hdbscan_tree.get_clusters()
TypeError: 'numpy.float64' object cannot be interpreted as an integer
I've read the other questions relating to this TypeError, but none are specific to Top2Vec, so it isn't clear where in this algorithm a float is emerging where an integer is expected. Any guidance appreciated. Could duplicating the documents still leave too few topics and thereby produce a float somewhere?
The problem is apparently not with Top2Vec itself but with an incompatible combination of numpy, hdbscan, and scikit-learn versions.
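This TypeError is a known symptom of running an older hdbscan release against numpy 1.24+, which tightened float-to-integer conversion rules. To see what is installed, a quick check (a sketch using importlib.metadata, available from Python 3.8):

from importlib.metadata import version

# Print the installed version of each package involved
for pkg in ("numpy", "scikit-learn", "hdbscan", "top2vec"):
    print(pkg, version(pkg))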
To resolve the issue, adjust the versions so the packages are mutually compatible:
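As a suggestion rather than a definitive pin list (adjust to your environment), either keep numpy below 1.24, where the break was introduced, or upgrade hdbscan to 0.8.29 or later, which reportedly restores numpy 1.24 compatibility:

pip install "numpy<1.24"

or, alternatively:

pip install --upgrade "hdbscan>=0.8.29"

After reinstalling, restart the kernel so the new versions are picked up.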