Hi,
I'm running BERTopic on 10k sentences and it works fine, but when I run it on 15k sentences I get the error below (I need to run it on at least 50k sentences):
TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.
The exit codes of the workers are {EXIT(1)}
---------------------------------------------------------------------------
TerminatedWorkerError Traceback (most recent call last)
Cell In[38], line 13
8 topic_model = BERTopic(
9 embedding_model="thenlper/gte-small",
10 min_topic_size=50,
11 representation_model=KeyBERTInspired()
12 )
---> 13 topics, _ = topic_model.fit_transform(docs)
File /mnt/xarfuse/uid-564347/e4f2f620-seed-nspid4026531836_cgpid15010019-ns-4026531841/bertopic/_bertopic.py:411, in BERTopic.fit_transform(self, documents, embeddings, images, y)
408 umap_embeddings = self._reduce_dimensionality(embeddings, y)
410 # Cluster reduced embeddings
--> 411 documents, probabilities = self._cluster_embeddings(umap_embeddings, documents, y=y)
413 # Sort and Map Topic IDs by their frequency
414 if not self.nr_topics:
File /mnt/xarfuse/uid-564347/e4f2f620-seed-nspid4026531836_cgpid15010019-ns-4026531841/bertopic/_bertopic.py:3390, in BERTopic._cluster_embeddings(self, umap_embeddings, documents, partial_fit, y)
3388 else:
3389 try:
-> 3390 self.hdbscan_model.fit(umap_embeddings, y=y)
3391 except TypeError:
3392 self.hdbscan_model.fit(umap_embeddings)
File /mnt/xarfuse/uid-564347/e4f2f620-seed-nspid4026531836_cgpid15010019-ns-4026531841/hdbscan/hdbscan_.py:1190, in HDBSCAN.fit(self, X, y)
1180 kwargs.pop("prediction_data", None)
1181 kwargs.update(self._metric_kwargs)
1183 (
1184 self.labels_,
1185 self.probabilities_,
1186 self.cluster_persistence_,
1187 self._condensed_tree,
1188 self._single_linkage_tree,
1189 self._min_spanning_tree,
-> 1190 ) = hdbscan(clean_data, **kwargs)
1192 if self.metric != "precomputed" and not self._all_finite:
1193 # remap indices to align with original data in the case of non-finite entries.
1194 self._condensed_tree = remap_condensed_tree(
1195 self._condensed_tree, internal_to_raw, outliers
1196 )
File /mnt/xarfuse/uid-564347/e4f2f620-seed-nspid4026531836_cgpid15010019-ns-4026531841/hdbscan/hdbscan_.py:822, in hdbscan(X, min_cluster_size, min_samples, alpha, cluster_selection_epsilon, max_cluster_size, metric, p, leaf_size, algorithm, memory, approx_min_span_tree, gen_min_span_tree, core_dist_n_jobs, cluster_selection_method, allow_single_cluster, match_reference_implementation, **kwargs)
809 (single_linkage_tree, result_min_span_tree) = memory.cache(
810 _hdbscan_prims_kdtree
811 )(
(...)
819 **kwargs
820 )
821 else:
--> 822 (single_linkage_tree, result_min_span_tree) = memory.cache(
823 _hdbscan_boruvka_kdtree
824 )(
825 X,
826 min_samples,
827 alpha,
828 metric,
829 p,
830 leaf_size,
831 approx_min_span_tree,
832 gen_min_span_tree,
833 core_dist_n_jobs,
834 **kwargs
835 )
836 else: # Metric is a valid BallTree metric
837 # TO DO: Need heuristic to decide when to go to boruvka;
838 # still debugging for now
839 if X.shape[1] > 60:
File /mnt/xarfuse/uid-564347/e4f2f620-seed-nspid4026531836_cgpid15010019-ns-4026531841/joblib/memory.py:353, in NotMemorizedFunc.__call__(self, *args, **kwargs)
352 def __call__(self, *args, **kwargs):
--> 353 return self.func(*args, **kwargs)
File /mnt/xarfuse/uid-564347/e4f2f620-seed-nspid4026531836_cgpid15010019-ns-4026531841/hdbscan/hdbscan_.py:325, in _hdbscan_boruvka_kdtree(X, min_samples, alpha, metric, p, leaf_size, approx_min_span_tree, gen_min_span_tree, core_dist_n_jobs, **kwargs)
322 X = X.astype(np.float64)
324 tree = KDTree(X, metric=metric, leaf_size=leaf_size, **kwargs)
--> 325 alg = KDTreeBoruvkaAlgorithm(
326 tree,
327 min_samples,
328 metric=metric,
329 leaf_size=leaf_size // 3,
330 approx_min_span_tree=approx_min_span_tree,
331 n_jobs=core_dist_n_jobs,
332 **kwargs
333 )
334 min_spanning_tree = alg.spanning_tree()
335 # Sort edges of the min_spanning_tree by weight
File /mnt/xarfuse/uid-564347/e4f2f620-seed-nspid4026531836_cgpid15010019-ns-4026531841/hdbscan/_hdbscan_boruvka.pyx:392, in hdbscan._hdbscan_boruvka.KDTreeBoruvkaAlgorithm.__init__()
File /mnt/xarfuse/uid-564347/e4f2f620-seed-nspid4026531836_cgpid15010019-ns-4026531841/hdbscan/_hdbscan_boruvka.pyx:426, in hdbscan._hdbscan_boruvka.KDTreeBoruvkaAlgorithm._compute_bounds()
File /mnt/xarfuse/uid-564347/e4f2f620-seed-nspid4026531836_cgpid15010019-ns-4026531841/joblib/parallel.py:1952, in Parallel.__call__(self, iterable)
1946 # The first item from the output is blank, but it makes the interpreter
1947 # progress until it enters the Try/Except block of the generator and
1948 # reach the first `yield` statement. This starts the aynchronous
1949 # dispatch of the tasks to the workers.
1950 next(output)
-> 1952 return output if self.return_generator else list(output)
File /mnt/xarfuse/uid-564347/e4f2f620-seed-nspid4026531836_cgpid15010019-ns-4026531841/joblib/parallel.py:1595, in Parallel._get_outputs(self, iterator, pre_dispatch)
1592 yield
1594 with self._backend.retrieval_context():
-> 1595 yield from self._retrieve()
1597 except GeneratorExit:
1598 # The generator has been garbage collected before being fully
1599 # consumed. This aborts the remaining tasks if possible and warn
1600 # the user if necessary.
1601 self._exception = True
File /mnt/xarfuse/uid-564347/e4f2f620-seed-nspid4026531836_cgpid15010019-ns-4026531841/joblib/parallel.py:1699, in Parallel._retrieve(self)
1692 while self._wait_retrieval():
1693
1694 # If the callback thread of a worker has signaled that its task
1695 # triggered an exception, or if the retrieval loop has raised an
1696 # exception (e.g. `GeneratorExit`), exit the loop and surface the
1697 # worker traceback.
1698 if self._aborting:
-> 1699 self._raise_error_fast()
1700 break
1702 # If the next job is not ready for retrieval yet, we just wait for
1703 # async callbacks to progress.
File /mnt/xarfuse/uid-564347/e4f2f620-seed-nspid4026531836_cgpid15010019-ns-4026531841/joblib/parallel.py:1734, in Parallel._raise_error_fast(self)
1730 # If this error job exists, immediatly raise the error by
1731 # calling get_result. This job might not exists if abort has been
1732 # called directly or if the generator is gc'ed.
1733 if error_job is not None:
-> 1734 error_job.get_result(self.timeout)
File /mnt/xarfuse/uid-564347/e4f2f620-seed-nspid4026531836_cgpid15010019-ns-4026531841/joblib/parallel.py:736, in BatchCompletionCallBack.get_result(self, timeout)
730 backend = self.parallel._backend
732 if backend.supports_retrieve_callback:
733 # We assume that the result has already been retrieved by the
734 # callback thread, and is stored internally. It's just waiting to
735 # be returned.
--> 736 return self._return_or_raise()
738 # For other backends, the main thread needs to run the retrieval step.
739 try:
File /mnt/xarfuse/uid-564347/e4f2f620-seed-nspid4026531836_cgpid15010019-ns-4026531841/joblib/parallel.py:754, in BatchCompletionCallBack._return_or_raise(self)
752 try:
753 if self.status == TASK_ERROR:
--> 754 raise self._result
755 return self._result
756 finally:
TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.
The exit codes of the workers are {EXIT(1)}
Is there a way to add more documents to a topic model that has already been fitted? Thanks!
3 Answers
t1rydlwq1#
It should work with those documents, since 10K is generally not that many. This seems related to HDBSCAN, but I haven't run into this issue before. How much RAM do you have available? Also, starting from a clean environment and reinstalling all packages might help.
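A minimal sketch of how a custom HDBSCAN instance can be passed to BERTopic, reusing the settings visible in the traceback above (everything else is an assumption); setting core_dist_n_jobs=1 keeps the core-distance computation in a single process, which may sidestep the kind of joblib worker crash shown here:

```python
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from hdbscan import HDBSCAN

# Custom HDBSCAN: single-process core-distance computation to avoid
# spawning the joblib workers that were terminated in the traceback.
hdbscan_model = HDBSCAN(
    min_cluster_size=50,          # mirrors min_topic_size=50 above
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True,
    core_dist_n_jobs=1,           # keep core distances in one worker
)

topic_model = BERTopic(
    embedding_model="thenlper/gte-small",
    min_topic_size=50,
    hdbscan_model=hdbscan_model,
    representation_model=KeyBERTInspired(),
)
```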
vvppvyoh2#
Thanks for your reply! I did a clean install, but the error still shows up. Available RAM: 217231130624
I'm not using HDBSCAN directly, just running the following code:
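For reference, a minimal reconstruction of that code from Cell In[38] in the traceback above, with docs assumed to be the list of sentences:

```python
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired

# docs is assumed to be the list of ~15k sentences (plain strings)
topic_model = BERTopic(
    embedding_model="thenlper/gte-small",
    min_topic_size=50,
    representation_model=KeyBERTInspired(),
)
topics, _ = topic_model.fit_transform(docs)
```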
Curious whether there's a way to keep adding documents to a fitted topic model?
ss2ws0br3#
> Thanks for your reply! I did a clean install, but the error still shows up. Available RAM: 217231130624
How many GB is that number? Also, starting from a completely clean environment and installing the latest version of BERTopic might help prevent any issues caused by previously installed packages.
> Curious whether there's a way to keep adding documents to a fitted topic model?
Yes, you can use online topic modeling or the .merge_models functionality.
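A rough sketch of both options against the public BERTopic API (.merge_models requires a recent BERTopic release; docs_a, docs_b, and doc_chunks are hypothetical splits of the full corpus used only for illustration):

```python
from bertopic import BERTopic
from bertopic.vectorizers import OnlineCountVectorizer
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import IncrementalPCA

# Option 1: fit separate models on chunks of the corpus, then merge them.
# docs_a and docs_b are hypothetical slices of the full document list.
model_a = BERTopic(min_topic_size=50).fit(docs_a)
model_b = BERTopic(min_topic_size=50).fit(docs_b)
merged_model = BERTopic.merge_models([model_a, model_b])

# Option 2: online topic modeling with .partial_fit, swapping in components
# that support incremental updates instead of the default UMAP + HDBSCAN.
online_model = BERTopic(
    umap_model=IncrementalPCA(n_components=5),
    hdbscan_model=MiniBatchKMeans(n_clusters=50, random_state=0),
    vectorizer_model=OnlineCountVectorizer(stop_words="english", decay=0.01),
)
for chunk in doc_chunks:  # doc_chunks: hypothetical iterable of document batches
    online_model.partial_fit(chunk)
```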