BERTopic 工作进程意外终止错误(TerminatedWorkerError):由执行器管理的工作进程意外终止。这可能是由于调用函数时发生段错误,或内存使用过多导致操作系统杀死了该工作进程。工作进程的退出代码为 {EXIT(1)}

ymdaylpp  于 5个月前  发布在  其他
关注(0)|答案(3)|浏览(47)

你好,
我正在使用BERTopic处理10k个句子,运行正常;但当我在15k个句子上运行时,就会出现以下错误(而我至少需要处理50k个句子):

TerminatedWorkerError:A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {EXIT(1)}
Open Traceback
---------------------------------------------------------------------------
TerminatedWorkerError                     Traceback (most recent call last)
Cell In[38], line 13
      8     topic_model = BERTopic(
      9         embedding_model="thenlper/gte-small", 
     10         min_topic_size=50,
     11         representation_model=KeyBERTInspired()
     12     )
---> 13     topics, _ = topic_model.fit_transform(docs)
File /mnt/xarfuse/uid-564347/e4f2f620-seed-nspid4026531836_cgpid15010019-ns-4026531841/bertopic/_bertopic.py:411, in BERTopic.fit_transform(self, documents, embeddings, images, y)
    408 umap_embeddings = self._reduce_dimensionality(embeddings, y)
    410 # Cluster reduced embeddings
--> 411 documents, probabilities = self._cluster_embeddings(umap_embeddings, documents, y=y)
    413 # Sort and Map Topic IDs by their frequency
    414 if not self.nr_topics:
File /mnt/xarfuse/uid-564347/e4f2f620-seed-nspid4026531836_cgpid15010019-ns-4026531841/bertopic/_bertopic.py:3390, in BERTopic._cluster_embeddings(self, umap_embeddings, documents, partial_fit, y)
   3388 else:
   3389     try:
-> 3390         self.hdbscan_model.fit(umap_embeddings, y=y)
   3391     except TypeError:
   3392         self.hdbscan_model.fit(umap_embeddings)
File /mnt/xarfuse/uid-564347/e4f2f620-seed-nspid4026531836_cgpid15010019-ns-4026531841/hdbscan/hdbscan_.py:1190, in HDBSCAN.fit(self, X, y)
   1180 kwargs.pop("prediction_data", None)
   1181 kwargs.update(self._metric_kwargs)
   1183 (
   1184     self.labels_,
   1185     self.probabilities_,
   1186     self.cluster_persistence_,
   1187     self._condensed_tree,
   1188     self._single_linkage_tree,
   1189     self._min_spanning_tree,
-> 1190 ) = hdbscan(clean_data, **kwargs)
   1192 if self.metric != "precomputed" and not self._all_finite:
   1193     # remap indices to align with original data in the case of non-finite entries.
   1194     self._condensed_tree = remap_condensed_tree(
   1195         self._condensed_tree, internal_to_raw, outliers
   1196     )
File /mnt/xarfuse/uid-564347/e4f2f620-seed-nspid4026531836_cgpid15010019-ns-4026531841/hdbscan/hdbscan_.py:822, in hdbscan(X, min_cluster_size, min_samples, alpha, cluster_selection_epsilon, max_cluster_size, metric, p, leaf_size, algorithm, memory, approx_min_span_tree, gen_min_span_tree, core_dist_n_jobs, cluster_selection_method, allow_single_cluster, match_reference_implementation, **kwargs)
    809         (single_linkage_tree, result_min_span_tree) = memory.cache(
    810             _hdbscan_prims_kdtree
    811         )(
   (...)
    819             **kwargs
    820         )
    821     else:
--> 822         (single_linkage_tree, result_min_span_tree) = memory.cache(
    823             _hdbscan_boruvka_kdtree
    824         )(
    825             X,
    826             min_samples,
    827             alpha,
    828             metric,
    829             p,
    830             leaf_size,
    831             approx_min_span_tree,
    832             gen_min_span_tree,
    833             core_dist_n_jobs,
    834             **kwargs
    835         )
    836 else:  # Metric is a valid BallTree metric
    837     # TO DO: Need heuristic to decide when to go to boruvka;
    838     # still debugging for now
    839     if X.shape[1] > 60:
File /mnt/xarfuse/uid-564347/e4f2f620-seed-nspid4026531836_cgpid15010019-ns-4026531841/joblib/memory.py:353, in NotMemorizedFunc.__call__(self, *args, **kwargs)
    352 def __call__(self, *args, **kwargs):
--> 353     return self.func(*args, **kwargs)
File /mnt/xarfuse/uid-564347/e4f2f620-seed-nspid4026531836_cgpid15010019-ns-4026531841/hdbscan/hdbscan_.py:325, in _hdbscan_boruvka_kdtree(X, min_samples, alpha, metric, p, leaf_size, approx_min_span_tree, gen_min_span_tree, core_dist_n_jobs, **kwargs)
    322     X = X.astype(np.float64)
    324 tree = KDTree(X, metric=metric, leaf_size=leaf_size, **kwargs)
--> 325 alg = KDTreeBoruvkaAlgorithm(
    326     tree,
    327     min_samples,
    328     metric=metric,
    329     leaf_size=leaf_size // 3,
    330     approx_min_span_tree=approx_min_span_tree,
    331     n_jobs=core_dist_n_jobs,
    332     **kwargs
    333 )
    334 min_spanning_tree = alg.spanning_tree()
    335 # Sort edges of the min_spanning_tree by weight
File /mnt/xarfuse/uid-564347/e4f2f620-seed-nspid4026531836_cgpid15010019-ns-4026531841/hdbscan/_hdbscan_boruvka.pyx:392, in hdbscan._hdbscan_boruvka.KDTreeBoruvkaAlgorithm.__init__()
File /mnt/xarfuse/uid-564347/e4f2f620-seed-nspid4026531836_cgpid15010019-ns-4026531841/hdbscan/_hdbscan_boruvka.pyx:426, in hdbscan._hdbscan_boruvka.KDTreeBoruvkaAlgorithm._compute_bounds()
File /mnt/xarfuse/uid-564347/e4f2f620-seed-nspid4026531836_cgpid15010019-ns-4026531841/joblib/parallel.py:1952, in Parallel.__call__(self, iterable)
   1946 # The first item from the output is blank, but it makes the interpreter
   1947 # progress until it enters the Try/Except block of the generator and
   1948 # reach the first `yield` statement. This starts the aynchronous
   1949 # dispatch of the tasks to the workers.
   1950 next(output)
-> 1952 return output if self.return_generator else list(output)
File /mnt/xarfuse/uid-564347/e4f2f620-seed-nspid4026531836_cgpid15010019-ns-4026531841/joblib/parallel.py:1595, in Parallel._get_outputs(self, iterator, pre_dispatch)
   1592     yield
   1594     with self._backend.retrieval_context():
-> 1595         yield from self._retrieve()
   1597 except GeneratorExit:
   1598     # The generator has been garbage collected before being fully
   1599     # consumed. This aborts the remaining tasks if possible and warn
   1600     # the user if necessary.
   1601     self._exception = True
File /mnt/xarfuse/uid-564347/e4f2f620-seed-nspid4026531836_cgpid15010019-ns-4026531841/joblib/parallel.py:1699, in Parallel._retrieve(self)
   1692 while self._wait_retrieval():
   1693 
   1694     # If the callback thread of a worker has signaled that its task
   1695     # triggered an exception, or if the retrieval loop has raised an
   1696     # exception (e.g. `GeneratorExit`), exit the loop and surface the
   1697     # worker traceback.
   1698     if self._aborting:
-> 1699         self._raise_error_fast()
   1700         break
   1702     # If the next job is not ready for retrieval yet, we just wait for
   1703     # async callbacks to progress.
File /mnt/xarfuse/uid-564347/e4f2f620-seed-nspid4026531836_cgpid15010019-ns-4026531841/joblib/parallel.py:1734, in Parallel._raise_error_fast(self)
   1730 # If this error job exists, immediatly raise the error by
   1731 # calling get_result. This job might not exists if abort has been
   1732 # called directly or if the generator is gc'ed.
   1733 if error_job is not None:
-> 1734     error_job.get_result(self.timeout)
File /mnt/xarfuse/uid-564347/e4f2f620-seed-nspid4026531836_cgpid15010019-ns-4026531841/joblib/parallel.py:736, in BatchCompletionCallBack.get_result(self, timeout)
    730 backend = self.parallel._backend
    732 if backend.supports_retrieve_callback:
    733     # We assume that the result has already been retrieved by the
    734     # callback thread, and is stored internally. It's just waiting to
    735     # be returned.
--> 736     return self._return_or_raise()
    738 # For other backends, the main thread needs to run the retrieval step.
    739 try:
File /mnt/xarfuse/uid-564347/e4f2f620-seed-nspid4026531836_cgpid15010019-ns-4026531841/joblib/parallel.py:754, in BatchCompletionCallBack._return_or_raise(self)
    752 try:
    753     if self.status == TASK_ERROR:
--> 754         raise self._result
    755     return self._result
    756 finally:
TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {EXIT(1)}

有没有办法在已经拟合的主题模型上添加更多的文档?谢谢!

t1rydlwq

t1rydlwq1#

处理这些文档应该没有问题,因为10K通常不算多。这似乎与HDBSCAN有关,但我以前没有遇到过这个问题。您有多少可用的RAM?另外,从一个干净的环境开始,重新安装所有软件包可能会有所帮助。

vvppvyoh

vvppvyoh2#

感谢您的回复!我已经在干净的环境中重新安装,但仍然出现该错误。可用RAM:217231130624
我并没有使用HDBSCAN,只是运行以下代码:

topic_model = BERTopic(
        embedding_model="thenlper/gte-small", 
        min_topic_size=50,
        representation_model=KeyBERTInspired()
    )
    topics, _ = topic_model.fit_transform(docs)

好奇是否有办法继续向拟合的主题模型添加内容?

ss2ws0br

ss2ws0br3#

> 感谢您的回复!已安装干净,但仍然显示错误。可用RAM:217231130624

这个数字是多少GB?另外,从一个完全干净的环境开始,安装最新版本的BERTopic可能会有助于防止之前安装的包出现任何问题。

> 好奇是否有办法继续向拟合的主题模型添加内容?
有,您可以使用在线主题建模(online topic modeling)或 .merge_models 功能。

相关问题