I have been trying to run this code, and when it reaches dist_reduced = dist_mapped.reduce(reducer) it throws a Py4JJavaError. I tried it on 3 different machines and got the same error every time. Java version "1.8.0_202", Python 3.12.1, PySpark 3.5.0.
Code
import pandas as pd
from collections import Counter
from util import *
from pyspark import SparkConf, SparkContext
cols = ['sentiment', 'id', 'date', 'query_string', 'user', 'text']
df = pd.read_csv('file.csv', encoding='latin1', names=cols)
data = df['text'].tolist()
conf = SparkConf().setAppName('MapReduce').setMaster('local[4]')
sc = SparkContext(conf=conf)
dist = sc.parallelize(data)
def mapper(text):
    # clean_word and word_not_in_stopwords come from util
    words = text.split()
    cleaned = map(clean_word, words)
    filtered = filter(word_not_in_stopwords, cleaned)
    return Counter(filtered)
def reducer(cnt1, cnt2):
    cnt1.update(cnt2)
    return cnt1
dist_mapped = dist.map(mapper)
dist_reduced = dist_mapped.reduce(reducer)
dist_reduced.most_common(5)
Traceback
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
Cell In[10], line 2
1 dist_mapped = dist.map(mapper)
----> 2 dist_reduced = dist_mapped.reduce(reducer)
File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\pyspark\rdd.py:1924, in RDD.reduce(self, f)
1921 return
1922 yield reduce(f, iterator, initial)
-> 1924 vals = self.mapPartitions(func).collect()
1925 if vals:
1926 return reduce(f, vals)
File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\pyspark\rdd.py:1833, in RDD.collect(self)
1831 with SCCallSiteSync(self.context):
1832 assert self.ctx._jvm is not None
-> 1833 sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
1834 return list(_load_from_socket(sock_info, self._jrdd_deserializer))
File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\py4j\java_gateway.py:1322, in JavaMember.__call__(self, *args)
1316 command = proto.CALL_COMMAND_NAME +\
1317 self.command_header +\
1318 args_command +\
1319 proto.END_COMMAND_PART
1321 answer = self.gateway_client.send_command(command)
-> 1322 return_value = get_return_value(
1323 answer, self.gateway_client, self.target_id, self.name)
1325 for temp_arg in temp_args:
1326 if hasattr(temp_arg, "_detach"):
File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\py4j\protocol.py:326, in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
331 "An error occurred while calling {0}{1}{2}. Trace:\n{3}\n".
332 format(target_id, ".", name, value))
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 0.0 failed 1 times, most recent failure: Lost task 2.0 in stage 0.0 (TID 2) (Corsair executor driver): java.io.IOException: Cannot run program "python3": CreateProcess error=2, The system cannot find the file specified
at java.lang.ProcessBuilder.start(ProcessBuilder.java:1048)
...
System variables
HADOOP_HOME = C:\hadoop
SPARK_HOME = %USERPROFILE%\AppData\Local\Programs\Python\Python312\Lib\site-packages\pyspark
Path = %HADOOP_HOME%\bin
Path = %SPARK_HOME%\bin
User variables
JDK_HOME = %JAVA_HOME%
JRE_HOME = %JAVA_HOME%\jre
Path = %JAVA_HOME%\lib
Path = %JAVA_HOME%\jre\lib
1 Answer
From the traceback it is clear that Spark cannot find python3.
Check whether Python 3 is installed on the machine running the executor (the host Corsair in your trace). Confirm with
python3 --version
or which python3 (on Windows, where python3).
Or, make sure the directory containing the python3 executable is included in the
PATH
environment variable.
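Note that on Windows the interpreter is usually installed as python.exe, not python3, which is why Spark's attempt to spawn "python3" fails even when Python is on PATH. One common workaround is to point PySpark's workers at the same interpreter that runs the driver via the PYSPARK_PYTHON environment variable. Below is a minimal sketch (not from the original answer) assuming the script is launched with the Python 3.12 interpreter shown in the trace:

import os
import sys
from pyspark import SparkConf, SparkContext

# Tell Spark workers to use the driver's own interpreter instead of
# searching PATH for a "python3" executable. This must be set before
# the SparkContext is created.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

conf = SparkConf().setAppName('MapReduce').setMaster('local[4]')
sc = SparkContext(conf=conf)

# Sanity check: the reduce that previously failed should now be able
# to spawn Python worker processes.
print(sc.parallelize(range(10)).reduce(lambda a, b: a + b))  # prints 45

Setting these variables in the script keeps the fix self-contained; setting them as system environment variables (alongside HADOOP_HOME and SPARK_HOME above) would achieve the same thing machine-wide.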