PySpark: cannot submit a Spark application to a cluster on Docker Compose

wgeznvg7 · posted 2024-01-06 in Spark

I am trying to submit a small Spark application to a Spark cluster running on docker-compose, but it keeps failing. Below is my Spark cluster in docker-compose (there are also some Kafka brokers in the file; please ignore them):

    version: '3'
    services:
      spark-master:
        image: bitnami/spark:3.5
        container_name: spark-master
        hostname: spark-master
        environment:
          - SPARK_MODE=master
          - SPARK_MASTER_HOSTNAME=spark-master
          - SPARK_MASTER_PORT=7077
          - PYSPARK_PYTHON=/opt/bitnami/python/bin/python3
        ports:
          - "4040:4040"
          - "6066:6066"
          - "7077:7077"
          - "8080:8080"
        deploy:
          resources:
            limits:
              cpus: "1" # Adjust as needed
              memory: 1g # Adjust as needed
        # command: bin/spark-class org.apache.spark.deploy.master.Master
        networks:
          - spark-network
      spark-worker-1:
        image: bitnami/spark:3.5
        container_name: spark-worker-1
        environment:
          - SPARK_MODE=worker
          - SPARK_MASTER_URL=spark://spark-master:7077
          - SPARK_WORKER_MEMORY = 3g
          - SPARK_WORKER_CORES = 2
          - PYSPARK_PYTHON=/opt/bitnami/python/bin/python3
        deploy:
          resources:
            limits:
              cpus: "2" # Adjust as needed
              memory: 4G # Adjust as needed
        depends_on:
          - spark-master
        networks:
          - spark-network
      spark-worker-2:
        image: bitnami/spark:3.5
        container_name: spark-worker-2
        environment:
          - SPARK_MODE=worker
          - SPARK_MASTER_URL=spark://spark-master:7077
          - SPARK_WORKER_MEMORY = 3g
          - SPARK_WORKER_CORES = 2
          - PYSPARK_PYTHON=/opt/bitnami/python/bin/python3
        deploy:
          resources:
            limits:
              cpus: "2" # Adjust as needed
              memory: 4G # Adjust as needed
        depends_on:
          - spark-master
        networks:
          - spark-network
      zookeeper:
        image: bitnami/zookeeper:3.9
        container_name: zookeeper-server
        restart: always
        ports:
          - "2181:2181"
        environment:
          - ALLOW_ANONYMOUS_LOGIN=yes
      kafka1:
        image: bitnami/kafka:3.5
        container_name: broker-1
        hostname: broker-1
        ports:
          - "9093:9093"
        environment:
          - KAFKA_BROKER_ID=1
          - KAFKA_ZOOKEEPER_CONNECT=zookeeper:2181
          - KAFKA_CFG_LISTENERS=PLAINTEXT://:9093,INTERNAL://:9092
          - KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://localhost:9093,INTERNAL://:9092
          - KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=PLAINTEXT:PLAINTEXT,INTERNAL:PLAINTEXT
          - KAFKA_CFG_INTER_BROKER_LISTENER_NAME=INTERNAL
        depends_on:
          - zookeeper
      kafka2:
        image: bitnami/kafka:3.5
        container_name: broker-2
        hostname: broker-2
        ports:
          - "9094:9094"
        environment:
          - KAFKA_BROKER_ID=2
          - KAFKA_ZOOKEEPER_CONNECT=zookeeper:2181
          - KAFKA_CFG_LISTENERS=PLAINTEXT://:9094,INTERNAL://:9092
          - KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://localhost:9094,INTERNAL://:9092
          - KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=PLAINTEXT:PLAINTEXT,INTERNAL:PLAINTEXT
          - KAFKA_CFG_INTER_BROKER_LISTENER_NAME=INTERNAL
        depends_on:
          - zookeeper
      kafka3:
        image: bitnami/kafka:3.5
        container_name: broker-3
        hostname: broker-3
        ports:
          - "9095:9095"
        environment:
          - KAFKA_BROKER_ID=3
          - KAFKA_ZOOKEEPER_CONNECT=zookeeper:2181
          - KAFKA_CFG_LISTENERS=PLAINTEXT://:9095,INTERNAL://:9092
          - KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://localhost:9095,INTERNAL://:9092
          - KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=PLAINTEXT:PLAINTEXT,INTERNAL:PLAINTEXT
          - KAFKA_CFG_INTER_BROKER_LISTENER_NAME=INTERNAL
        depends_on:
          - zookeeper
    networks:
      spark-network:
        driver: bridge

And the simple Spark app:

    from pyspark.sql import SparkSession
    from pyspark.sql.functions import *

    # Create a SparkSession
    spark = SparkSession.builder \
        .appName("My App") \
        .master("spark://172.28.240.1:7077") \
        .getOrCreate()

    rdd = spark.sparkContext.parallelize(range(1, 100))
    print("THE SUM IS HERE: ", rdd.sum())

    # Stop the SparkSession
    spark.stop()


When I submit it, it raises this error:

    23/12/08 01:44:54 WARN TaskSetManager: Lost task 0.0 in stage 0.0 (TID 0) (172.20.0.2 executor 0): java.io.IOException: Cannot run program "C:\Program Files\Python310\python.exe": error=2, No such file or directory
        at java.base/java.lang.ProcessBuilder.start(ProcessBuilder.java:1143)
        at java.base/java.lang.ProcessBuilder.start(ProcessBuilder.java:1073)
        at org.apache.spark.api.python.PythonWorkerFactory.startDaemon(PythonWorkerFactory.scala:239)
        at org.apache.spark.api.python.PythonWorkerFactory.createThroughDaemon(PythonWorkerFactory.scala:139)
        at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:107)
        at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
        at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:174)
        at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:67)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
        at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
        at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
        at org.apache.spark.scheduler.Task.run(Task.scala:141)
        at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
        at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
        at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
        at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
        at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
        at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
        at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
        at java.base/java.lang.Thread.run(Thread.java:840)
    Caused by: java.io.IOException: error=2, No such file or directory
        at java.base/java.lang.ProcessImpl.forkAndExec(Native Method)
        at java.base/java.lang.ProcessImpl.<init>(ProcessImpl.java:314)
        at java.base/java.lang.ProcessImpl.start(ProcessImpl.java:244)
        at java.base/java.lang.ProcessBuilder.start(ProcessBuilder.java:1110)
        ... 20 more
    23/12/08 01:44:54 ERROR TaskSetManager: Task 0 in stage 0.0 failed 4 times; aborting job
    Traceback (most recent call last):
      File "D:\IT4931-bigdata-Movie-processing\test_submit.py", line 12, in <module>
        print("THE SUM IS HERE: ", rdd.sum())
      File "C:\Users\ABC\AppData\Roaming\Python\Python310\site-packages\pyspark\rdd.py", line 2291, in sum
        return self.mapPartitions(lambda x: [sum(x)]).fold( # type: ignore[return-value]
      File "C:\Users\ABC\AppData\Roaming\Python\Python310\site-packages\pyspark\rdd.py", line 2044, in fold
        vals = self.mapPartitions(func).collect()
      File "C:\Users\ABC\AppData\Roaming\Python\Python310\site-packages\pyspark\rdd.py", line 1833, in collect
        sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
      File "C:\Users\ABC\AppData\Roaming\Python\Python310\site-packages\py4j\java_gateway.py", line 1322, in __call__
        return_value = get_return_value(
      File "C:\Users\ABC\AppData\Roaming\Python\Python310\site-packages\pyspark\errors\exceptions\captured.py", line 179, in deco
        return f(*a, **kw)
      File "C:\Users\ABC\AppData\Roaming\Python\Python310\site-packages\py4j\protocol.py", line 326, in get_return_value
        raise Py4JJavaError(
    py4j.protocol.Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
    : org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 4 times, most recent failure: Lost task 0.3 in stage 0.0 (TID 5) (172.20.0.2 executor 0): java.io.IOException: Cannot run program "C:\Program Files\Python310\python.exe": error=2, No such file or directory
        at java.base/java.lang.ProcessBuilder.start(ProcessBuilder.java:1143)
        at java.base/java.lang.ProcessBuilder.start(ProcessBuilder.java:1073)
        at org.apache.spark.api.python.PythonWorkerFactory.startDaemon(PythonWorkerFactory.scala:239)
        at org.apache.spark.api.python.PythonWorkerFactory.createThroughDaemon(PythonWorkerFactory.scala:139)
        at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:107)
        at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
        at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:174)
        at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:67)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
        at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
        at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
        at org.apache.spark.scheduler.Task.run(Task.scala:141)
        at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
        at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
        at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
        at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
        at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
        at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
        at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
        at java.base/java.lang.Thread.run(Thread.java:840)
    Caused by: java.io.IOException: error=2, No such file or directory
        at java.base/java.lang.ProcessImpl.forkAndExec(Native Method)
        at java.base/java.lang.ProcessImpl.<init>(ProcessImpl.java:314)
        at java.base/java.lang.ProcessImpl.start(ProcessImpl.java:244)
        at java.base/java.lang.ProcessBuilder.start(ProcessBuilder.java:1110)
        ... 20 more
    Driver stacktrace:
        at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2844)
        at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2780)
        at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2779)
        at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
        at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
        at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
        at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2779)
        at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1242)
        at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1242)
        at scala.Option.foreach(Option.scala:407)
        at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1242)
        at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3048)
        at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2982)
        at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2971)
        at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
        at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:984)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:2398)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:2419)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:2438)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:2463)
        at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1046)
        at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
        at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
        at org.apache.spark.rdd.RDD.withScope(RDD.scala:407)
        at org.apache.spark.rdd.RDD.collect(RDD.scala:1045)
        at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:195)
        at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
        at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
        at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
        at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
        at java.lang.reflect.Method.invoke(Method.java:498)
        at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
        at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
        at py4j.Gateway.invoke(Gateway.java:282)
        at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
        at py4j.commands.CallCommand.execute(CallCommand.java:79)
        at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
        at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
        at java.lang.Thread.run(Thread.java:748)
    Caused by: java.io.IOException: Cannot run program "C:\Program Files\Python310\python.exe": error=2, No such file or directory
        at java.lang.ProcessBuilder.start(ProcessBuilder.java:1143)
        at java.lang.ProcessBuilder.start(ProcessBuilder.java:1073)
        at org.apache.spark.api.python.PythonWorkerFactory.startDaemon(PythonWorkerFactory.scala:239)
        at org.apache.spark.api.python.PythonWorkerFactory.createThroughDaemon(PythonWorkerFactory.scala:139)
        at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:107)
        at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
        at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:174)
        at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:67)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
        at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
        at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
        at org.apache.spark.scheduler.Task.run(Task.scala:141)
        at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
        at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
        at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
        at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
        at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
        at java.lang.Thread.run(Thread.java:840)
    Caused by: java.io.IOException: error=2, No such file or directory
        at java.lang.ProcessImpl.forkAndExec(Native Method)
        at java.lang.ProcessImpl.<init>(ProcessImpl.java:314)
        at java.lang.ProcessImpl.start(ProcessImpl.java:244)
        at java.lang.ProcessBuilder.start(ProcessBuilder.java:1110)
        ... 20 more


I am just a beginner with Spark. I have been trying to fix this for a whole day and still get the error. Can anyone help me, please? Thanks.

Answer from dgtucam1:


It looks like the problem is that Spark cannot find the Python executable (C:\Program Files\Python310\python.exe). That path is specific to your Windows environment, but your Spark cluster runs in Dockerized Linux containers.
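You can see where that Windows path comes from by checking the driver before building the session: when no override is configured, PySpark typically ships the driver's own interpreter path to the executors, which matches the C:\Program Files\Python310\python.exe in your stack trace. A quick check:

    import os
    import sys

    # On the Windows driver: the interpreter path that ends up being used for
    # the Python workers when no override is configured.
    print("Driver interpreter:", sys.executable)
    print("PYSPARK_PYTHON:", os.environ.get("PYSPARK_PYTHON", "<not set>"))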
1. Update the Python path in your Spark application and use 127.0.0.1 instead of spark-master (a driver-side variant is sketched right after this snippet):

    spark = SparkSession.builder \
        .appName("My App") \
        .master("spark://127.0.0.1:7077") \
        .config("spark.executorEnv.PYSPARK_PYTHON", "/opt/bitnami/python/bin/python3") \
        .getOrCreate()
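If the executors still try to launch the Windows python.exe after this, it can also help to pin the worker Python from the driver side, since the driver decides which interpreter path is sent to the workers. A minimal sketch, assuming the Bitnami image really ships Python at /opt/bitnami/python/bin/python3 (the same path used in the compose file above):

    import os
    from pyspark.sql import SparkSession

    # Interpreter used by the executors inside the containers (assumed path,
    # taken from the compose file above). Leave PYSPARK_DRIVER_PYTHON unset so
    # the Windows driver keeps using its local python.exe.
    os.environ["PYSPARK_PYTHON"] = "/opt/bitnami/python/bin/python3"

    spark = (
        SparkSession.builder
        .appName("My App")
        .master("spark://127.0.0.1:7077")
        .config("spark.pyspark.python", "/opt/bitnami/python/bin/python3")
        .getOrCreate()
    )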

2. Submit the Spark application with spark-submit:

    spark-submit --master spark://127.0.0.1:7077 spark_app.py


3. Monitor the Spark application with the Web UI available at http://localhost:8080 (or query it from code, as sketched below).
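If you prefer to check worker registration from code rather than the browser, the standalone master's web UI also serves its status as JSON. A sketch, assuming the default /json endpoint on the port published in the compose file (field names can vary between Spark versions):

    import json
    from urllib.request import urlopen

    # Query the standalone master's status page as JSON (port 8080 is published
    # to the host in the compose file above).
    with urlopen("http://localhost:8080/json") as resp:
        status = json.load(resp)

    print("Master status:", status.get("status"))
    for worker in status.get("workers", []):
        print(worker.get("id"), worker.get("state"), worker.get("cores"), "cores")

If no workers show up here, the job cannot get executors at all; if workers are listed but the job still fails, the Python-path fix above is the more likely culprit.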

