import glob
import os

from pyspark import SparkContext, SparkFiles

if __name__ == '__main__':
    # A SparkContext must exist before SparkFiles can resolve the
    # per-application staging directory on this node.
    sc = SparkContext()
    # SparkFiles.getRootDirectory() is where files shipped with the job
    # (e.g. via --py-files) are materialized locally.
    sparkfiles_dir = SparkFiles.getRootDirectory()
    # Full local paths of every wheel that was submitted with the job.
    wheel_files_with_path = glob.glob(os.path.join(sparkfiles_dir, '*.whl'))
    # Reduce to bare file names for readable output.
    wheel_files = [os.path.basename(file) for file in wheel_files_with_path]
    print("wheel_files:", wheel_files)
import glob
import os
from zipfile import ZipFile

from pyspark import SparkContext, SparkFiles

if __name__ == '__main__':
    # A SparkContext must exist before SparkFiles can resolve the
    # per-application staging directory on this node.
    sc = SparkContext()
    # SparkFiles.getRootDirectory() is where files shipped with the job
    # (e.g. via --py-files) are materialized locally.
    sparkfiles_dir = SparkFiles.getRootDirectory()
    wheel_files_with_path = glob.glob(os.path.join(sparkfiles_dir, '*.whl'))
    for wheel_file in wheel_files_with_path:
        print(f"Listing contents of {os.path.basename(wheel_file)}:")
        # A .whl file is a standard ZIP archive, so its members can be
        # listed with zipfile without installing the package.
        with ZipFile(wheel_file, 'r') as zip_ref:
            for filename in zip_ref.namelist():
                # Bug fix: the original printed the literal "(unknown)"
                # instead of the member name from the archive.
                print(f" - {filename}")
1条答案
按热度按时间rxztt3cl1#
您可以通过访问SparkFiles根目录列出通过--py-files提交的.whl文件。这里有一个最小的例子来实现这一点。假设这是你的spark-submit,
（在这个例子中，我已经添加了
dir2path-0.1.0-py3-none-any.whl
和 whl-0.0.4-py2.py3-none-any.whl
。）通过 spark-submit 运行的脚本
list_wheels.py
将打印通过--py-files提交的.whl文件的列表,使您能够确认哪些包已经上传。最终结果:
大量冗长的日志记录,最后:
注意:如果你也想查看wheel文件中的内容,你可以将脚本改为: