我正在使用spark 3.1并尝试读取JSON文件
我已将以下文件的架构定义为:
StructType([
StructField('search_metadata', MapType(StringType(),StringType())),
StructField('search_parameters', MapType(StringType(),StringType())),
StructField('search_information', MapType(StringType(),StringType())),
StructField('local_results',StructType([
StructField('position', StringType(), True),
StructField('title', StringType(), True),
StructField('place_id', StringType(), True),
StructField('data_id', StringType(), True),
StructField('data_cid', StringType(), True),
StructField('reviews_link', StringType(), True),
StructField('photos_link', StringType(), True),
StructField('gps_coordinates', MapType(StringType(),StringType()), True),
StructField('place_id_search', StringType(), True),
StructField('unclaimed_listing', BinaryType(), True),
StructField('type', StringType(), True),
StructField('address', StringType(), True),
StructField('open_state', StringType(), True),
StructField('hours', StringType(), True),
StructField('phone', MapType(StringType(),StringType()), True),
StructField('thumbnail', StringType(), True),
]), True),
StructField('serpapi_pagination',MapType(StringType(),StringType())),
StructField('search_query', StringType(), True),
])
我的JSON文件是:
[{
"search_metadata": {
"id": "63560cab66440a949ade5d72",
"status": "Success",
"json_endpoint": "https://serpapi.com/searches/b6986ff9ff715b13/63560cab66440a949ade5d72.json",
"created_at": "2022-10-24 03:55:23 UTC",
"processed_at": "2022-10-24 03:55:23 UTC",
"google_maps_url": "https://www.google.com/maps/search/WH?hl=en",
"raw_html_file": "https://serpapi.com/searches/b6986ff9ff715b13/63560cab66440a949ade5d72.html",
"total_time_taken": 1.91
},
"search_parameters": {
"engine": "google_maps",
"type": "search",
"q": "WH",
"google_domain": "google.com",
"hl": "en"
},
"search_information": {
"local_results_state": "Results for exact spelling",
"query_displayed": "WH"
},
"local_results": [{
"position": 1,
"title": "WH International Casting, LLC",
"place_id": "ChIJh0wvXcu_a4gRWuH-O1ltlPg",
"data_id": "0x886bbfcb5d2f4c87:0xf8946d593bfee15a",
"data_cid": "17912061847985381722",
"reviews_link": "https://serpapi.com/search.json?data_id=0x886bbfcb5d2f4c87%3A0xf8946d593bfee15a&engine=google_maps_reviews&hl=en",
"photos_link": "https://serpapi.com/search.json?data_id=0x886bbfcb5d2f4c87%3A0xf8946d593bfee15a&engine=google_maps_photos&hl=en",
"gps_coordinates": {
"latitude": 38.295865,
"longitude": -85.73001099999999
},
"place_id_search": "https://serpapi.com/search.json?data=%214m5%213m4%211s0x886bbfcb5d2f4c87%3A0xf8946d593bfee15a%218m2%213d38.295865%214d-85.73001099999999&engine=google_maps&google_domain=google.com&hl=en&type=place",
"unclaimed_listing": true,
"type": "Warehouse",
"address": "260 America Pl Dr, Jeffersonville, IN 47130",
"open_state": "Closed ⋅ Opens 8AM Mon",
"hours": "Closed ⋅ Opens 8AM Mon",
"operating_hours": {
"sunday": "Closed",
"monday": "8AM–4:30PM",
"tuesday": "8AM–4:30PM",
"wednesday": "8AM–4:30PM",
"thursday": "8AM–4:30PM",
"friday": "8AM–4:30PM",
"saturday": "Closed"
},
"phone": "(812) 725-8029",
"thumbnail": "https://lh5.googleusercontent.com/p/AF1QipPWDyyzxp1MG27vv3WVZbzy5WVI-Qh2u2jEDb-C=w122-h92-k-no"
},
{
"position": 2,
"title": "W.H. Smith Manor",
"place_id": "ChIJ9584e22DXIgR5w2f2saKBOU",
"data_id": "0x885c836d7b389ff7:0xe5048ac6da9f0de7",
"data_cid": "16502467521268354535",
"reviews_link": "https://serpapi.com/search.json?data_id=0x885c836d7b389ff7%3A0xe5048ac6da9f0de7&engine=google_maps_reviews&hl=en",
"photos_link": "https://serpapi.com/search.json?data_id=0x885c836d7b389ff7%3A0xe5048ac6da9f0de7&engine=google_maps_photos&hl=en",
"gps_coordinates": {
"latitude": 36.581589799999996,
"longitude": -83.6581731
},
"place_id_search": "https://serpapi.com/search.json?data=%214m5%213m4%211s0x885c836d7b389ff7%3A0xe5048ac6da9f0de7%218m2%213d36.581589799999996%214d-83.6581731&engine=google_maps&google_domain=google.com&hl=en&type=place",
"unclaimed_listing": true,
"type": "University department",
"address": "184 Robertson Ave, Harrogate, TN 37752",
"open_state": "Closed ⋅ Opens 8AM Mon",
"hours": "Closed ⋅ Opens 8AM Mon",
"operating_hours": {
"sunday": "Closed",
"monday": "8AM–4:30PM",
"tuesday": "8AM–4:30PM",
"wednesday": "8AM–4:30PM",
"thursday": "8AM–4:30PM",
"friday": "8AM–4:30PM",
"saturday": "Closed"
},
"phone": "(423) 869-3611",
"website": "http://lmunet.edu/",
"thumbnail": "https://streetviewpixels-pa.googleapis.com/v1/thumbnail?panoid=mJwpOER-2yIbmD3xSwQ2pQ&cb_client=search.gws-prod.gps&w=80&h=92&yaw=307.97266&pitch=0&thumbfov=100"
}
],
"serpapi_pagination": {
"next": "https://serpapi.com/search.json?engine=google_maps&google_domain=google.com&hl=en&q=WH&start=20&type=search"
},
"search_query": "WH.json"
}]
我尝试对某些行执行select
操作,但得到的值为空,我认为模式定义存在问题。
df = df.select(col('local_results'),
col('local_results.position').alias('position'),
col('local_results.title').alias('title'))
df.show()
有人能帮助我更正架构并显示结果吗?
1条答案
按热度按时间huwehgph1#
由于模式定义不正确,因此会得到空值。
要找出正确的模式,可以尝试阅读不带模式的文件,如下所示:
然后,您将能够在可以修改的位置修改模式。
使用上面的模式,
select
将工作:方案
使用
spark.read.option('multiline', 'true').json('file.json')
创建以下模式:您可以使用
df.schema
将其解压缩。它与您的方案不同,因为您的方案中也有一些 map 类型的列。如果您需要 map 列,可以这样做。
以下模式也可以使用:
结果:
但由于所有贴图值都被强制为 * 字符串 *,内部对象不能成为贴图。例如,完全展平后的
gps_coordinates
结果将如下所示: