python 抓取沃尔玛时按cookie阻止

kknvjkwl  于 2023-01-08  发布在  Python
关注(0)|答案(2)|浏览(605)

我有一个代码,将刮沃尔玛。它将输入约100个邮政编码为每个产品(300个总数),并刮相应的价格。我测试了3个产品的网址和2个邮政编码,它应该给予我的六个数据。它工作得很好,当我运行的代码无代理服务。输出是这样的:

43819800,041167412213,10003,3520,19.96,2022-07-05 14:06:47
43819800,041167412213,48104,5472,19.96,2022-07-05 14:06:47
224749468,300450206909,10003,3520,42.47,2022-07-05 14:06:49
224749468,300450206909,48104,5472,42.47,2022-07-05 14:06:50
14053317,681131187091,10003,3520,2.52,2022-07-05 14:06:51
14053317,681131187091,48104,5472,2.52,2022-07-05 14:06:52

我最终知道如果我想运行它的大量产品,我需要代理服务,所以我买了一个。但当我运行的代码与代理服务,大多数数据将丢失。
我想我不能得到所有数据的原因是因为cookie的问题。我想cookie =将过期,我不知道如何生成一个新的和正确的。有人知道如何处理cookie的问题时,刮沃尔玛?任何帮助感谢,谢谢!
下面是我的代码

def main(url_list, zip_code_list, ip_list, _now, save_dict, num, csv_list, utc_tz):
    _dict = {}
    debug = False
    s = requests.Session()

    output_json_file = f'backup/{num}_' + _now.strftime("%Y%m%d_%H%M.json")
    output_csv_file = f'backup/{num}_' + _now.strftime("%Y%m%d_%H%M.csv")

    flag = True

    if debug:
        url_list = [
            'https://www.target.com/p/claritin-24-hour-non-drowsy-allergy-relief-tablets-loratadine/-/A-80354268?preselect=14351285#lnk=sametab',
            'https://www.target.com/p/genexa-dextromethorphan-kids-39-cough-and-chest-congestion-suppressant-4-fl-oz/-/A-80130848#lnk=sametab'
            # 'https://www.target.com/p/claritin-24-hour-non-drowsy-allergy-relief-tablets-loratadine/-/A-80354268?preselect=14351285#lnk=sametab'
        ]
        zip_code_list = [
            10005,
        ]
    i = 0
    for _url in url_list:
        for zip_code in zip_code_list:
            # proxy service
            proxies = {"http": None, "https": None}
            # proxies = {
            ...,
            ...}

            i += 1
            _dict[i] = {}
            start_time = time.perf_counter()

            try:
                item = _url.split("/")[-1]  # A-80354268?preselect=14351285#lnk=sametab or A-80130848#lnk=sametab
                url_type = 1
                page_num = item

                if '?' in item:
                    url_type = 3
                    item2 = item.split("?")
                    page_num = item2[0]
                else:
                    pass
            except Exception as e:
                end_time = time.perf_counter()
                continue

            zip_code_url = "https://www.walmart.com/orchestra/home/graphql"

            payload = json.dumps({
                "query": "......",
                "variables": {
                    "input": {
                        "postalCode": str(zip_code),
                        "accessTypes": [
                            "PICKUP_INSTORE",
                            "PICKUP_CURBSIDE",
                            "PICKUP_SPOKE",
                            "PICKUP_POPUP"
                        ],
                        "nodeTypes": [
                            "STORE",
                            "PICKUP_SPOKE",
                            "PICKUP_POPUP"
                        ],
                        "latitude": None,
                        "longitude": None,
                        "radius": None
                    },
                    "checkItemAvailability": False,
                    "checkWeeklyReservation": False,
                    "enableStoreSelectorMarketplacePickup": False
                }
            })
            headers = {
                'authority': 'www.walmart.com',
                'pragma': 'no-cache',
                'cache-control': 'no-cache',
                'x-o-segment': 'oaoh',
                'x-o-correlation-id': 'Tt33HoVZ_Pqtlie1ABII1nfekFaSEtbRQPSc',
                'device_profile_ref_id': '-f6R8qf8Vd3gwky1UOzoEwW_XoTeRKqppMfK',
                'x-latency-trace': '1',
                'wm_mp': 'true',
                'wm_page_url': 'https://www.walmart.com/ip/Allegra-Adult-24HR-Gelcaps-24-Ct-180-mg-Allergy-Relief/43819800',
                'x-o-platform-version': 'main-1.2.0-3a465c',
                'x-o-gql-query': 'query nearByNodes',
                'x-o-bu': 'WALMART-US',
                'x-apollo-operation-name': 'nearByNodes',
                'traceparent': 'Tt33HoVZ_Pqtlie1ABII1nfekFaSEtbRQPSc',
                'x-o-mart': 'B2C',
                'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36',
                'x-o-platform': 'rweb',
                'content-type': 'application/json',
                'accept': 'application/json',
                'x-enable-server-timing': '1',
                'x-o-ccm': 'server',
                'wm_qos.correlation_id': 'Tt33HoVZ_Pqtlie1ABII1nfekFaSEtbRQPSc',
                'origin': 'https://www.walmart.com',
                'sec-fetch-site': 'same-origin',
                'sec-fetch-mode': 'cors',
                'sec-fetch-dest': 'empty',
                'referer': 'https://www.walmart.com/ip/Allegra-Adult-24HR-Gelcaps-24-Ct-180-mg-Allergy-Relief/43819800',
                'accept-language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
                'cookie': '_pxvid=10811607-f238-11ec-a720-4e756c594d76; ACID=2263b9c6-4e5a-44ce-a9da-05e028a9b8c7; hasACID=true; vtc=ShJZRQkr5ADHxW2cc6mpW0; TBV=7; adblocked=false; locGuestData=eyJpbnRlbnQiOiJTSElQUElORyIsImlzRXhwbGljaXQiOmZhbHNlLCJzdG9yZUludGVudCI6IlBJQ0tVUCIsIm1lcmdlRmxhZyI6ZmFsc2UsImlzRGVmYXVsdGVkIjpmYWxzZSwicGlja3VwIjp7Im5vZGVJZCI6IjMwODEiLCJ0aW1lc3RhbXAiOjE2NTU5MDgzNjMwNTF9LCJwb3N0YWxDb2RlIjp7InRpbWVzdGFtcCI6MTY1NTkwODM2MzA1MSwiYmFzZSI6Ijk1ODI5In0sInZhbGlkYXRlS2V5IjoicHJvZDp2MjoyMjYzYjljNi00ZTVhLTQ0Y2UtYTlkYS0wNWUwMjhhOWI4YzcifQ%3D%3D; tb_sw_supported=false; auth=MTAyOTYyMDE4fNO1docV1h53scwTVZ09zkq8CMnyd0xQJ2B%2BWcsOyT6LROWRUsmVs%2Bp%2BqdGXlTGcs43hwU%2BCNtvvfjRjs7lCjIvrK3NOp%2FSayADm%2FauT3or57dQr8nIRizNO0Go9X0h5767wuZloTfhm7Wk2KcjygsAEeU%2BeKCMhfP9XV060SY%2FgcNaenrudSCZlEFJXNSGF6XDQTWwNYV8JcXQmjWW2gCuRvXtkjUhxyHY9czMQpG4UMk70P8glgOEpLOprhDfMM%2FFHGZ2dCNmxWrdkwqEKrhrUOgaJ2pqn5A3SHetSvGPUoDKun8p%2FM%2BS69xpJ5GAkRAvQ9UuySW7l7kgiPOgyF60jwwfDPWGSAbjbNQ7pWssaQQ2kgt4PeOLZzshCiRmgr%2B51HN4wOouRjTjluv08HZE5WBBdZBCyKnCQAR7o6eg%3D; assortmentStoreId=3081; hasLocData=1; TB_Latency_Tracker_100=1; TB_Navigation_Preload_01=1; TB_SFOU-100=; bstc=bdAHA-WWcfXk0PMnP-uT0o; mobileweb=0; xpa=3Fi1g|3_gkh|3pRU7|4NCWH|55b29|5_9FA|DAwQd|Ecx7k|EjkLl|FYe-R|Hv6FZ|LTD5Y|LguYm|NbUbl|NoJl6|O1c3v|OuwKl|Pgtnl|Q-bGe|TMjj7|V0SkO|VAuQw|_hSAz|cL8HI|ccWng|cfVAR|duBe9|eEnay|eWARP|hGNr-|hPI48|hqy5q|jUi64|kFqfr|kLRY3|lQHtM|rdfjX|wGrec|zCylr; xpm=1%2B1656425217%2BShJZRQkr5ADHxW2cc6mpW0~%2B0; exp-ck=3_gkh24NCWH155b2925_9FA1DAwQd3Ecx7k1EjkLl1FYe-R1NbUbl1O1c3v1OuwKl1Pgtnl1V0SkO1_hSAz1cL8HI1ccWng2eEnay2eWARP1hGNr-1jUi641kFqfr1kLRY31lQHtM1; helpgql=1; _pxhd=f5c62b38667d2415146fc6f1cf93cdcc3327afc72fb672b6c09d362415e56f38:10811607-f238-11ec-a720-4e756c594d76; ak_bmsc=1BE44332F7F4F1A06C35DB50CE7A244A~000000000000000000000000000000~YAAQLcMTArBBZZeBAQAArhKjqhBsrms6EgHZfDiLKYf+xjOEF+/vrnzQPmoENr4GabQK+uGCmEd636jjgHyE5IXaf6eZM80f5m5gykIdVmdAcFD78W+uLxPV7zlVbetRHR4yQ3osOU9yTbAIxm5And82l8zx1c+OLZZDAn6cC8CKtnboNmXITB0mT+0BxrSMWr6FUUHNgN0BjPwCEW0NqdGPe6o57ttnUoJELTEeXnUdiAZB784srPtFJgZmxk8F6jADpYMaSrqGlGv+Uh2jVAHJRIYeQU3kcxbocc7/nJrGHHFJn0lbXOoltW7Qz8AFPIgHhHWP9/2COrQo/EWwZm/88zqCi9/l7kD7dtgabe1ICq/CgDz4rl41QKOe+lZYQVf9uqF0CrTfea/anOjVdEHEXZvaGq+803mSqffDitzIXTNDym8PnPeMFa4WJTqCF/apIJiT2rvLKPRgn2CsOlxCTkHlAuCUJG8JYWYGloDso0ULfjd/xEhxhe6CCy6HYwJ3E8fkVF8i5dG04U+o/URP05RiYZc8F+F1dMl91Q==; bm_sv=84EAF547030A90DB5BCD4679165B254D~YAAQLcMTAvZBZZeBAQAAthWjqhAc7AUJuL6t1BdztQNVwhWMVE5DVAXWjtGVZh8Xm0VgJ47u+ho0b3dkGGOwoGGOwbooZSpjcjxOOiEXIzTZgWRkQQeyTz5OB8ixRgE2Fqg1vQiCeOZkQWRmu+zj+P9ZwuNPusS4/dIStSfzYA4sDRQ6KPwQMqsMLFvPhDxjiR/ByNKugRT//CgOWQgb12FlDL5o/kJUQaRsUA00Dc9zKt/XUexTgCG91S5atV9iXQ==~1; locDataV3=eyJpc0RlZmF1bHRlZCI6ZmFsc2UsImlzRXhwbGljaXQiOmZhbHNlLCJpbnRlbnQiOiJTSElQUElORyIsInBpY2t1cCI6W3siYnVJZCI6IjAiLCJub2RlSWQiOiIzMDgxIiwiZGlzcGxheU5hbWUiOiJTYWNyYW1lbnRvIFN1cGVyY2VudGVyIiwibm9kZVR5cGUiOiJTVE9SRSIsImFkZHJlc3MiOnsicG9zdGFsQ29kZSI6Ijk1ODI5IiwiYWRkcmVzc0xpbmUxIjoiODkxNSBHZXJiZXIgUm9hZCIsImNpdHkiOiJTYWNyYW1lbnRvIiwic3RhdGUiOiJDQSIsImNvdW50cnkiOiJVUyIsInBvc3RhbENvZGU5IjoiOTU4MjktMDAwMCJ9LCJnZW9Qb2ludCI6eyJsYXRpdHVkZSI6MzguNDgyNjc3LCJsb25naXR1ZGUiOi0xMjEuMzY5MDI2fSwiaXNHbGFzc0VuYWJsZWQiOnRydWUsInNjaGVkdWxlZEVuYWJsZWQiOnRydWUsInVuU2NoZWR1bGVkRW5hYmxlZCI6dHJ1ZSwiaHViTm9kZUlkIjoiMzA4MSIsInN0b3JlSHJzIjoiMDY6MDAtMjM6MDAiLCJzdXBwb3J0ZWRBY2Nlc3NUeXBlcyI6WyJQSUNLVVBfQ1VSQlNJREUiLCJQSUNLVVBfSU5TVE9SRSJdfV0sInNoaXBwaW5nQWRkcmVzcyI6eyJsYXRpdHVkZSI6MzguNDc0NCwibG9uZ2l0dWRlIjotMTIxLjM0MzcsInBvc3RhbENvZGUiOiI5NTgyOSIsImNpdHkiOiJTYWNyYW1lbnRvIiwic3RhdGUiOiJDQSIsImNvdW50cnlDb2RlIjoiVVNBIiwiZ2lmdEFkZHJlc3MiOmZhbHNlfSwiYXNzb3J0bWVudCI6eyJub2RlSWQiOiIzMDgxIiwiZGlzcGxheU5hbWUiOiJTYWNyYW1lbnRvIFN1cGVyY2VudGVyIiwiYWNjZXNzUG9pbnRzIjpudWxsLCJzdXBwb3J0ZWRBY2Nlc3NUeXBlcyI6W10sImludGVudCI6IlBJQ0tVUCIsInNjaGVkdWxlRW5hYmxlZCI6ZmFsc2V9LCJkZWxpdmVyeSI6eyJidUlkIjoiMCIsIm5vZGVJZCI6IjMwODEiLCJkaXNwbGF5TmFtZSI6IlNhY3JhbWVudG8gU3VwZXJjZW50ZXIiLCJub2RlVHlwZSI6IlNUT1JFIiwiYWRkcmVzcyI6eyJwb3N0YWxDb2RlIjoiOTU4MjkiLCJhZGRyZXNzTGluZTEiOiI4OTE1IEdlcmJlciBSb2FkIiwiY2l0eSI6IlNhY3JhbWVudG8iLCJzdGF0ZSI6IkNBIiwiY291bnRyeSI6IlVTIiwicG9zdGFsQ29kZTkiOiI5NTgyOS0wMDAwIn0sImdlb1BvaW50Ijp7ImxhdGl0dWRlIjozOC40ODI2NzcsImxvbmdpdHVkZSI6LTEyMS4zNjkwMjZ9LCJpc0dsYXNzRW5hYmxlZCI6dHJ1ZSwic2NoZWR1bGVkRW5hYmxlZCI6dHJ1ZSwidW5TY2hlZHVsZWRFbmFibGVkIjp0cnVlLCJhY2Nlc3NQb2ludHMiOlt7ImFjY2Vzc1R5cGUiOiJERUxJVkVSWV9BRERSRVNTIn1dLCJodWJOb2RlSWQiOiIzMDgxIiwiaXNFeHByZXNzRGVsaXZlcnlPbmx5IjpmYWxzZSwic3VwcG9ydGVkQWNjZXNzVHlwZXMiOlsiREVMSVZFUllfQUREUkVTUyJdfSwiaW5zdG9yZSI6ZmFsc2UsInJlZnJlc2hBdCI6MTY1NjQ0NjgyMjk5NCwidmFsaWRhdGVLZXkiOiJwcm9kOnYyOjIyNjNiOWM2LTRlNWEtNDRjZS1hOWRhLTA1ZTAyOGE5YjhjNyJ9; _px3=fd4a806b205916413bf99a01e942ff9336232e851610f563e729bd8270721edd:bOVkbtqNZf6CXjv41nbl5RGLFNxuANcsgSOgoqttHpCpEzg2Mto0wjrxDFfh6zUSiA5wDDm5rTHKfX2lAiPg0Q==:1000:zDPkfwX/OSrZ75Ggjs1Krpm4L6f17sXBDANaE4TV+9j6Y6dnGPRddnxuV+8zV6iiq/iJexlqrtw3brpn59WivGDsHGwucjfO5cRyfNrUryok4xbUwr1yK/iAyP1t4vdvf8bS4jGOBM9xp8zMe44W7tOveajOsuF64IAfP4GPGNBGPUZkIE3I+bocrKQJA7sdD12/BOw6goT1VCddGHvnEQ==; QuantumMetricSessionID=18cfa4319b8ef1e79540e53bae1b9f4a; QuantumMetricUserID=86906ef3aba513b1593763543a679f7b; xptwg=3560776434:CF9364D4F670D8:21A39D6:281D1B4C:2C081154:A12BB2D4:; TS01b0be75=01538efd7cb337960d25ee6309ddbe2d9b73d42de4db21f151df666ad7ab7cde6a6e48b30e01e0ae9a71bdd492f4d3cefe3367d67f; TS013ed49a=01538efd7cb337960d25ee6309ddbe2d9b73d42de4db21f151df666ad7ab7cde6a6e48b30e01e0ae9a71bdd492f4d3cefe3367d67f; akavpau_p2=1656426003~id=7229bf055a987cf4ec0baf62877b3b53; _astc=f75dda122e22d06cf00905ef84d586f8; pxcts=f6d5b40e-f6eb-11ec-b18e-5746686f6a76; _pxff_cfp=1'            }

            try:
                response = requests.request("POST", zip_code_url, headers=headers, data=payload, proxies=proxies,timeout=10)
                content = response.json()
            except Exception as e:
                end_time = time.perf_counter()
                continue

            try:
                store_id = content['data']['nearByNodes']['nodes'][0]['id']
            except Exception as e:
                end_time = time.perf_counter()
                continue

            url2 = "https://www.walmart.com/orchestra/home/graphql/ip/"+page_num
            payload2 = json.dumps({
                "query": "...",
                "variables": {
                    "channel": "WWW",
                    "pageType": "ItemPageGlobal",
                    "tenant": "WM_GLASS",
                    "version": "v1",
                    "itemId": str(page_num),
                    "layout": [
                        "itemDesktop"
                    ],
                    "tempo": {
                        "targeting": "%7B%22userState%22%3A%22loggedIn%22%7D",
                        "params": [
                            {
                                "key": "expoVars",
                                "value": "expoVariationValue"
                            },
                            {
                                "key": "expoVars",
                                "value": "expoVariationValue2"
                            }
                        ]
                    },
                    "p13N": {
                        "reqId": "zAKgORT4feGddLyly5nLXRCH16egOJ0JwL4x",
                        "pageId": str(page_num),
                        "modules": [
                            {
                                "moduleType": "PersonalizedLabels",
                                "moduleId": "234-sdfsfvns-sdfdskvl"
                            }
                        ],
                        "userClientInfo": {
                            "ipAddress": "IP=0:0:0:0:0:0:0:1-0:0:0:0:0:0:0:1",
                            "isZipLocated": True,
                            "callType": "CLIENT",
                            "deviceType": "desktop"
                        },
                        "userReqInfo": {
                            "refererContext": {
                                "source": "itempage"
                            },
                            "pageUrl": "/ip/Equate-Maximum-Strength-Severe-Allergy-Plus-Sinus-Headache-Caplets-20-Count/14053317"
                        }
                    },
                    "p13nCls": {
                        "pageId": str(page_num),
                        "userClientInfo": {
                            "ipAddress": "IP=0:0:0:0:0:0:0:1-0:0:0:0:0:0:0:1",
                            "isZipLocated": True,
                            "deviceType": "desktop",
                            "callType": "CLIENT"
                        },
                        "userReqInfo": {
                            "refererContext": {
                                "source": "itempage"
                            }
                        },
                        "p13NCallType": "ATF"
                    },
                    "fetchBuyBoxAd": True,
                    "fetchSkyline": True,
                    "fetchIdml": True,
                    "fetchReviews": True,
                    "fetchFitment": True,
                    "fetchSEO": True,
                    "fetchP13N": True,
                    "fetchAffirm": True,
                    "fetchMarquee": True,
                    "fetchSpCarousel": True,
                    "fetchBrandBox": True,
                    "fetchDiscounts": False,
                    "enableItemIbotta": True
                }
            })
            headers2 = {
                'authority': 'www.walmart.com',
                'pragma': 'no-cache',
                'cache-control': 'no-cache',
                'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="99", "Google Chrome";v="99"',
                'x-o-correlation-id': 'EtyZZHcTSBCiVMxl4i44a6h_Gzqf1F-MkpXd',
                'x-o-item-id': str(page_num),
                'device_profile_ref_id': '-1fXNwg-2wCxSXoXmzk8jg4T_XgDMMHhHTMN',
                'x-latency-trace': '1',
                'wm_mp': 'true',
                'wm_page_url': 'https://www.walmart.com/ip/Equate-Maximum-Strength-Severe-Allergy-Plus-Sinus-Headache-Caplets-20-Count/14053317',
                'x-o-platform-version': 'main-1.3.0-e51fc3',
                'x-o-segment': 'oaoh',
                'calltype': 'CLIENT',
                'x-o-gql-query': 'query ItemById',
                'x-o-bu': 'WALMART-US',
                'x-apollo-operation-name': 'ItemById',
                'ip-referer': '',
                'sec-ch-ua-platform': '"Linux"',
                'traceparent': 'EtyZZHcTSBCiVMxl4i44a6h_Gzqf1F-MkpXd',
                'x-o-mart': 'B2C',
                'sec-ch-ua-mobile': '?0',
                'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36',
                'x-o-platform': 'rweb',
                'content-type': 'application/json',
                'accept': 'application/json',
                'is-variant-fetch': 'false',
                'x-enable-server-timing': '1',
                'x-o-ccm': 'server',
                'wm_qos.correlation_id': 'EtyZZHcTSBCiVMxl4i44a6h_Gzqf1F-MkpXd',
                'origin': 'https://www.walmart.com',
                'sec-fetch-site': 'same-origin',
                'sec-fetch-mode': 'cors',
                'sec-fetch-dest': 'empty',
                'referer': 'https://www.walmart.com/ip/Equate-Maximum-Strength-Severe-Allergy-Plus-Sinus-Headache-Caplets-20-Count/14053317',
                'accept-language': 'en-US,en;q=0.9',
                'cookie': '_pxvid=10811607-f238-11ec-a720-4e756c594d76; ACID=2263b9c6-4e5a-44ce-a9da-05e028a9b8c7; hasACID=true; vtc=ShJZRQkr5ADHxW2cc6mpW0; TBV=7; adblocked=false; locGuestData=eyJpbnRlbnQiOiJTSElQUElORyIsImlzRXhwbGljaXQiOmZhbHNlLCJzdG9yZUludGVudCI6IlBJQ0tVUCIsIm1lcmdlRmxhZyI6ZmFsc2UsImlzRGVmYXVsdGVkIjpmYWxzZSwicGlja3VwIjp7Im5vZGVJZCI6IjMwODEiLCJ0aW1lc3RhbXAiOjE2NTU5MDgzNjMwNTF9LCJwb3N0YWxDb2RlIjp7InRpbWVzdGFtcCI6MTY1NTkwODM2MzA1MSwiYmFzZSI6Ijk1ODI5In0sInZhbGlkYXRlS2V5IjoicHJvZDp2MjoyMjYzYjljNi00ZTVhLTQ0Y2UtYTlkYS0wNWUwMjhhOWI4YzcifQ%3D%3D; tb_sw_supported=false; auth=MTAyOTYyMDE4fNO1docV1h53scwTVZ09zkq8CMnyd0xQJ2B%2BWcsOyT6LROWRUsmVs%2Bp%2BqdGXlTGcs43hwU%2BCNtvvfjRjs7lCjIvrK3NOp%2FSayADm%2FauT3or57dQr8nIRizNO0Go9X0h5767wuZloTfhm7Wk2KcjygsAEeU%2BeKCMhfP9XV060SY%2FgcNaenrudSCZlEFJXNSGF6XDQTWwNYV8JcXQmjWW2gCuRvXtkjUhxyHY9czMQpG4UMk70P8glgOEpLOprhDfMM%2FFHGZ2dCNmxWrdkwqEKrhrUOgaJ2pqn5A3SHetSvGPUoDKun8p%2FM%2BS69xpJ5GAkRAvQ9UuySW7l7kgiPOgyF60jwwfDPWGSAbjbNQ7pWssaQQ2kgt4PeOLZzshCiRmgr%2B51HN4wOouRjTjluv08HZE5WBBdZBCyKnCQAR7o6eg%3D; assortmentStoreId=3081; hasLocData=1; TB_Latency_Tracker_100=1; TB_Navigation_Preload_01=1; TB_SFOU-100=; bstc=bdAHA-WWcfXk0PMnP-uT0o; mobileweb=0; xpa=3Fi1g|3_gkh|3pRU7|4NCWH|55b29|5_9FA|DAwQd|Ecx7k|EjkLl|FYe-R|Hv6FZ|LTD5Y|LguYm|NbUbl|NoJl6|O1c3v|OuwKl|Pgtnl|Q-bGe|TMjj7|V0SkO|VAuQw|_hSAz|cL8HI|ccWng|cfVAR|duBe9|eEnay|eWARP|hGNr-|hPI48|hqy5q|jUi64|kFqfr|kLRY3|lQHtM|rdfjX|wGrec|zCylr; xpm=1%2B1656425217%2BShJZRQkr5ADHxW2cc6mpW0~%2B0; exp-ck=3_gkh24NCWH155b2925_9FA1DAwQd3Ecx7k1EjkLl1FYe-R1NbUbl1O1c3v1OuwKl1Pgtnl1V0SkO1_hSAz1cL8HI1ccWng2eEnay2eWARP1hGNr-1jUi641kFqfr1kLRY31lQHtM1; helpgql=1; _pxhd=f5c62b38667d2415146fc6f1cf93cdcc3327afc72fb672b6c09d362415e56f38:10811607-f238-11ec-a720-4e756c594d76; ak_bmsc=1BE44332F7F4F1A06C35DB50CE7A244A~000000000000000000000000000000~YAAQLcMTArBBZZeBAQAArhKjqhBsrms6EgHZfDiLKYf+xjOEF+/vrnzQPmoENr4GabQK+uGCmEd636jjgHyE5IXaf6eZM80f5m5gykIdVmdAcFD78W+uLxPV7zlVbetRHR4yQ3osOU9yTbAIxm5And82l8zx1c+OLZZDAn6cC8CKtnboNmXITB0mT+0BxrSMWr6FUUHNgN0BjPwCEW0NqdGPe6o57ttnUoJELTEeXnUdiAZB784srPtFJgZmxk8F6jADpYMaSrqGlGv+Uh2jVAHJRIYeQU3kcxbocc7/nJrGHHFJn0lbXOoltW7Qz8AFPIgHhHWP9/2COrQo/EWwZm/88zqCi9/l7kD7dtgabe1ICq/CgDz4rl41QKOe+lZYQVf9uqF0CrTfea/anOjVdEHEXZvaGq+803mSqffDitzIXTNDym8PnPeMFa4WJTqCF/apIJiT2rvLKPRgn2CsOlxCTkHlAuCUJG8JYWYGloDso0ULfjd/xEhxhe6CCy6HYwJ3E8fkVF8i5dG04U+o/URP05RiYZc8F+F1dMl91Q==; bm_sv=84EAF547030A90DB5BCD4679165B254D~YAAQLcMTAvZBZZeBAQAAthWjqhAc7AUJuL6t1BdztQNVwhWMVE5DVAXWjtGVZh8Xm0VgJ47u+ho0b3dkGGOwoGGOwbooZSpjcjxOOiEXIzTZgWRkQQeyTz5OB8ixRgE2Fqg1vQiCeOZkQWRmu+zj+P9ZwuNPusS4/dIStSfzYA4sDRQ6KPwQMqsMLFvPhDxjiR/ByNKugRT//CgOWQgb12FlDL5o/kJUQaRsUA00Dc9zKt/XUexTgCG91S5atV9iXQ==~1; locDataV3=eyJpc0RlZmF1bHRlZCI6ZmFsc2UsImlzRXhwbGljaXQiOmZhbHNlLCJpbnRlbnQiOiJTSElQUElORyIsInBpY2t1cCI6W3siYnVJZCI6IjAiLCJub2RlSWQiOiIzMDgxIiwiZGlzcGxheU5hbWUiOiJTYWNyYW1lbnRvIFN1cGVyY2VudGVyIiwibm9kZVR5cGUiOiJTVE9SRSIsImFkZHJlc3MiOnsicG9zdGFsQ29kZSI6Ijk1ODI5IiwiYWRkcmVzc0xpbmUxIjoiODkxNSBHZXJiZXIgUm9hZCIsImNpdHkiOiJTYWNyYW1lbnRvIiwic3RhdGUiOiJDQSIsImNvdW50cnkiOiJVUyIsInBvc3RhbENvZGU5IjoiOTU4MjktMDAwMCJ9LCJnZW9Qb2ludCI6eyJsYXRpdHVkZSI6MzguNDgyNjc3LCJsb25naXR1ZGUiOi0xMjEuMzY5MDI2fSwiaXNHbGFzc0VuYWJsZWQiOnRydWUsInNjaGVkdWxlZEVuYWJsZWQiOnRydWUsInVuU2NoZWR1bGVkRW5hYmxlZCI6dHJ1ZSwiaHViTm9kZUlkIjoiMzA4MSIsInN0b3JlSHJzIjoiMDY6MDAtMjM6MDAiLCJzdXBwb3J0ZWRBY2Nlc3NUeXBlcyI6WyJQSUNLVVBfQ1VSQlNJREUiLCJQSUNLVVBfSU5TVE9SRSJdfV0sInNoaXBwaW5nQWRkcmVzcyI6eyJsYXRpdHVkZSI6MzguNDc0NCwibG9uZ2l0dWRlIjotMTIxLjM0MzcsInBvc3RhbENvZGUiOiI5NTgyOSIsImNpdHkiOiJTYWNyYW1lbnRvIiwic3RhdGUiOiJDQSIsImNvdW50cnlDb2RlIjoiVVNBIiwiZ2lmdEFkZHJlc3MiOmZhbHNlfSwiYXNzb3J0bWVudCI6eyJub2RlSWQiOiIzMDgxIiwiZGlzcGxheU5hbWUiOiJTYWNyYW1lbnRvIFN1cGVyY2VudGVyIiwiYWNjZXNzUG9pbnRzIjpudWxsLCJzdXBwb3J0ZWRBY2Nlc3NUeXBlcyI6W10sImludGVudCI6IlBJQ0tVUCIsInNjaGVkdWxlRW5hYmxlZCI6ZmFsc2V9LCJkZWxpdmVyeSI6eyJidUlkIjoiMCIsIm5vZGVJZCI6IjMwODEiLCJkaXNwbGF5TmFtZSI6IlNhY3JhbWVudG8gU3VwZXJjZW50ZXIiLCJub2RlVHlwZSI6IlNUT1JFIiwiYWRkcmVzcyI6eyJwb3N0YWxDb2RlIjoiOTU4MjkiLCJhZGRyZXNzTGluZTEiOiI4OTE1IEdlcmJlciBSb2FkIiwiY2l0eSI6IlNhY3JhbWVudG8iLCJzdGF0ZSI6IkNBIiwiY291bnRyeSI6IlVTIiwicG9zdGFsQ29kZTkiOiI5NTgyOS0wMDAwIn0sImdlb1BvaW50Ijp7ImxhdGl0dWRlIjozOC40ODI2NzcsImxvbmdpdHVkZSI6LTEyMS4zNjkwMjZ9LCJpc0dsYXNzRW5hYmxlZCI6dHJ1ZSwic2NoZWR1bGVkRW5hYmxlZCI6dHJ1ZSwidW5TY2hlZHVsZWRFbmFibGVkIjp0cnVlLCJhY2Nlc3NQb2ludHMiOlt7ImFjY2Vzc1R5cGUiOiJERUxJVkVSWV9BRERSRVNTIn1dLCJodWJOb2RlSWQiOiIzMDgxIiwiaXNFeHByZXNzRGVsaXZlcnlPbmx5IjpmYWxzZSwic3VwcG9ydGVkQWNjZXNzVHlwZXMiOlsiREVMSVZFUllfQUREUkVTUyJdfSwiaW5zdG9yZSI6ZmFsc2UsInJlZnJlc2hBdCI6MTY1NjQ0NjgyMjk5NCwidmFsaWRhdGVLZXkiOiJwcm9kOnYyOjIyNjNiOWM2LTRlNWEtNDRjZS1hOWRhLTA1ZTAyOGE5YjhjNyJ9; _px3=fd4a806b205916413bf99a01e942ff9336232e851610f563e729bd8270721edd:bOVkbtqNZf6CXjv41nbl5RGLFNxuANcsgSOgoqttHpCpEzg2Mto0wjrxDFfh6zUSiA5wDDm5rTHKfX2lAiPg0Q==:1000:zDPkfwX/OSrZ75Ggjs1Krpm4L6f17sXBDANaE4TV+9j6Y6dnGPRddnxuV+8zV6iiq/iJexlqrtw3brpn59WivGDsHGwucjfO5cRyfNrUryok4xbUwr1yK/iAyP1t4vdvf8bS4jGOBM9xp8zMe44W7tOveajOsuF64IAfP4GPGNBGPUZkIE3I+bocrKQJA7sdD12/BOw6goT1VCddGHvnEQ==; QuantumMetricSessionID=18cfa4319b8ef1e79540e53bae1b9f4a; QuantumMetricUserID=86906ef3aba513b1593763543a679f7b; xptwg=3560776434:CF9364D4F670D8:21A39D6:281D1B4C:2C081154:A12BB2D4:; TS01b0be75=01538efd7cb337960d25ee6309ddbe2d9b73d42de4db21f151df666ad7ab7cde6a6e48b30e01e0ae9a71bdd492f4d3cefe3367d67f; TS013ed49a=01538efd7cb337960d25ee6309ddbe2d9b73d42de4db21f151df666ad7ab7cde6a6e48b30e01e0ae9a71bdd492f4d3cefe3367d67f; akavpau_p2=1656426003~id=7229bf055a987cf4ec0baf62877b3b53; _astc=f75dda122e22d06cf00905ef84d586f8; pxcts=f6d5b40e-f6eb-11ec-b18e-5746686f6a76; _pxff_cfp=1'
            }
            try:
                response2 = requests.request("POST", url2, headers=headers2, data=payload2, proxies=proxies,timeout=10)
                content2 = response2.json()
            except Exception as e:
                end_time = time.perf_counter()
                continue

            _now2 = datetime.datetime.now(tz=utc_tz)

            try:
                product_price = content2['data']['product']['priceInfo']['currentPrice']['price']

            except Exception as e:
                product_price = None

            try:
                product_gtin = content2['data']['product']['upc']
            except Exception as e:
                product_gtin = None
            try:
                pruduct_time = _now2.strftime("%Y-%m-%d %H:%M:%S")
                _dict[i] = {
                    'page_num': page_num,
                    'product_gtin': product_gtin,
                    'store_id': store_id,
                    'product_price': product_price,
                    'zip_code': zip_code,
                    'pruduct_time': pruduct_time
                }

                with open(output_csv_file, "a", encoding='utf-8') as fw2:
                    new_line = f'{page_num},{product_gtin},{zip_code},{store_id},{product_price},{pruduct_time}\n'
                    fw2.write(new_line)
                    if output_csv_file not in csv_list:
                        csv_list.append(output_csv_file)
                    end_time = time.perf_counter()
            except Exception as e:
                end_time = time.perf_counter()

    # print(_dict)
    with open(output_json_file, "w", encoding='utf-8') as fw3:
        fw3.write(json.dumps(_dict))
    save_dict[num] = _dict
    return _dict

if __name__ == '__main__':
    # output_csv_file = 'output.csv'

    if not os.path.exists('output'):
        os.makedirs('output')
    else:
        pass
    if not os.path.exists('backup'):
        os.makedirs('backup')
    else:
        pass
    utc_tz = pytz.timezone('UTC')
    _now = datetime.datetime.now(tz=utc_tz)

    ip_list = [
        '45.142.28.83:8094',
        '45.137.60.112:6640',
    ]
    url_list = [
        # 'https://www.walmart.com/browse/health/allergy-and-sinus/976760_3771182',
        'https://www.walmart.com/ip/Allegra-Adult-24HR-Gelcaps-24-Ct-180-mg-Allergy-Relief/43819800',
        'https://www.walmart.com/ip/Zyrtec-24-Hour-Allergy-Relief-Tablets-with-10-mg-Cetirizine-HCl-90-ct/224749468?athbdg=L1600',
        'https://www.walmart.com/ip/Equate-Maximum-Strength-Severe-Allergy-Plus-Sinus-Headache-Caplets-20-Count/14053317'
    ]
    zip_code_list = [
        10003,
        48104
    ]

    start_time = time.perf_counter()
    save_dict = {}

    thread_num = 1
    one_thread_url_num = 3
    pool = threadpool.ThreadPool(thread_num)

    param_list = []
    csv_list = []
    for i in range(thread_num):
        save_dict[i + 1] = {}
        start_url_num = i * one_thread_url_num
        end_url_num = start_url_num + one_thread_url_num
        if end_url_num >= len(url_list):
            param_list.append(([url_list[start_url_num:], zip_code_list, ip_list, _now, save_dict, i + 1,
                                csv_list, utc_tz], None))
            print("start_url_num", start_url_num, end_url_num)

        else:
            param_list.append(([url_list[start_url_num:end_url_num], zip_code_list, ip_list, _now, save_dict, i + 1,
                                csv_list, utc_tz], None))
            print("start_url_num", start_url_num, len(url_list))
    tasks = threadpool.makeRequests(main, param_list)
    [pool.putRequest(task) for task in tasks]

    pool.wait()
myss37ts

myss37ts1#

我不知道这是否有帮助,但他们使用了大量的cookie。以下是我的系统,但你必须检查你的代理,如果它仍然工作:

import requests

cookies = {
    'TB_Latency_Tracker_100': '1',
    'TB_Navigation_Preload_01': '1',
    'TB_SFOU-100': '',
    'vtc': 'XlaYVaEkh8T8kzzvS_hIbk',
    'bstc': 'XlaYVaEkh8T8kzzvS_hIbk',
    'mobileweb': '0',
    'xpa': '2Oqb6|3_gkh|4NCWH|4QHB7|77hfu|BIcmp|CN28l|DAwQd|E4WND|FYe-R|LTD5Y|LguYm|PNKHT|Q-bGe|Uf0oj|V0SkO|VQW_o|ZZeL5|_hSAz|_nCIe|cfVAR|fSXlM|fsM8g|gVG-b|hPI48|hqy5q|mrxWN|myr6S|nzyw-|u_9Xw|uru_L|w_GEw|xCzID|yQ2ZK|ymjfc|yxNJ6',
    'xpm': '1%2B1657553050%2BXlaYVaEkh8T8kzzvS_hIbk~%2B0',
    'exp-ck': '3_gkh24NCWH177hfu1DAwQd3E4WND1FYe-R1V0SkO1VQW_o1_hSAz1fsM8g1gVG-b1nzyw-1u_9Xw1w_GEw1yxNJ63',
    '_pxhd': '50d3bc3fa5826eff6126ffc7af49a2537ce11bb3e08dd63bd36c5d81bf0bd889:836e203e-012d-11ed-ab53-626a69424550',
    'TS01b0be75': '01538efd7cc6666f1a2641aa6310df48b0aa83bef1d4e8c021bebea253322db7f36548b6555ae1f2d25935d08c0152b6e024d98a68',
    'TS013ed49a': '01538efd7c23b1cfcf8ec2a792dcc79a4900439ee48c91b75191292f79f95af4844015f66e746c23c227f7624f91d1aa5a91748452',
    'ak_bmsc': 'CB7DC8666D2ADCC2DA7D98F41C4E9A7C~000000000000000000000000000000~YAAQheXKF4xyNa+BAQAA4ebc7RCezmbxjoNQoldyymwp47ytWLpq6whkKk4LCyg8v6FlQBzYDJfjbRhZZHaJgXt9JkKmMz6BgpedzCBNMhqkB3cXswVF4q7baZyrBrfdfLuXzPraZnAEmrFwLexgzzVjFyYUPv8YPu4soXYRalAMzdSdvn2F66ETsPJkTGTEXp5qbqfgc/bo7VCofPdS8dZsKDO9PCCpG9Z+qovsC0hoQGyzU+MRxzXfhPLTphj8YXenXxdpF3J6mH2favo+o49v7weQuyYzZQqlnd7UgQy8HQcyxt7+DjYZBC4I4XKiRGH+ad5+Z3nWGfFz4ObNwjzNKKqArzupz0hZtsMGT9wS+exN4O785ITA7DIgoOh4ZqWmpIhFjRy9eyhb8hXNbgIxJHGOxa2FwqwAoqajFBH1vSmgQ1H/bAvZctXEdgbrQkTwcl3v/RQQUona04Wel8J7XH16aJn9/LbUM3buDWFCy/IBDIZWSrvEBTUiqJF4sSNW/ApxjtAznXvPoCgyF7enCZ0f8srzYhOiOr4lqygcvHv5am/xw9yHpDLtxrMSpmEmiyP7GT11POiYXq/3jzf44fkCDsVIs6zcXtEiKLKDgQ==',
    'AID': 'wmlspartner%253D0%253Areflectorid%253D0000000000000000000000%253Alastupd%253D1657553061915',
    'com.wm.reflector': '"reflectorid:0000000000000000000000@lastupd:1657553501450@firstcreate:1657553050889"',
    'auth': 'MTAyOTYyMDE4iuXLjWxh0mezZct6vG1mZ4wY6TQw3O%2BIEPH4NvDUyfhmu69uFbbN7n5Ot9VlhfxBLpViBXYhsdueTzHZvs4wQ6kPF6QDAwmzlx0pLYyOU0rtymbnequDBQLbnVN3KRJI767wuZloTfhm7Wk2KcjygjFwIZIekXC4wlSRgDWHtlx2%2B7u9g5NTsQKQz7FjX7I1EGqBWHzo5aIFd840Vx0asLvXA71KfiseySlLrBNPmhgUMk70P8glgOEpLOprhDfMM%2FFHGZ2dCNmxWrdkwqEKro2YwNEhLNUXBJiHjR181XWEdsdURjqSmrNlL12kr9b5bVWucTpKvbLLYqopaNh%2BgMUjRmku4Mft2op72ZGvimBOMeROjxWV0iYNj7aJ1x%2BzjCy5WWSQ4FE4inBBtr0Bo0jyrOXbKKhH072NS%2FW0j%2FU%3D',
    'ACID': 'a24c281a-1f90-4388-9f48-31e6845bed35',
    'hasACID': 'true',
    'assortmentStoreId': '3081',
    'hasLocData': '1',
    'helpgql': '1',
    'xptwg': '4235111632:1423B1873CDF6C0:342F2AE:564BAE52:5BF31DF4:A280DF5:',
    'akavpau_p2': '1657554102~id=6fbfd710b5fb2c3f267671836a21ec64',
    'TBV': '7',
    '_astc': '659a26f43e57a7d07a8c8bdd26c6b359',
    'adblocked': 'false',
    'pxcts': '849193ee-012d-11ed-8c9e-5a504d7a4563',
    '_pxvid': '836e203e-012d-11ed-ab53-626a69424550',
    'bm_sv': 'EFB76C439E0BFA2E7F8687F86EAECD61~YAAQluXKF9rngt2BAQAANZ7c7RBjHSuRhZ3PpiZhjaxEsgd657ef8/UgREPshx0Cz2F8DS+VuFuaNl9E3dUTTL2JDpsOkhKl5URBoGmA51ygaSk4uHNzIwyHfifRojFGEVjBWOZRFsEH0YIGLyYs1T733sNKaMJVsUUoTvKT56mIorJz4/U0MOGJ5dCRIACqtWmTzwPfjaLdu6aysln026JlI3w+qZWG87OWlVERzynoD1CGduLg9C+5ygeQV++lLQ==~1',
    'locDataV3': 'eyJpc0RlZmF1bHRlZCI6ZmFsc2UsImlzRXhwbGljaXQiOmZhbHNlLCJpbnRlbnQiOiJTSElQUElORyIsInBpY2t1cCI6W3siYnVJZCI6IjAiLCJub2RlSWQiOiIzMDgxIiwiZGlzcGxheU5hbWUiOiJTYWNyYW1lbnRvIFN1cGVyY2VudGVyIiwibm9kZVR5cGUiOiJTVE9SRSIsImFkZHJlc3MiOnsicG9zdGFsQ29kZSI6Ijk1ODI5IiwiYWRkcmVzc0xpbmUxIjoiODkxNSBHZXJiZXIgUm9hZCIsImNpdHkiOiJTYWNyYW1lbnRvIiwic3RhdGUiOiJDQSIsImNvdW50cnkiOiJVUyIsInBvc3RhbENvZGU5IjoiOTU4MjktMDAwMCJ9LCJnZW9Qb2ludCI6eyJsYXRpdHVkZSI6MzguNDgyNjc3LCJsb25naXR1ZGUiOi0xMjEuMzY5MDI2fSwiaXNHbGFzc0VuYWJsZWQiOnRydWUsInNjaGVkdWxlZEVuYWJsZWQiOnRydWUsInVuU2NoZWR1bGVkRW5hYmxlZCI6dHJ1ZSwiaHViTm9kZUlkIjoiMzA4MSIsInN0b3JlSHJzIjoiMDY6MDAtMjM6MDAiLCJzdXBwb3J0ZWRBY2Nlc3NUeXBlcyI6WyJQSUNLVVBfQ1VSQlNJREUiLCJQSUNLVVBfSU5TVE9SRSIsIlBJQ0tVUF9TUEVDSUFMX0VWRU5UIl19XSwic2hpcHBpbmdBZGRyZXNzIjp7ImxhdGl0dWRlIjozOC40NzQ0LCJsb25naXR1ZGUiOi0xMjEuMzQzNywicG9zdGFsQ29kZSI6Ijk1ODI5IiwiY2l0eSI6IlNhY3JhbWVudG8iLCJzdGF0ZSI6IkNBIiwiY291bnRyeUNvZGUiOiJVU0EiLCJnaWZ0QWRkcmVzcyI6ZmFsc2V9LCJhc3NvcnRtZW50Ijp7Im5vZGVJZCI6IjMwODEiLCJkaXNwbGF5TmFtZSI6IlNhY3JhbWVudG8gU3VwZXJjZW50ZXIiLCJhY2Nlc3NQb2ludHMiOm51bGwsInN1cHBvcnRlZEFjY2Vzc1R5cGVzIjpbXSwiaW50ZW50IjoiUElDS1VQIiwic2NoZWR1bGVFbmFibGVkIjpmYWxzZX0sImRlbGl2ZXJ5Ijp7ImJ1SWQiOiIwIiwibm9kZUlkIjoiMzA4MSIsImRpc3BsYXlOYW1lIjoiU2FjcmFtZW50byBTdXBlcmNlbnRlciIsIm5vZGVUeXBlIjoiU1RPUkUiLCJhZGRyZXNzIjp7InBvc3RhbENvZGUiOiI5NTgyOSIsImFkZHJlc3NMaW5lMSI6Ijg5MTUgR2VyYmVyIFJvYWQiLCJjaXR5IjoiU2FjcmFtZW50byIsInN0YXRlIjoiQ0EiLCJjb3VudHJ5IjoiVVMiLCJwb3N0YWxDb2RlOSI6Ijk1ODI5LTAwMDAifSwiZ2VvUG9pbnQiOnsibGF0aXR1ZGUiOjM4LjQ4MjY3NywibG9uZ2l0dWRlIjotMTIxLjM2OTAyNn0sImlzR2xhc3NFbmFibGVkIjp0cnVlLCJzY2hlZHVsZWRFbmFibGVkIjp0cnVlLCJ1blNjaGVkdWxlZEVuYWJsZWQiOnRydWUsImFjY2Vzc1BvaW50cyI6W3siYWNjZXNzVHlwZSI6IkRFTElWRVJZX0FERFJFU1MifV0sImh1Yk5vZGVJZCI6IjMwODEiLCJpc0V4cHJlc3NEZWxpdmVyeU9ubHkiOmZhbHNlLCJzdXBwb3J0ZWRBY2Nlc3NUeXBlcyI6WyJERUxJVkVSWV9BRERSRVNTIl19LCJpbnN0b3JlIjpmYWxzZSwicmVmcmVzaEF0IjoxNjU3NTc0NjUzOTE4LCJ2YWxpZGF0ZUtleSI6InByb2Q6djI6YTI0YzI4MWEtMWY5MC00Mzg4LTlmNDgtMzFlNjg0NWJlZDM1In0%3D',
    'locGuestData': 'eyJpbnRlbnQiOiJTSElQUElORyIsImlzRXhwbGljaXQiOmZhbHNlLCJzdG9yZUludGVudCI6IlBJQ0tVUCIsIm1lcmdlRmxhZyI6ZmFsc2UsImlzRGVmYXVsdGVkIjpmYWxzZSwicGlja3VwIjp7Im5vZGVJZCI6IjMwODEiLCJ0aW1lc3RhbXAiOjE2NTc1NTMwNTM5MDd9LCJwb3N0YWxDb2RlIjp7InRpbWVzdGFtcCI6MTY1NzU1MzA1MzkwNywiYmFzZSI6Ijk1ODI5In0sInZhbGlkYXRlS2V5IjoicHJvZDp2MjphMjRjMjgxYS0xZjkwLTQzODgtOWY0OC0zMWU2ODQ1YmVkMzUifQ%3D%3D',
    'tb_sw_supported': 'false',
    '_px3': 'b96dfc6014fef380ad61cba68edbb676a83bf96c08ace0fda65c124c5bb282f0:BcNRGhHVyi6rRnjCpbHQUZk2ba5rJozuHoI9cpD/ZshcLZGLfp6RPy9RgdHILyBZDt2du2mgnfsF9NaCeZ5Dqw==:1000:HpsN85rY5JiFHvRBWBDAyFZMrE2ZpvUERcQXxcJYT+PI1Gi4icLg46Syz8Qftk9WEMmKyX2PhdUKiKQhL/qM+8ahFkevvMzmt8m3dmgCUxKjX6Mx06qXFDkRhQ7F1CzxjWv5s2kSsoKKcKpX7+eS1iGuJTYupe7ul2kJoevEo4PfHCvshjOslShXYV68jvpYi12bj5ns3F3KZVla4cJt7A==',
    'bm_mi': '66E015F2E445D4C81204D96C3D0C3D23~YAAQluXKF7Lngt2BAQAAOorc7RDr+Aekmj6zMgMRnKrKdc/NMpos2kYtJ9j4jzVaZDGgfmxmZgC6bar/eW1FwpJGK0C70OHLBmE1l0qosu6sU8q+Jo0qkyHyB00hfNQPuCECBqJgVpraqpDvbQHJRMudMM8gZWovGhuVEVEhduzRBDxYFdln4IQL0M4PNGxUdwZNG99tZjKuReZsdlhdJbnMbGXOrqsTRFtT3B3G45TRVScdeOvZb1uBzdThLJ6OZjK1bra2xFwcmn6ZKJba5dawbMtKnGJabXMSdQgjY6BQO6tGAOPA+ylMwO9Ga3XhmxuFt4C4xbIQlR6acolnQtCN3HFBek9M6+FEtJdXSWk7rnTb5GKuZPDM5XWRiUyQsoxeAPwoi9WdpUEhJXSTwyU=~1',
    'QuantumMetricSessionID': '09e77a0583174de68d659cee4b8e3db3',
    'QuantumMetricUserID': 'efd78f6a336a0b4eaa53debc222de35f',
}

headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0',
    'Accept': 'application/json',
    'Accept-Language': 'en-US,en;q=0.5',
    'Referer': 'https://www.walmart.com/ip/Allegra-Adult-24HR-Gelcaps-24-Ct-180-mg-Allergy-Relief/43819800',
    'x-o-segment': 'oaoh',
    'x-o-platform-version': 'main-1.5.1-fda6c9',
    'x-o-correlation-id': 'gevwerO9jaBLQSLIYQU6Vaqv0gkPbvW8_FEg',
    'wm_qos.correlation_id': 'gevwerO9jaBLQSLIYQU6Vaqv0gkPbvW8_FEg',
    'WM_MP': 'true',
    'x-o-ccm': 'server',
    'x-o-gql-query': 'query AdV2',
    'X-APOLLO-OPERATION-NAME': 'AdV2',
    'x-latency-trace': '1',
    'x-enable-server-timing': '1',
    'traceparent': 'gevwerO9jaBLQSLIYQU6Vaqv0gkPbvW8_FEg',
    'WM_PAGE_URL': 'https://www.walmart.com/ip/Allegra-Adult-24HR-Gelcaps-24-Ct-180-mg-Allergy-Relief/43819800',
    'DEVICE_PROFILE_REF_ID': 'uJjWx6XzZhYBFtUjd8BJmPf1j6eE5EV3woob',
    'x-o-platform': 'rweb',
    'x-o-bu': 'WALMART-US',
    'x-o-mart': 'B2C',
    'Origin': 'https://www.walmart.com',
    'DNT': '1',
    'Connection': 'keep-alive',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
}

json_data = {
    'query': 'query AdV2( $platform:Platform! $pageId:String! $pageType:PageType! $tenant:String! $moduleType:ModuleType! $pageContext:PageContextIn $locationContext:LocationContextIn $moduleConfigs:JSON $adsContext:AdsContextIn $adRequestComposite:AdRequestCompositeIn ){adV2( platform:$platform pageId:$pageId pageType:$pageType tenant:$tenant moduleType:$moduleType locationContext:$locationContext pageContext:$pageContext moduleConfigs:$moduleConfigs adsContext:$adsContext adRequestComposite:$adRequestComposite ){status adContent{type data{__typename...AdDataDisplayAdFragment __typename...AdDataSponsoredProductsFragment}}}}fragment AdDataDisplayAdFragment on AdData{...on DisplayAd{json status}}fragment AdDataSponsoredProductsFragment on AdData{...on SponsoredProducts{adUuid adExpInfo moduleInfo products{...ProductFragment}}}fragment ProductFragment on Product{usItemId offerId badges{flags{__typename...on BaseBadge{id text key query type}...on PreviouslyPurchasedBadge{id text key lastBoughtOn numBought criteria{name value}}}labels{__typename...on BaseBadge{id text key}...on PreviouslyPurchasedBadge{id text key lastBoughtOn numBought}}tags{__typename...on BaseBadge{id text key}}}priceInfo{priceDisplayCodes{rollback reducedPrice eligibleForAssociateDiscount clearance strikethrough submapType priceDisplayCondition unitOfMeasure pricePerUnitUom}currentPrice{price priceString}wasPrice{price priceString}priceRange{minPrice maxPrice priceString}unitPrice{price priceString}}showOptions sponsoredProduct{spQs clickBeacon spTags}canonicalUrl numberOfReviews averageRating availabilityStatus imageInfo{thumbnailUrl allImages{id url}}name fulfillmentBadge classType type showAtc p13nData{predictedQuantity flags{PREVIOUSLY_PURCHASED{text}CUSTOMERS_PICK{text}}labels{PREVIOUSLY_PURCHASED{text}CUSTOMERS_PICK{text}}}}',
    'variables': {
        'adRequestComposite': {},
        'adsContext': {
            'locationContext': {
                'zipCode': '95829',
                'stateCode': 'CA',
                'storeId': '3081',
                'pickupStore': '3081',
                'deliveryStore': '3081',
                'intent': 'SHIPPING',
                'incatchment': True,
            },
            'itemId': '43819800',
            'categoryId': '976760_3771182_1291980',
            'categoryName': 'Health/Allergy and Sinus/Fexofenadine',
            'brand': 'Allegra',
            'productName': 'Allegra Adult 24HR Gelcaps (24 Ct, 180 mg), Allergy Relief',
            'productTypeId': '1278',
            'normKeyword': '',
            'verticalId': '',
            'dedupeList': [],
        },
        'pageContext': {
            'itemContext': {
                'itemId': '43819800',
                'categoryPath': '0:976760:3771182:1291980',
                'categoryPathName': 'Home Page/Health/Allergy and Sinus/Fexofenadine',
                'name': 'Allegra Adult 24HR Gelcaps (24 Ct, 180 mg), Allergy Relief',
                'brand': 'Allegra',
                'partTypeID': '',
                'manufactureNumber': '553243830',
                'aaiaBrand': '',
                'tireSize': '',
                'speedRating': '',
                'loadIndex': '',
                'viscosity': '',
                'type': 'VARIANT',
                'productTypeId': '1278',
            },
        },
        'pageId': '43819800',
        'pageType': 'ITEM',
        'platform': 'DESKTOP',
        'tenant': 'WM_GLASS',
        'locationContext': {
            'storeId': '3081',
            'stateCode': 'CA',
            'zipCode': '95829',
        },
        'moduleConfigs': {
            'moduleLocation': 'middle',
            'lazy': '1500',
        },
        'moduleType': 'SponsoredProductCarousel',
    },
}

response = requests.post('https://www.walmart.com/orchestra/home/graphql', cookies=cookies, headers=headers, json=json_data)
2uluyalo

2uluyalo2#

对于Walmart scraping,您可以使用来自第三方API SerpApi的Walmart Search Engine Results API替代品。这是一个免费的付费API。
它将绕过来自Google和其他搜索引擎的屏蔽(包括CAPTCHA),并且不需要创建解析器和维护它。
每家商店都有自己的Location cookies。你可以在scrape沃尔玛搜索特定商店的博客文章中了解更多。
您可以使用JSON list of supported Walmart Stores从任何商店获取信息(总共4,640个):

# https://serpapi.com/walmart-stores
with open("walmart-stores.json", "r") as file:
  walmart_stores = json.load(file)

为了收集所有页面的信息,你需要使用分页,在我们的例子中使用while循环:

if 'next' in results.get('serpapi_pagination', {}):
    search.params_dict.update(dict(parse_qsl(urlsplit(results.get('serpapi_pagination', {}).get('next')).query)))
else:
    break

检查在线IDE中的代码。

from serpapi import GoogleSearch
from urllib.parse import (parse_qsl, urlsplit)
import os, json

# https://serpapi.com/walmart-stores
with open("walmart-stores.json", "r") as file:
  walmart_stores = json.load(file)
data = []
for store in walmart_stores[:2]:          # get data from first two stores, remove list slicing to get them all
    params = {
        "api_key": os.getenv("API_KEY"),  # serpapi key, https://serpapi.com/manage-api-key 
        "engine": "walmart",              # serpapi parser engine
        "query": "Coffee",                # search query
        "store_id": store["store_id"]     # store ID to filter the products by the specific store only
    }
    search = GoogleSearch(params)         # where data extraction happens
    
    page_num = 0
    
    while True:
        results = search.get_dict()       # JSON -> Python dictionary
    
        for organic_result in results.get("organic_results", []):
            title = organic_result.get("title")
            price = organic_result.get("primary_offer").get("offer_price")
            product_page_url = organic_result.get("product_page_url")
            description = organic_result.get("description")
            thumbnail = organic_result.get("thumbnail")
            rating = organic_result.get("rating")
            reviews = organic_result.get("reviews")
    
            data.append({
                "page": page_num,
                "title": title,
                "price": price,
                "description": description,
                "thumbnail": thumbnail,
                "rating": rating,
                "reviews": reviews,
                "product_page_url": product_page_url
            })
                        
        page_num += 1
        print(page_num)
        
        if 'next' in results.get('serpapi_pagination', {}):
            search.params_dict.update(dict(parse_qsl(urlsplit(results.get('serpapi_pagination', {}).get('next')).query)))
        else:
            break
        
print(json.dumps(data, indent=2))

输出示例:

[
    {
    "page": 16,
    "title": "McCafe Breakfast Blend Ground Coffee, Medium Roast, 30 oz Canister",
    "price": 12.98,
    "description": "mccafe",
    "thumbnail": "https://i5.walmartimages.com/asr/abe0e725-c650-4fd7-8efa-6869f9f5a716.ffdb20396c65dd75ca51af0f7737e299.jpeg?odnHeight=180&odnWidth=180&odnBg=ffffff",
    "rating": 4.8,
    "reviews": 437,
    "product_page_url": "https://www.walmart.com/ip/McCafe-Breakfast-Blend-Ground-Coffee-Medium-Roast-30-oz-Canister/171903071"
  },
  {
    "page": 16,
    "title": "Coffee",
    "price": 20.97,
    "description": "Art-inspired glass cutting boards deliver gorgeous kitchen art and will prove an essential item for food prep! Durable glass surface with smooth beveled glass edges is suitable for ceramic knife use.",
    "thumbnail": "https://i5.walmartimages.com/asr/87f3d528-8e63-4e26-99bc-dfc5dd5d8a93.ff099b1c5c7d911adb4719a006e0be6b.jpeg?odnHeight=180&odnWidth=180&odnBg=ffffff",
    "rating": 0,
    "reviews": 0,
    "product_page_url": "https://www.walmart.com/ip/Coffee/741199677"
  },
  other results...
]

有一篇SerpApi Demo Project: Walmart Coffee Exploratory Data Analysis博客文章展示了使用分页(在每个商店内部)从500家商店中提取咖啡列表数据。
免责声明我为SerpApi工作。

相关问题