如何使用 Scrapy 抓取 Google Play 商店

8tntrjer  于 2022-11-09  发布在  Go
关注(0)|答案(2)|浏览(279)

我试着用 Scrapy 抓取 Google Play 商店,默认情况下我只能得到 50 个链接,而页面上我总共能看到 257 个链接。所以我还尝试了自定义请求头(request headers)和表单请求(FormRequest),但两种方法都失败了。下面是我收到的错误,请看一下:
2020-10-30 18:16:54 [scrapy.spidermiddlewares.httperror] INFO: Ignoring response: HTTP status code is not handled or not allowed
目标 URL 是 https://play.google.com/store/search?q=quotes&c=apps ,该页面共列出 257 个应用,但默认情况下我只能得到 50 个。我尝试过的代码如下所示,请帮帮我。

from scrapy import Spider
from scrapy.http import Request, FormRequest
from scrapy.utils.response import open_in_browser

class PlaySpider(Spider):
    """Fetch a Google Play search page, then POST to the ``browserinfo`` endpoint.

    The start URL is the search-results page for "quotes". ``parse`` issues a
    follow-up POST to the PlayStoreUi ``browserinfo`` URL and ``parse_play``
    opens whatever comes back in a local browser for manual inspection.
    """

    name = 'play'
    allowed_domains = ['play.google.com']
    start_urls = ['https://play.google.com/store/search?q=quotes&c=apps']

    # First attempt (FormRequest with a form payload), kept for reference.
    # NOTE(review): the values below are already percent-encoded, so passing
    # them through ``formdata`` would presumably encode them a second time.
    # def parse(self, response):
    #     data = {
    #         'f.req': '%5B9%2C1%2C1.25%2C%5Bnull%2C1350%2C2400%5D%2C%5Bnull%2C327%2C1344%5D%2C%5Btrue%2Ctrue%2Ctrue%2Ctrue%5D%2C%5Bfalse%2C2%2C2%5D%5D&',
    #         'at': 'AE2DSODV9YrtVLLv1YugtW097VJD%3A1604056672307&'
    #     }
    #     yield FormRequest(
    #         url='https://play.google.com/_/PlayStoreUi/browserinfo?f.sid=-3103376089553482051&bl=boq_playuiserver_20201027.06_p0&hl=en&authuser=0&soc-app=121&soc-platform=1&soc-device=1&_reqid=4962278&rt=j',
    #         formdata=data,
    #         callback=self.parse_play
    #     )
    #
    # def parse_play(self, response):
    #     open_in_browser(response)

    def parse(self, response):
        """Issue the follow-up POST to the ``browserinfo`` endpoint.

        Args:
            response: Response for the search page in ``start_urls``; its
                content is not used here.

        Yields:
            A single POST ``Request`` whose response is handled by
            ``parse_play``.
        """
        url = 'https://play.google.com/_/PlayStoreUi/browserinfo?f.sid=-3103376089553482051&bl=boq_playuiserver_20201027.06_p0&hl=en&authuser=0&soc-app=121&soc-platform=1&soc-device=1&_reqid=4962278&rt=j'

        # Headers copied from the browser's network panel, minus two groups
        # that must not be sent by hand:
        #   * authority / method / path / scheme are HTTP/2 pseudo-headers,
        #     not real header fields; the URL and ``method=`` carry them.
        #   * content-length was fixed at 182 although this request has no
        #     body; the HTTP client computes the correct value itself.
        headers = {
            'accept': '*/*',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'en-US,en;q=0.9,mt;q=0.8,fr;q=0.7,ru;q=0.6,bn;q=0.5,de;q=0.4',
            'content-type': 'application/x-www-form-urlencoded;charset=UTF-8',
            # SECURITY NOTE(review): this is a hard-coded, account-bound
            # session cookie. It should be loaded from private configuration
            # instead of living in source; left as-is to preserve behavior.
            'cookie': 'SID=2we9KP-jDu8bZ3iag5AcctRssfi1KPyfUWFYpxI2W0TxFwyqOaoCBO3CvfCHuoK60oQS7w.; __Secure-3PSID=2we9KP-jDu8bZ3iag5AcctRssfi1KPyfUWFYpxI2W0TxFwyqO8JF9jd1Qit9bAylaNfesQ.; HSID=AL01amZ-pbltbWMV7; SSID=AfC24bLLuvHWYWazZ; APISID=UWuhW7qZn0Yg6zUk/A21wIyYSi2J4KvZtL; SAPISID=IAzttnMi1S3MdJDv/A4ybYudwcofhwA8gA; __Secure-3PAPISID=IAzttnMi1S3MdJDv/A4ybYudwcofhwA8gA; OTZ=5687668_32_32__32_; NID=204=fblQ_6pXpYwCNy6yN1zQ2EoRT9VaU0_WOpdIxFmAzAKtr0QKP4hwzIj8yU0s2AyeWTWCc9m7tWkeVjTwKXgp4e4cLKB7UGNyuUIJAbmirj9hT3hXFQ4wUvXa-NCgJIJ-38ZiAyfOJSZsVJEVcWodA1nUQzPfaH06WU2SIlwd1M8qK-GEp1MD569Xth3e3BeB8qt9-vIVSibpZc_aVbOKp38p4yshqvBv5LbPajmcuKkP-1QsY3Uwe_b546Ei60KN8eJ44guVRZ6dBZI; 1P_JAR=2020-10-30-11; PLAY_ACTIVE_ACCOUNT=ICrt_XL61NBE_S0rhk8RpG0k65e0XwQVdDlvB6kxiQ8=suvashish.halder@gmail.com; OGPC=19009731-1:19008539-5:19010599-2:19015969-1:19011552-1:; OGP=-19009731:-19015969:-19011552:-19010599:-19008539:; SIDCC=AJi4QfE5wzww8DWa6SMq2omQvpVRaI_7hUuhZGfaOHbga3NwN7OcIiMv9ILYSMgKrY1i4pNwKA; __Secure-3PSIDCC=AJi4QfFU93FMessFFLviRRPm3buykQeAylLNYGhgFVIrdIde1InWntlWllI0sA3h6dr6EDMgmGQ',
            'origin': 'https://play.google.com',
            'referer': 'https://play.google.com',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4287.0 Safari/537.36 Edg/88.0.673.0',
            'x-same-domain': '1',
        }

        # NOTE(review): no request body is sent. The endpoint presumably
        # expects the 'f.req' / 'at' form fields from the earlier attempt.
        yield Request(url=url,
                      method='POST',
                      dont_filter=True,
                      headers=headers,
                      callback=self.parse_play)

    def parse_play(self, response):
        """Open the endpoint's response in the local web browser for inspection."""
        open_in_browser(response)
jv2fixgn

jv2fixgn1#

你只得到 50 个,是因为其余内容是通过 JavaScript 动态加载的。在浏览器中禁用 JavaScript 后再打开该页面,就能看到同样的结果。

brccelvz

brccelvz2#

您可以使用 SerpApi 的 Google Play Apps Store API 来实现。SerpApi 是一个付费 API,但提供免费套餐。
不同之处在于,您不必从头编写解析器并维护它,也不必自己研究如何实现分页、如何绕过 Google 的封锁,以及如何在需要时进行扩展。可以在 playground 中试用。
在线 IDE 中的代码和示例:

from serpapi import GoogleSearch
from urllib.parse import urlsplit, parse_qsl
import json

params = {
    "api_key": "SerpApi key",     # your serpapi API key
    "engine": "google_play",      # search engine
    "hl": "en",                   # language
    "store": "apps",              # store filter: apps, books, movies
    "gl": "us",                   # country of the search
    "q": "quotes"                 # search query
}

search = GoogleSearch(params)   # where data extraction happens

# accumulates one dict per app across all pages
apps_data = []

# page counter (1-based once the first page has been fetched)
index = 0

apps_is_present = True
while apps_is_present:
    results = search.get_dict()  # JSON -> Python dict (actual data)

    # update page number
    index += 1

    # A response may lack "organic_results" (or a section may lack "items");
    # treat that as an empty page instead of crashing with KeyError.
    for result in results.get("organic_results", []):
        for app in result.get("items", []):
            apps_data.append({
                "page": index,
                "title": app.get("title"),
                "link": app.get("link"),
                "product_id": app.get("product_id"),
                "description": app.get("description"),
                "rating": app.get("rating")
            })

    # If a next page is advertised, copy its query parameters into the
    # search object so the next get_dict() call fetches it; otherwise stop.
    next_url = results.get("serpapi_pagination", {}).get("next")
    if next_url:
        search.params_dict.update(dict(parse_qsl(urlsplit(next_url).query)))
    else:
        apps_is_present = False

print(json.dumps(apps_data, indent=2, ensure_ascii=False))

输出:

[
  {
    "page": 1,
    "title": "Motivation - Daily quotes",
    "link": "https://play.google.com/store/apps/details?id=com.hrd.motivation",
    "product_id": "com.hrd.motivation",
    "description": "Positive quote reminders of the day",
    "rating": 4.9
  }, ... other results
  {
    "page": 5,
    "title": "Dirty Mind Quotes",
    "link": "https://play.google.com/store/apps/details?id=moon.dirty.quotes.dirtyquotes",
    "product_id": "moon.dirty.quotes.dirtyquotes",
    "description": "Latest Dirty quotes and flirty quotes. #Dirty Quotes",
    "rating": 3.8
  }
]

目前我还没有现成的 DIY 解决方案,等我完成之后会更新这个回答。
免责声明:我在 SerpApi 工作。

相关问题