Python:尝试创建 Python 脚本以返回 Tonie 的时长等信息

2q5ifsrm  于 2024-01-05  发布在  Python
关注(0)|答案(1)|浏览(97)

我正在尝试编写一个脚本,返回 https://tonies.com/en-gb/tonies/ 上所有 Tonie 的播放时长。我还想返回每个 Tonie 的价格,但一直没能实现。我也尝试过用 Selenium 编写脚本,但卡在了接受 cookie 的弹窗上,因为它位于 shadow DOM 中。我想我可能把问题弄得过于复杂了。我是编程和 Python 的新手,任何建议我都非常感谢。脚本目前的版本似乎只抓取了前 21 个条目。

  1. import re
  2. import requests
  3. from bs4 import BeautifulSoup
  4. def get_tonie_info(tonie_url):
  5. response = requests.get(tonie_url)
  6. soup = BeautifulSoup(response.text, 'html.parser')
  7. script_tags = soup.find_all('script')
  8. tonie_info = {'url': tonie_url, 'durations': []}
  9. for script_tag in script_tags:
  10. script_content = script_tag.string
  11. if script_content and 'runTime' in script_content:
  12. matches = re.findall(r'"runTime":\s*(\d+)', script_content)
  13. if matches:
  14. tonie_info['durations'] = list(map(int, matches))
  15. return tonie_info
  16. def scrape_tonies():
  17. all_tonie_info = []
  18. base_url = "https://tonies.com/en-gb/tonies/?page="
  19. page_number = 9 # Only scrape data from page 9
  20. current_url = base_url + str(page_number)
  21. response = requests.get(current_url)
  22. soup = BeautifulSoup(response.text, 'html.parser')
  23. tonie_links = soup.find_all('a', class_='View__StretchedLink-sc-5t9da0-0 ivnTIu')
  24. for tonie_link in tonie_links:
  25. tonie_url = "https://tonies.com" + tonie_link['href']
  26. tonie_info = get_tonie_info(tonie_url)
  27. if tonie_info['durations']:
  28. tonie_info['name'] = tonie_link.text.strip()
  29. tonie_info['duration'] = tonie_info['durations'][-1]
  30. all_tonie_info.append(tonie_info)
  31. else:
  32. print(f"Could not retrieve information for {tonie_url}")
  33. return all_tonie_info
  34. if __name__ == "__main__":
  35. tonies_info = scrape_tonies()
  36. for index, tonie_info in enumerate(tonies_info, start=1):
  37. print(f"Toni {index} Name: {tonie_info['name']}")
  38. print(f" URL: {tonie_info['url']}")
  39. print(f" Duration: {tonie_info['duration']}")

字符串

r7xajy2e

r7xajy2e1#

您可以尝试以 JSON 格式收集各个 Tonie 的数据,然后再进行*后处理*:

  1. import json
  2. url = "https://tonies.com/en-gb/tonies/"
  3. response = requests.get(url) # with optional headers
  4. soup = BeautifulSoup(response.text, "html.parser")
  5. data = (json.loads(soup.select_one("#__NEXT_DATA__").text)
  6. ["props"]["pageProps"]["page"]["productList"]["normalizedProducts"])
  7. use_keys = ["name", "price", "runTime"] # << ask for more if needed
  8. tonies = [
  9. {
  10. k: d.get(k) if k!="price" else d.get(k).get("amount")
  11. for k in use_keys
  12. } for d in data
  13. ]

字符串
输出:

  1. # len(tonies) # 196
  2. print(json.dumps(tonies, indent=4))
  3. [
  4. {
  5. "name": "Chase",
  6. "price": 14.99,
  7. "runTime": 54
  8. },
  9. {
  10. "name": "Elmer and Friends Story Collection",
  11. "price": 14.99,
  12. "runTime": 62
  13. },
  14. {
  15. "name": "Frozen",
  16. "price": 14.99,
  17. "runTime": 24
  18. },
  19. ...
  20. ]

展开查看全部

相关问题