I'm trying to scrape all pages of search results on Amazon with Python. However, the code below only returns the listings from page 1. Can anyone offer advice on how to collect the remaining pages? Setting an explicit page range did not work.
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
import time
def get_url(search_term):
    template = 'https://www.amazon.com/s?k={}'
    search_term = search_term.replace(' ', '+')
    url = template.format(search_term)
    return url
def scrape_records(item):
    atag = item.h2.a
    description = atag.text.strip()
    url = 'https://amazon.com' + atag.get('href')
    price_parent = item.find('span', 'a-price')
    price = price_parent.find('span', 'a-offscreen').text.strip() if price_parent and price_parent.find('span', 'a-offscreen') else ''
    rating_element = item.find('span', {'class': 'a-icon-alt'})
    rating = rating_element.text.strip() if rating_element else ''
    review_count_element = item.find('span', {'class': 'a-size-base', 'dir': 'auto'})
    review_count = review_count_element.text.strip() if review_count_element else ''
    result = (description, price, rating, review_count, url)
    return result
def scrape_amazon(search_term):
    driver = webdriver.Firefox()
    records = []
    page = 1
    url = get_url(search_term)
    driver.get(url)
    time.sleep(2)  # Add a short delay to let the page load
    while True:
        # Scroll to the bottom of the page to load more items
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)  # Add a short delay to let the page load
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        results = soup.find_all('div', {'data-component-type': 's-search-result'})
        for item in results:
            try:
                record = scrape_records(item)
                records.append(record)
            except Exception as e:
                print(f"Error scraping item: {e}")
        # Check if there is a "Next" button on the page
        pagination_next = []
        for x in soup.find_all('a', {'class': 's-pagination-item' 's-pagination-button'}):
            pagination_next.append(x)
        print(pagination_next)
        if not any(a.get_text() == 'Next' for a in pagination_next):
            break  # Stop scraping if there are no more pages
        page += 1
    driver.close()
    # Process the records
    df = pd.DataFrame(records, columns=['Description', 'Price', 'Rating', 'Review Count', 'URL'])
    return df
# Set the search term
search_term = 'ultrawide monitor'
# Scrape Amazon for the search term
df = scrape_amazon(search_term)
# Export DataFrame to Excel
df.to_excel('output.xlsx', index=False)
Please suggest any way I can make this code scrape every page of the search results.
1 Answer
You can use an updated review-count locator and change the logic so that the loop breaks when the Next button is no longer displayed, and otherwise clicks through to the next page and scrapes it. I have also used undetected-chromedriver with Selenium and passed a fake user agent to avoid bot detection.
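A minimal sketch of that approach, reusing get_url and scrape_records from the question. It assumes the undetected-chromedriver and fake-useragent packages are installed; the Next-button selector (a.s-pagination-next) and the alternative review-count locator mentioned in the comment are assumptions about Amazon's current markup rather than the answer's exact selectors, and may need adjusting:

import time
import pandas as pd
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import undetected_chromedriver as uc

def scrape_amazon(search_term):
    # Spoof a random user agent to reduce the chance of bot detection
    options = uc.ChromeOptions()
    options.add_argument(f'--user-agent={UserAgent().random}')
    driver = uc.Chrome(options=options)

    records = []
    driver.get(get_url(search_term))
    time.sleep(2)

    while True:
        # Scroll to the bottom so lazily loaded results render
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        for item in soup.find_all('div', {'data-component-type': 's-search-result'}):
            try:
                # If the review count comes back empty, a hypothetical updated
                # locator such as item.find('span', {'class': 'a-size-base s-underline-text'})
                # could be tried inside scrape_records instead.
                records.append(scrape_records(item))
            except Exception as e:
                print(f"Error scraping item: {e}")
        # Click the "Next" pagination link; stop when it no longer exists
        try:
            next_link = driver.find_element(By.CSS_SELECTOR, 'a.s-pagination-next')
        except NoSuchElementException:
            break  # Last page reached
        next_link.click()
        time.sleep(2)

    driver.quit()
    return pd.DataFrame(records, columns=['Description', 'Price', 'Rating', 'Review Count', 'URL'])

The key change from the question's code is that the loop actually navigates: the original version detected the Next button but never clicked it (and its selector, 's-pagination-item' 's-pagination-button', concatenates into a single nonexistent class name), so only page 1 was ever parsed. driver.quit() is used instead of driver.close() so the Chrome process started by undetected-chromedriver is fully shut down.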
Output