Selenium: how to scrape all pages of Amazon search results with Python

lf5gs5x2  posted on 2023-05-22 in Python

I am trying to scrape all pages of an Amazon search result with Python. However, the code below only returns the listings on page 1. Can anyone suggest how to collect the remaining pages as well? Setting a specific page range did not work either.

from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
import time

def get_url(search_term):
    template = 'https://www.amazon.com/s?k={}'
    search_term = search_term.replace(' ', '+')
    url = template.format(search_term)
    return url

def scrape_records(item):
    atag = item.h2.a
    description = atag.text.strip()
    url = 'https://amazon.com' + atag.get('href')

    price_parent = item.find('span', 'a-price')
    price = price_parent.find('span', 'a-offscreen').text.strip() if price_parent and price_parent.find('span', 'a-offscreen') else ''

    rating_element = item.find('span', {'class': 'a-icon-alt'})
    rating = rating_element.text.strip() if rating_element else ''

    review_count_element = item.find('span', {'class': 'a-size-base', 'dir': 'auto'})
    review_count = review_count_element.text.strip() if review_count_element else ''

    result = (description, price, rating, review_count, url)
    return result

def scrape_amazon(search_term):
    driver = webdriver.Firefox()
    records = []
    page = 1

    url = get_url(search_term)
    driver.get(url)
    time.sleep(2)  # Add a short delay to let the page load

    while True:
        # Scroll to the bottom of the page to load more items
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)  # Add a short delay to let the page load

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        results = soup.find_all('div', {'data-component-type': 's-search-result'})

        for item in results:
            try:
                record = scrape_records(item)
                records.append(record)
            except Exception as e:
                print(f"Error scraping item: {e}")

        # Check if there is a "Next" button on the page
        pagination_next = []
        for x in soup.find_all('a', {'class': 's-pagination-item' 's-pagination-button'}):
            pagination_next.append(x)
        print(pagination_next)
        if not any(a.get_text() == 'Next' for a in pagination_next):
            break  # Stop scraping if there are no more pages

        page += 1

    driver.close()

    # Process the records
    df = pd.DataFrame(records, columns=['Description', 'Price', 'Rating', 'Review Count', 'URL'])
    return df

# Get user input for the search term
search_term = 'ultrawide monitor'

# Scrape Amazon for the search term
df = scrape_amazon(search_term)

# Export DataFrame to Excel
df.to_excel('output.xlsx', index=False)

Is there any way I can make this code scrape every page of the search results?
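For reference, "setting a specific range" means requesting each page directly, roughly as in the sketch below. The &page= query parameter and the max_pages cap are assumptions made for illustration, and the sketch reuses scrape_records and the imports from the code above; in practice Amazon often answers such direct requests with a robot check or only the first page, which is what the accepted answer works around.

def get_page_url(search_term, page):
    # Assumption: Amazon's search URL accepts an explicit page number via &page=
    template = 'https://www.amazon.com/s?k={}&page={}'
    return template.format(search_term.replace(' ', '+'), page)

def scrape_amazon_by_range(search_term, max_pages=5):
    # max_pages is an illustrative cap, not something Amazon exposes
    driver = webdriver.Firefox()
    records = []
    for page in range(1, max_pages + 1):
        driver.get(get_page_url(search_term, page))
        time.sleep(2)  # crude wait for the page to load
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        results = soup.find_all('div', {'data-component-type': 's-search-result'})
        if not results:
            break  # no listings returned: last page reached or a robot check was served
        for item in results:
            try:
                records.append(scrape_records(item))
            except Exception as e:
                print(f"Error scraping item: {e}")
    driver.close()
    return pd.DataFrame(records, columns=['Description', 'Price', 'Rating', 'Review Count', 'URL'])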

vxqlmq5t1#

You can use the updated review-count locator below and change the logic so that the loop breaks when the Next button is no longer displayed; otherwise it clicks Next and scrapes the following page. In addition, I have used undetected-chromedriver with Selenium and passed a fake user agent to avoid bot detection.

import undetected_chromedriver
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as ExpectedConditions
import pandas as pd
import time
from fake_useragent import UserAgent

from selenium.common import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait

def get_url(search_term):
    template = 'https://www.amazon.com/s?k={}'
    search_term = search_term.replace(' ', '+')
    url = template.format(search_term)
    return url

def scrape_records(item):
    atag = item.h2.a
    description = atag.text.strip()
    url = 'https://amazon.com' + atag.get('href')

    price_parent = item.find('span', 'a-price')
    price = price_parent.find('span', 'a-offscreen').text.strip() if price_parent and price_parent.find('span', 'a-offscreen') else ''

    rating_element = item.find('span', {'class': 'a-icon-alt'})
    rating = rating_element.text.strip() if rating_element else ''

    review_count_element = item.find('span', {'class': 'a-size-base s-underline-text'})
    review_count = review_count_element.text.strip() if review_count_element else ''

    result = (description, price, rating, review_count, url)
    return result

def scrape_amazon(search_term):
    ua = UserAgent()
    options = Options()
    options.add_argument(f"user-agent={ua.random}")
    driver = undetected_chromedriver.Chrome(options=options)
    url = get_url(search_term)
    driver.get(url)
    time.sleep(5)
    records = []
    while True:

        # Scroll to the bottom of the page to load more items
        # Add a short delay to let the page load
        time.sleep(5)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        results = soup.find_all('div', {'data-component-type': 's-search-result'})

        for item in results:
            try:
                record = scrape_records(item)
                records.append(record)
            except Exception as e:
                print(f"Error scraping item: {e}")

        # Check if there is a "Next" button on the page
        try:
            nextButton = driver.find_element(By.XPATH, '//a[text()="Next"]')
            driver.execute_script("arguments[0].scrollIntoView();", nextButton)
            WebDriverWait(driver, 10).until(ExpectedConditions.element_to_be_clickable(nextButton))
            nextButton.click()
        except NoSuchElementException:
            print("Breaking as Last page Reached")
            break

    driver.close()

    # Process the records
    df = pd.DataFrame(records, columns=['Description', 'Price', 'Rating', 'Review Count', 'URL'])
    return df

# Get user input for the search term
search_term = 'ultrawide monitor'

# Scrape Amazon for the search term
df = scrape_amazon(search_term)

# Export DataFrame to Excel
df.to_excel('output.xlsx', index=False)

Output
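As a follow-up on the design, the fixed time.sleep(5) calls could be replaced with an explicit wait on the result cards themselves. Below is a minimal sketch using the same locator as the code above; the helper name wait_for_results is illustrative only.

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_results(driver, timeout=10):
    # Block until at least one search-result card is present instead of sleeping a fixed interval
    WebDriverWait(driver, timeout).until(
        EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, 'div[data-component-type="s-search-result"]')
        )
    )

Calling wait_for_results(driver) right after driver.get(url) and again after each Next click should make the loop both faster and less flaky than fixed sleeps.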
