python playwright中的分页处理

nzk0hqpo 于 2023-11-16 发布在 Python

关注(0)|答案(1)|浏览(211)

我正在尝试使用 Playwright 和 Python 抓取这个网站 https://booking.com，但我不知道如何遍历多个结果页进行抓取。我该如何解决这个分页问题？我怎样才能一直循环到最后一页，并抓取其他页面上的数据？这是我的代码：

from playwright.sync_api import sync_playwright
import pandas as pd

def _text_or_none(scope, selector):
    """Return the inner text of ``selector`` within ``scope``, or ``None`` if absent.

    ``Locator.inner_text()`` waits for a matching element and raises a timeout
    error when none appears, so a single card lacking an optional element
    (e.g. no review score yet) would abort the whole scrape. Checking
    ``count()`` first turns a missing element into a ``None`` value instead.
    If more than one element matches, ``inner_text()`` still raises, exactly
    as a direct call would.
    """
    target = scope.locator(selector)
    if target.count() == 0:
        return None
    return target.inner_text()


def _extract_hotel(card):
    """Build the record dict for one property-card locator.

    Every field is ``None`` when the corresponding element is not present in
    the card. ``'reviews count'`` keeps only the first whitespace-separated
    token of the reviews text.
    """
    reviews_text = _text_or_none(card, '//div[@data-testid="review-score"]/div[2]/div[2]')
    # Guard against both a missing element and an empty text node, either of
    # which would make ``.split()[0]`` fail.
    review_tokens = reviews_text.split() if reviews_text else []
    return {
        'hotel': _text_or_none(card, '//div[@data-testid="title"]'),
        'price': _text_or_none(card, '//span[@data-testid="price-and-discounted-price"]'),
        'Nights': _text_or_none(card, '//div[@data-testid="availability-rate-wrapper"]/div[1]/div[1]'),
        'tax': _text_or_none(card, '//div[@data-testid="availability-rate-wrapper"]/div[1]/div[3]'),
        'score': _text_or_none(card, '//div[@data-testid="review-score"]/div[1]'),
        'distance': _text_or_none(card, '//span[@data-testid="distance"]'),
        'avg review': _text_or_none(card, '//div[@data-testid="review-score"]/div[2]/div[1]'),
        'reviews count': review_tokens[0] if review_tokens else None,
    }


def main():
    """Scrape one Booking.com search-results page and export it to disk.

    Opens a visible Chromium window, loads the search results for Medina,
    Saudi Arabia for the hard-coded check-in/check-out dates, reads every
    property card currently on the page, and writes the collected rows to
    ``hotels_list.xlsx`` and ``hotels_list.csv`` in the working directory.

    NOTE: only the results present after the initial page load are read;
    further result pages are not visited.
    """
    checkin_date = '2023-12-10'
    checkout_date = '2023-12-18'
    page_url = f'https://www.booking.com/searchresults.html?ss=Medina%2C+Saudi+Arabia&label=gog235jc-1DCAEoggI46AdIM1gDaMQBiAEBmAExuAEXyAEP2AED6AEB-AECiAIBqAIDuAL4_M2qBsACAdICJDIyNmY5NThlLTdkNjctNDg2Yi05ZDMzLWY3M2JhZmRkZDdhNtgCBOACAQ&aid=397594&lang=en-us&sb=1&src_elem=sb&src=index&dest_id=-3092186&dest_type=city&checkin={checkin_date}&checkout={checkout_date}&group_adults=1&no_rooms=1&group_children=0&sb_travel_purpose=leisure'

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)
        try:
            page = browser.new_page()
            # Playwright timeouts are in milliseconds; this is a deliberately
            # generous navigation budget carried over unchanged.
            page.goto(page_url, timeout=6000000)

            hotels = page.locator('//div[@data-testid="property-card"]').all()
            print(f'There are: {len(hotels)} hotels.')

            hotels_list = [_extract_hotel(hotel) for hotel in hotels]
        finally:
            # Close the browser even if navigation or extraction fails.
            browser.close()

    df = pd.DataFrame(hotels_list)
    df.to_excel('hotels_list.xlsx', index=False)
    df.to_csv('hotels_list.csv', index=False)
        
# Entry-point guard: run the scraper only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()

字符串

python

来源：https://stackoverflow.com/questions/77482863/handling-pagination-in-python-playwright