Python:从 Google 图像搜索结果中抓取网站 URL

yqkkidmi  于 2023-06-04  发布在  Python
关注(0)|答案(1)|浏览(176)

搜索链接:https://www.google.com/search?tbs=sbi:AMhZZisiaoy1wggx2tclyVPl7ZElZuKcwjhfiYUHVFXr34pc55jcoqk8gusdeUW0_1iysA9-fbuy5vl4ZFPZl-46EcoOGra04IDQDSCBTZpGaaUeO7tw4xLQ2q_159_1GsCOjhyIPi5smZmTTzjezzRsekOALA0u-5GuinrW72FIUSfyc9SsLBqw8DH88ATdRnSefjF3bC9di_1las1jmHga4lAPcWRENSwiSyEMfvNO_1Bh5B8pUtzlXNL4MTx8XdRDUCyT8mt0vqYlG1lymcrV_15Ql6OyfgK9r4CLM0YZ3awnw2kiH60Ft6q1mySWtoXULycNbdgbGPtg1s214kr5G2r_1TnFmeEYTQObQ&hl=en-KR

from bs4 import BeautifulSoup

    search_link = 'https://www.google.com/search?tbs=sbi:AMhZZisiaoy1wggx2tclyVPl7ZElZuKcwjhfiYUHVFXr34pc55jcoqk8gusdeUW0_1iysA9-fbuy5vl4ZFPZl-46EcoOGra04IDQDSCBTZpGaaUeO7tw4xLQ2q_159_1GsCOjhyIPi5smZmTTzjezzRsekOALA0u-5GuinrW72FIUSfyc9SsLBqw8DH88ATdRnSefjF3bC9di_1las1jmHga4lAPcWRENSwiSyEMfvNO_1Bh5B8pUtzlXNL4MTx8XdRDUCyT8mt0vqYlG1lymcrV_15Ql6OyfgK9r4CLM0YZ3awnw2kiH60Ft6q1mySWtoXULycNbdgbGPtg1s214kr5G2r_1TnFmeEYTQObQ&hl=en-KR'

    all_links=[]
    for i in range(1,10):
        url= search_link.format(i)
        #print("url: " +url)
        r = requests.get(url)
        c = r.content
        soup = BeautifulSoup(c, 'html.parser')
        all = soup.find_all('a', {'class': 'ArticleTeaserSearchResultItem_link'}, href=True)
        for item in all:
            print(item)
            print(item['href'])
            all_links.append(item['href'])

    print(all_links)

我从网上找到了一些代码,但它不能正常工作:运行之后,列表是空的。有人知道是什么原因吗?非常感谢。

wsxa1bj1

wsxa1bj11#

你没有指定请求头(headers),所以你的请求只会得到空页面。另外,页面上并不存在 ArticleTeaserSearchResultItem_link 这个 class。你可以试试下面的代码:

from bs4 import BeautifulSoup
import requests

# Reverse-image-search results page to scrape outbound site URLs from.
url = 'https://www.google.com/search?tbs=sbi:AMhZZisiaoy1wggx2tclyVPl7ZElZuKcwjhfiYUHVFXr34pc55jcoqk8gusdeUW0_1iysA9-fbuy5vl4ZFPZl-46EcoOGra04IDQDSCBTZpGaaUeO7tw4xLQ2q_159_1GsCOjhyIPi5smZmTTzjezzRsekOALA0u-5GuinrW72FIUSfyc9SsLBqw8DH88ATdRnSefjF3bC9di_1las1jmHga4lAPcWRENSwiSyEMfvNO_1Bh5B8pUtzlXNL4MTx8XdRDUCyT8mt0vqYlG1lymcrV_15Ql6OyfgK9r4CLM0YZ3awnw2kiH60Ft6q1mySWtoXULycNbdgbGPtg1s214kr5G2r_1TnFmeEYTQObQ&hl=en-KR'

# Browser-like headers: without a real User-Agent Google serves an
# empty/blocked page and nothing can be scraped.
headers = {
  'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
  'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
}

all_links = []
soup = BeautifulSoup(requests.get(url, headers=headers).text, 'lxml')

# Guard: the '#search' container is absent when Google changes its markup
# or blocks the request; the original crashed with AttributeError here.
results = soup.find('div', {'id': 'search'})
if results is not None:
    # href=True skips anchors without an href attribute, which previously
    # made `href.get('href').startswith(...)` raise on None.
    for anchor in results.find_all('a', href=True):
        link = anchor['href']
        # Keep only absolute external links, dropping Google-internal ones.
        if link.startswith('https://'):
            all_links.append(link)

# De-duplicate before printing (order is not significant).
print(list(set(all_links)))

输出:

[
    "https://www.quadrantkindercentra.nl/pedagogisch-werkplan-kdv-villa-kakelbont-2020/?w=5.6.3254479.1.27.32.red+iphone+6s",
    "https://itstechprice.com/apple-iphone-6-price-in-nigeria/",
    "https://olist.ng/mobile_phones-apple-iphone_6",
    "https://naijaprice.com/apple-iphone-prices-in-nigeria/",
    "https://www.walmart.com/browse/cell-phones/apple-ios-prepaid-phones/1105910_4527935_1072335_1231295_1231296",
    "https://www.amazon.in/VcareGadGets-Apple-iPhone-Shining-Gloss/dp/B07FND9S6M", 
    "https://www.amazon.in/VCARE-GADGETS-Marble-White-iPhone/dp/B07P8CQZNY"
]

相关问题