Python:使用 XPath 无法找到或打印亚马逊页面中的图片链接,但用 BeautifulSoup 可以做到

xxhby3vn  于 2023-03-16  发布在  Python
关注(0)|答案(1)|浏览(101)

这是我的 Python 脚本。我已经尝试过很多方法,但由于我是 XPath 新手,它始终不起作用。

from lxml import html
import csv,os,json
import requests
from exceptions import ValueError
from time import sleep

def AmzonParser(url, max_retries=5):
    """Fetch an Amazon product page and extract product fields via XPath.

    The page is requested with a browser-like User-Agent and parsed with
    ``lxml.html``. Each attempt is preceded by a 3 second pause; a failed
    attempt (network error, non-200 status, parse error) is printed and the
    page is requested again, up to ``max_retries`` attempts in total.

    Args:
        url: Product page URL.
        max_retries: Maximum number of fetch/parse attempts.

    Returns:
        A dict with the keys ``NAME``, ``SALE_PRICE``, ``CATEGORY``,
        ``ORIGINAL_PRICE``, ``AVAILABILITY``, ``URL``, ``DESCRIPTION`` and
        ``IMAGE``. A field whose XPath matched nothing is ``None``, except
        ``ORIGINAL_PRICE`` which falls back to ``SALE_PRICE``. Returns
        ``None`` when every attempt failed.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'
    }

    XPATH_NAME = '//h1[@id="title"]//text()'
    XPATH_SALE_PRICE = ('//span[contains(@id,"ourprice") or '
                        'contains(@id,"saleprice")]/text()')
    XPATH_ORIGINAL_PRICE = ('//td[contains(text(),"List Price") or '
                            'contains(text(),"M.R.P") or '
                            'contains(text(),"Price")]'
                            '/following-sibling::td/text()')
    XPATH_CATEGORY = '//a[@class="a-link-normal a-color-tertiary"]//text()'
    XPATH_AVAILABILITY = '//div[@id="availability"]/span/text()'
    # A leading "///" is not a valid XPath expression; "//" is what was meant.
    XPATH_DESCRIPTION = '//*[@id="productDescription"]/p/text()'
    # "src" is an attribute of <img>, so it must be selected with "@src";
    # a trailing "/src" looks for a child *element* named src and never
    # matches.
    # NOTE(review): the large image may be injected by JavaScript, in which
    # case this still matches nothing in the static HTML -- confirm.
    XPATH_IMAGE = ('//*[@id="main-image-container"]'
                   '/ul/li[5]/span/span/div/img/@src')

    for _ in range(max_retries):
        sleep(3)
        try:
            # Fetch inside the loop so that a retry actually gets a fresh
            # response instead of re-parsing the same failed one.
            page = requests.get(url, headers=headers)
            # Check the status before parsing: an error page is not worth
            # running the XPath queries against.
            if page.status_code != 200:
                raise ValueError('captha')

            doc = html.fromstring(page.content)

            RAW_NAME = doc.xpath(XPATH_NAME)
            RAW_SALE_PRICE = doc.xpath(XPATH_SALE_PRICE)
            RAW_CATEGORY = doc.xpath(XPATH_CATEGORY)
            RAW_ORIGINAL_PRICE = doc.xpath(XPATH_ORIGINAL_PRICE)
            RAW_AVAILABILITY = doc.xpath(XPATH_AVAILABILITY)
            RAW_DESCRIPTION = doc.xpath(XPATH_DESCRIPTION)
            RAW_IMAGE = doc.xpath(XPATH_IMAGE)

            # split()/join collapses runs of whitespace into single spaces.
            NAME = ' '.join(''.join(RAW_NAME).split()) if RAW_NAME else None
            SALE_PRICE = (' '.join(''.join(RAW_SALE_PRICE).split()).strip()
                          if RAW_SALE_PRICE else None)
            CATEGORY = (' > '.join([i.strip() for i in RAW_CATEGORY])
                        if RAW_CATEGORY else None)
            ORIGINAL_PRICE = (''.join(RAW_ORIGINAL_PRICE).strip()
                              if RAW_ORIGINAL_PRICE else None)
            AVAILABILITY = (''.join(RAW_AVAILABILITY).strip()
                            if RAW_AVAILABILITY else None)
            DESCRIPTION = (''.join(RAW_DESCRIPTION).strip()
                           if RAW_DESCRIPTION else None)
            IMAGE = ''.join(RAW_IMAGE) if RAW_IMAGE else None

            if not ORIGINAL_PRICE:
                ORIGINAL_PRICE = SALE_PRICE

            return {
                'NAME': NAME,
                'SALE_PRICE': SALE_PRICE,
                'CATEGORY': CATEGORY,
                'ORIGINAL_PRICE': ORIGINAL_PRICE,
                'AVAILABILITY': AVAILABILITY,
                'URL': url,
                'DESCRIPTION': DESCRIPTION,
                'IMAGE': IMAGE,
            }
        except Exception as e:
            # Best-effort scraper: report the failure and try again.
            print(e)

    # Every attempt failed; give up instead of looping forever.
    return None

def ReadAsin():
    """Scrape every ASIN in the built-in list and save results to data.json.

    For each ASIN the product URL is built, passed to ``AmzonParser`` and the
    result appended to a list. After a 5 second pause the whole list
    collected so far is written to ``data.json`` in the current working
    directory, so partial results are on disk after every product.
    """
    # To read the ASINs from a CSV feed instead, build AsinList from
    # csv.DictReader over "Asinfeed.csv" located next to this script.
    AsinList = ['B008HDREZ6',]
    extracted_data = []
    for i in AsinList:
        url = "http://www.amazon.com/dp/"+i
        print("Processing: "+url)
        extracted_data.append(AmzonParser(url))
        sleep(5)
        # Use a context manager so the file is flushed and closed each time
        # instead of leaking one open handle per iteration.
        with open('data.json','w') as f:
            json.dump(extracted_data,f,indent=4)

 if __name__ == "__main__":
     ReadAsin()

我无法获取图像的链接
下面是html

<div class="imgTagWrapper" style="height: 296px;">
  <img src="https://images-na.ssl-images-amazon.com/images/I/91CtQU45qKL._SY450_.jpg" class="a-dynamic-image a-stretch-vertical" id="" style="max-height: 296px; max-width: 204.282px;" data-old-hires="https://images-na.ssl-images-amazon.com/images/I/91CtQU45qKL._SL1500_.jpg" data-a-manual-replacement="true">
</div>
dy2hfwbg

dy2hfwbg1#

页面使用 JavaScript 将大图放入这个标签中,但 lxml 和 BeautifulSoup 都不能运行 JavaScript。
使用 lxml/BeautifulSoup,您只能通过 '//div[@id="altImages"]//img/@src' 获得页面左侧的小图像(缩略图)。
你可以在<script>标签中找到一些网址。
下面的代码查找包含 data["colorImages"] = 的 <script>,截取其中的数据作为 JSON 字符串,再将其转换为 Python 字典,这样就可以轻松获得多种不同尺寸图像的 URL。

import requests
from lxml import html
import json

# Demo script: list the thumbnail URLs found in the static HTML, then pull
# the image table that the page embeds in a <script> as
# `data["colorImages"] = {...};` and print the URLs for one colour.
url = "http://www.amazon.com/dp/B008HDREZ6"

headers = {
  'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'
}

response = requests.get(url, headers=headers)
doc = html.fromstring(response.content)

print('--- small ---')
XPATH_IMAGE = '//div[@id="altImages"]//img/@src'
RAW_IMAGE = doc.xpath(XPATH_IMAGE)
# NOTE(review): the last match is dropped on purpose by the original author;
# presumably it is not a product thumbnail -- confirm against the page.
print('\n'.join(RAW_IMAGE[:-1]))

print('--- scripts ---')
XPATH_SCRIPTS = '//script'
RAW_SCRIPTS = doc.xpath(XPATH_SCRIPTS)
MARKER = 'data["colorImages"]'
data = ''
for script in RAW_SCRIPTS:
    # .text is None for empty <script> tags and for ones that only carry a
    # src attribute; treat those as empty text instead of crashing.
    text = script.text or ''
    if MARKER in text:
        for line in text.splitlines():
            if MARKER in line:
                #print(line)
                data = line

print('--- data ---')
if not data:
    raise SystemExit('no <script> line containing ' + MARKER + ' was found')
# Keep only the right-hand side of the assignment: everything after the
# marker, minus the "=" and the trailing ";". This does not depend on how
# far the line happens to be indented.
data = data.split(MARKER, 1)[1].strip()
data = data.lstrip('=').strip().rstrip(';')
data = json.loads(data)

# 'Silver' and 'White' are the colour variants of this particular product.
print('keys:', data.keys())
print('keys:', data['Silver'][0].keys())
print('keys:', data['White'][0].keys())

for item in data['Silver']:
    print('variant:', item['variant'])
    print('main:', item['main'])
    print('large:', item['large'])
    print('hiRes:', item['hiRes'])
    print('thumb:', item['thumb'])
    print('-----')

缩略图:

--- small ---
https://images-na.ssl-images-amazon.com/images/I/31bDT3JCmML._SS40_.jpg
https://images-na.ssl-images-amazon.com/images/I/51OGAiwApNL._SS40_.jpg
https://images-na.ssl-images-amazon.com/images/I/519%2B3tR1ObL._SS40_.jpg

JavaScript中的数据:

--- data ---
keys: dict_keys(['Silver', 'White'])
keys: dict_keys(['large', 'variant', 'hiRes', 'thumb', 'main'])
keys: dict_keys(['large', 'variant', 'hiRes', 'thumb', 'main'])

variant: MAIN
main: {'https://images-na.ssl-images-amazon.com/images/I/7152gMAICdL._SX355_.jpg': ['219', '355'], 'https://images-na.ssl-images-amazon.com/images/I/7152gMAICdL._SX522_.jpg': ['323', '522'], 'https://images-na.ssl-images-amazon.com/images/I/7152gMAICdL._SX450_.jpg': ['278', '450'], 'https://images-na.ssl-images-amazon.com/images/I/7152gMAICdL._SX466_.jpg': ['288', '466'], 'https://images-na.ssl-images-amazon.com/images/I/7152gMAICdL._SX425_.jpg': ['263', '425']}
large: https://images-na.ssl-images-amazon.com/images/I/31bDT3JCmML.jpg
hiRes: https://images-na.ssl-images-amazon.com/images/I/7152gMAICdL._SL1500_.jpg
thumb: https://images-na.ssl-images-amazon.com/images/I/31bDT3JCmML._SS40_.jpg
-----
variant: PT01
main: {'https://images-na.ssl-images-amazon.com/images/I/91CtQU45qKL._SY550_.jpg': ['550', '380'], 'https://images-na.ssl-images-amazon.com/images/I/91CtQU45qKL._SY355_.jpg': ['355', '245'], 'https://images-na.ssl-images-amazon.com/images/I/91CtQU45qKL._SY679_.jpg': ['679', '469'], 'https://images-na.ssl-images-amazon.com/images/I/91CtQU45qKL._SY450_.jpg': ['450', '311'], 'https://images-na.ssl-images-amazon.com/images/I/91CtQU45qKL._SY606_.jpg': ['606', '419']}
large: https://images-na.ssl-images-amazon.com/images/I/51OGAiwApNL.jpg
hiRes: https://images-na.ssl-images-amazon.com/images/I/91CtQU45qKL._SL1500_.jpg
thumb: https://images-na.ssl-images-amazon.com/images/I/51OGAiwApNL._SS40_.jpg
-----
variant: PT02
main: {'https://images-na.ssl-images-amazon.com/images/I/91UsOEFbYJL._SX466_.jpg': ['311', '466'], 'https://images-na.ssl-images-amazon.com/images/I/91UsOEFbYJL._SX522_.jpg': ['348', '522'], 'https://images-na.ssl-images-amazon.com/images/I/91UsOEFbYJL._SX450_.jpg': ['300', '450'], 'https://images-na.ssl-images-amazon.com/images/I/91UsOEFbYJL._SX425_.jpg': ['283', '425'], 'https://images-na.ssl-images-amazon.com/images/I/91UsOEFbYJL._SX355_.jpg': ['237', '355']}
large: https://images-na.ssl-images-amazon.com/images/I/519%2B3tR1ObL.jpg
hiRes: https://images-na.ssl-images-amazon.com/images/I/91UsOEFbYJL._SL1500_.jpg
thumb: https://images-na.ssl-images-amazon.com/images/I/519%2B3tR1ObL._SS40_.jpg
-----

相关问题