NodeJS 如何使用Cheerio进行网页抓取

c3frrgcw  于 2023-04-29  发布在  Node.js
关注(0)|答案(1)|浏览(173)

我试过下面这段代码

const cheerio = require("cheerio");
const axios = require('axios');

/**
 * Fetches the Carrefour UAE search page for a fixed keyword, parses the
 * returned HTML with cheerio, and logs an array of `{ title, price }`
 * objects — one per element matching `ul[data-testid]`.
 *
 * Takes no arguments and returns nothing useful; any failure is caught and
 * logged to the console instead of being rethrown.
 */
async function getProducts() {
  const searchUrl = 'https://www.carrefouruae.com/mafuae/en/v4/search?keyword=tomato%20kathcup';

  try {
    const { data } = await axios.get(searchUrl);
    // Kept from the original for parity; awaiting the already-available body is a no-op.
    const markup = await data;
    const $ = cheerio.load(markup);

    // Build one record per matched list, reading both fields relative to it.
    const products = $('ul[data-testid]')
      .toArray()
      .map((node) => {
        const item = $(node);
        return {
          title: item.find('a[data-testid="product_name"]').text().trim(),
          price: item.find('div[data-testid="product_price"] .css-fzp91j').text().trim(),
        };
      });

    console.log(products);
  } catch (failure) {
    console.log(failure);
  }
}

// Start the scrape. getProducts() catches and logs its own errors, so the
// returned promise is intentionally not awaited here.
getProducts();

我需要一个包含标题和价格的产品列表数组,但这段代码返回的是空数组。如何才能获取这些信息?示例链接:https://www.carrefouruae.com/mafuae/en/v4/search?keyword=tomato%20kathcup
同样的抓取方式在亚马逊上可以正常工作,但在这个 Carrefour 网站上却抓取不到数据!

const cheerio = require("cheerio");
const axios = require('axios');

/**
 * Fetches the Carrefour UAE search page for a fixed keyword, parses the
 * returned HTML with cheerio, and logs an array of `{ title, price }`
 * objects — one per element matching `ul[data-testid]`.
 *
 * Takes no arguments and returns nothing useful; any failure is caught and
 * logged to the console instead of being rethrown.
 */
async function getProducts() {
  const searchUrl = 'https://www.carrefouruae.com/mafuae/en/v4/search?keyword=tomato%20kathcup';

  try {
    const { data } = await axios.get(searchUrl);
    // Kept from the original for parity; awaiting the already-available body is a no-op.
    const markup = await data;
    const $ = cheerio.load(markup);

    // Build one record per matched list, reading both fields relative to it.
    const products = $('ul[data-testid]')
      .toArray()
      .map((node) => {
        const item = $(node);
        return {
          title: item.find('a[data-testid="product_name"]').text().trim(),
          price: item.find('div[data-testid="product_price"] .css-fzp91j').text().trim(),
        };
      });

    console.log(products);
  } catch (failure) {
    console.log(failure);
  }
}

// Start the scrape. getProducts() catches and logs its own errors, so the
// returned promise is intentionally not awaited here.
getProducts();

我尝试了上面的代码,期望通过 Node.js 的 cheerio 获取产品的详细信息和价格。

xt0899hw

xt0899hw1#

虽然这是一个老问题,但还是回答一下:如果你不想使用 puppeteer,仍然可以从这个网站(以及其他类似的网站)获取数据,前提是它们在每个页面的 script 标签中以 JSON 格式内嵌了数据。这个网站的数据就放在 <script id="__NEXT_DATA__">...</script> 中。
对于分布在多个页面上的产品,请观察 URL 的变化:在本例中,当产品太多无法一次显示时,会出现一个“加载更多”按钮,单击该按钮后 URL 会从

https://www.carrefouruae.com/mafuae/en/v4/search?keyword=still%20water

变为

https://www.carrefouruae.com/mafuae/en/v4/search?currentPage=1&filter=&keyword=still%20water&pageSize=60&sortBy=relevance

剩下的就是获取页面,将产品添加到数组中,直到没有产品数据返回。

const axios = require('axios');
const cheerio = require('cheerio');
const fs = require('fs');
let fsp = fs.promises;

/**
 * Builds the Carrefour UAE search URL for one page of results.
 *
 * @param {string} keyword - Free-text search term.
 * @param {number} page - Value for the `currentPage` query parameter.
 * @returns {string} The full search URL (60 results per page, sorted by relevance).
 */
function buildSearchUrl(keyword, page) {
    // encodeURIComponent, not encodeURI: the keyword is a query-string *value*,
    // so characters such as `&`, `=`, `+` and `#` must be escaped as well.
    return `https://www.carrefouruae.com/mafuae/en/v4/search?currentPage=${page}&filter=&keyword=${encodeURIComponent(keyword)}&pageSize=60&sortBy=relevance`;
}

/**
 * Picks the fields of interest out of one raw product object.
 * Look at the raw objects to see other fields that are available and add
 * them to the returned record as needed.
 *
 * @param {object} x - One entry of the page's embedded product list.
 * @returns {object} Flattened product record.
 */
function toProductRecord(x) {
    return {
        name: x.name,
        brand: x.brand,
        size: x.size,
        price: x.originalPrice,
        currency: x.currency,
        discount: x.discount,
        url: x.url,
        // Tolerate products without image data instead of throwing.
        image: x.image ? x.image.href : undefined,
        otherImages: Array.isArray(x.imageLibrary) ? x.imageLibrary.map(el => el.href) : [],
        min: x.min,
        max: x.max,
        origin: x.productOrigin,
        supplier: x.supplier
    };
}

/**
 * Extracts the raw product list from the JSON embedded in the page's
 * `<script id="__NEXT_DATA__">` tag.
 *
 * @param {string} html - HTML of a search results page.
 * @returns {object[]} Raw product objects (empty when the page has none).
 * @throws {Error} If the script tag is missing or the product list is not an array.
 */
function extractProducts(html) {
    const $ = cheerio.load(html);
    const raw = $('script#__NEXT_DATA__').text();
    if (!raw) {
        throw new Error('No <script id="__NEXT_DATA__"> payload found in the page');
    }

    const products = JSON.parse(raw).props.initialState.search.products;
    if (!Array.isArray(products)) {
        throw new Error('Unexpected __NEXT_DATA__ shape: search.products is not an array');
    }
    return products;
}

(async () => {

    /**
     * Fetches every results page for `keyword`, starting at `page`, until a
     * page comes back with no products, then writes the collected records to
     * `searchResults.json` once and returns them.
     *
     * @param {string} keyword - Search term.
     * @param {number} [page=0] - First page index to fetch.
     * @returns {Promise<object[]>} All product records found.
     * @throws Propagates network, parsing and file-system errors to the caller
     *         rather than returning `undefined`.
     */
    async function getProducts(keyword, page = 0) {
        const all = [];

        for (let current = page; ; current += 1) {
            const response = await axios.get(buildSearchUrl(keyword, current));
            const products = extractProducts(response.data);

            if (products.length === 0) { // page no longer returns products: stop
                break;
            }
            all.push(...products.map(toProductRecord));
        }

        // write everything to a JSON file, once, after the last page
        await fsp.writeFile('searchResults.json', JSON.stringify(all, null, 2));

        return all;
    }

    const result = await getProducts('still water');

    console.log(result);
    console.log(result.length); // check last product manually as the site displays a wrong product count.

})().catch(err => console.error(err));

相关问题