NodeJS 使用 Node.js 进行网页抓取（web scraping）[已关闭]

cu6pst1q 于 2022-12-26 发布在 Node.js

关注(0)|答案(2)|浏览(119)

已关闭。此问题需要更加聚焦（more focused）。目前不接受回答。
**想要改进此问题吗？**请通过编辑此帖子更新问题，使其只聚焦于一个问题。

2天前关闭。
Improve this question
我需要先显示整个网站（website）的所有图片。我已经获取了网站上所有锚标签（a 标签）的 href，并尝试加载这些链接页面中的图片，但由于循环中的某些错误而失败。结果能显示出来，但显示的并不是各个链接页面中实际的图片。

/**
 * Resolve a possibly-relative reference against a base URL.
 *
 * @param {string|undefined} ref - Raw `href`/`src` attribute value.
 * @param {string} base - Absolute URL of the page the attribute came from.
 * @returns {string|null} Absolute http(s) URL without fragment, or null when
 *   `ref` is missing, malformed, or uses another scheme (mailto:, data:, ...).
 */
function resolveSiteUrl(ref, base) {
  if (!ref) {
    return null;
  }
  try {
    const resolved = new URL(ref, base);
    if (resolved.protocol !== 'http:' && resolved.protocol !== 'https:') {
      return null;
    }
    // "#section" variants point at the same document; drop the fragment.
    resolved.hash = '';
    return resolved.href;
  } catch (err) {
    // new URL() throws on malformed input; treat it as "no usable link".
    return null;
  }
}

/**
 * GET /images
 *
 * Fetches the landing page, follows every same-origin anchor on it, parses
 * EACH fetched page for <img> tags, and collects the absolute image URLs.
 * The URLs are logged, written one per line to ImagesLink.txt, and returned
 * to the client as JSON once every page request has completed.
 */
loginrouter.get('/images', (req, res) => {
  const baseUrl = 'https://nu.edu.pk/';
  const writeStream = fs.createWriteStream("ImagesLink.txt", "UTF-8");

  // Close the file and answer the client exactly once. Only core
  // http.ServerResponse methods are used, so this works with or without Express.
  const finish = (statusCode, payload) => {
    writeStream.end();
    res.statusCode = statusCode;
    res.setHeader('Content-Type', 'application/json');
    res.end(JSON.stringify(payload));
  };

  request(baseUrl, (err, resp, html) => {
    if (err || resp.statusCode !== 200) {
      console.log("Request Failed ");
      finish(502, { error: 'Request Failed' });
      return;
    }
    console.log("Request was success ");

    const $ = cherio.load(html);
    const siteOrigin = new URL(baseUrl).origin;

    // A Set de-duplicates anchors that point at the same page.
    const pageLinks = new Set();
    $("a").each((index, anchor) => {
      const pageLink = resolveSiteUrl($(anchor).attr('href'), baseUrl);
      // Stay on the site being scraped; external links are out of scope.
      if (pageLink !== null && new URL(pageLink).origin === siteOrigin) {
        pageLinks.add(pageLink);
      }
    });

    const imageLinks = new Set();
    let pending = pageLinks.size;
    if (pending === 0) {
      finish(200, { images: [] });
      return;
    }

    pageLinks.forEach((pageLink) => {
      request(pageLink, (pageErr, pageResp, pageHtml) => {
        console.log("Links areeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee/////////////////////////", pageLink);

        if (!pageErr && pageResp.statusCode === 200) {
          // Parse the page that was just fetched. Re-using the landing page's
          // `$` here would list the landing page's images for every link.
          const $page = cherio.load(pageHtml);
          $page("img").each((index2, img) => {
            // Relative src values are relative to the page they appear on.
            const imageLink = resolveSiteUrl($page(img).attr('src'), pageLink);
            if (imageLink !== null && !imageLinks.has(imageLink)) {
              imageLinks.add(imageLink);
              console.log(imageLink);
              writeStream.write(`${imageLink}\n`);
            }
          });
        } else {
          console.log("Request Failed ", pageLink);
        }

        // Requests run concurrently; respond when the last callback has fired.
        pending -= 1;
        if (pending === 0) {
          finish(200, { images: [...imageLinks] });
        }
      });
    });
  });
});

In the output, the lines labelled "Links" show the anchor tag hrefs and the plain lines show the image links, but the image links are incomplete and are not the main images of the pages the hrefs point to.

node.js

来源：https://stackoverflow.com/questions/74891415/webscrapping-using-node-js

2条答案

按热度按时间

wtzytmuj1#

对于抓取，我认为最好使用无头浏览器，如 Puppeteer。一些网站会阻止 AJAX 请求。
下面是一个如何使用 Puppeteer 实现你想要的功能的例子。
1.获取所有a-tag和相应的href
1.转到您检索到的每个链接，并获得所有的图像链接

const puppeteer = require("puppeteer");

(async () => {
let browser;

/**
 * Open a new tab on the shared `browser`, configure it, navigate to `link`,
 * wait for network activity to settle and try to dismiss a cookie banner.
 *
 * @param {string} link - URL to navigate to.
 * @returns {Promise<object>} The ready Puppeteer page; the caller must close it.
 * @throws Re-throws any setup/navigation error after closing the tab.
 */
async function initialisePage(link) {
    const page = await browser.newPage();
    try {
        // page.setDefaultNavigationTimeout(10000);
        // page.setDefaultTimeout(10000);
        await page.setExtraHTTPHeaders({
            'Accept-Language': 'en'
        });
        await page.setGeolocation({
            latitude: 40.75073264981463,
            longitude: -73.9859968851446
        });
        await page.setViewport({ width: 1920, height: 1080});
        await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36');

        // Must be registered BEFORE goto(): the script only runs in documents
        // created after registration, so adding it after navigation would
        // leave the page that was just loaded untouched.
        await page.evaluateOnNewDocument(() => {
            Object.defineProperty(navigator, "language", {
                get: function() {
                    return "en";
                }
            });
            Object.defineProperty(navigator, "languages", {
                get: function() {
                    return ["en"];
                }
            });
        });

        await page.goto(link);

        // Grow the viewport to the full document height.
        const viewPortHeight = await page.evaluate(() => {
            return window.document.documentElement.scrollHeight;
        });

        await page.setViewport({
            width: 1920,
            height: Math.round(viewPortHeight)
        });

        try {
            await page.waitForNetworkIdle(); // wait till all requests have been finished, sometimes websites might fetch data using fetch, ajax and what not
        }
        catch (e) {
            // Best effort: a page that never goes idle is still usable.
            console.error(e);
        }
        //accept cookies
        await page.evaluate(_ => { // script to try to accept cookies, maybe this accept cookies blocks the content
            const iframes = [...document.getElementsByTagName("iframe")];
            for (let iframe of iframes) {
                iframe.remove();
            }
            function xcc_contains(selector, text) {
                const elements = document.querySelectorAll(selector);
                const pattern = RegExp(text, "i"); // build once, not per element
                return Array.prototype.filter.call(elements, function(element){
                    return pattern.test(element.textContent.trim());
                });
            }
            const _xcc = xcc_contains('a, button', '^(Accept All|Accept|I understand|Agree|Okay|Agree & continue|OK)$');
            if (_xcc != null && _xcc.length !== 0) { _xcc[0].click(); }
        });

        /*try { // remove this block if you are sure there are no redirects after accepting cookies
            await page.waitForNavigation({
                timeout: 1000 // only wait 1 seconds because it might not redirect you
            }); // because the accept cookies click might redirect you
        }
        catch (e) {
            console.error("wait nav error", e);
        }*/

        return page;
    }
    catch (e) {
        // Don't leak the tab when setup or navigation fails; the original
        // error is the useful one, so a failure to close is only logged.
        await page.close().catch(closeError => console.error(closeError));
        throw e;
    }
}

// Crawl: collect every unique http(s) link on the landing page, then visit
// each link and gather the `src` of every <img> found there.
try {
    browser = await puppeteer.launch({
        headless: true,
        args: ['--lang=en', '--accept-lang=en', '--no-sandbox']
    });

    const page = await initialisePage('https://nu.edu.pk/');

    let links;
    try {
        links = await page.evaluate(() => { // get all hrefs from all a tags
            const hrefs = [...document.getElementsByTagName("a")]
                .map(anchor => anchor.href)
                // mailto:, tel:, javascript: etc. cannot be navigated to
                .filter(href => /^https?:/i.test(href));
            return [...new Set(hrefs)]; // new set because you have to make sure it's unique
        });
    }
    finally {
        // The landing page is only needed for link extraction; release it
        // before opening one tab per link.
        await page.close();
    }

    const images = [];

    console.log("Getting images for ", links.length, " links...");

    for (const link of links) {
        console.log("Getting images for ", link);
        let linkPage;
        try {
            linkPage = await initialisePage(link);
            const imageLinks = await linkPage.evaluate(() => { // get all image links for the current a tag
                return [...document.getElementsByTagName("img")].filter(value => value.src).map(value => value.src);
            });
            images.push(...imageLinks);
            console.log(imageLinks);
        }
        catch (e) {
            // One broken link must not abort the whole crawl.
            console.error(e);
        }
        finally {
            // Close the tab even when evaluate() threw, otherwise tabs pile up.
            linkPage && await linkPage.close().catch(closeError => console.error(closeError));
        }
    }

    const uniqueImages = new Set(images); // make the imageLinks unique
    console.log(uniqueImages)
}
catch (e) {
    console.error(e);
    throw e;
}
finally {
    browser && await browser.close();
}
})();

赞(0）回复(0）举报 2022-12-26

igetnqfo2#

下面的代码可以正常工作，但存在一些问题
1.硬编码.jpg扩展名
1.它不处理循环链接。
1.它会在遇到无效链接时崩溃
1.你需要使用 async/await 或 Promise 来编写代码。
1.您必须添加额外的逻辑来清理URL
如果你运行下面的代码，它会在一段时间后中断。

const cheerio = require('cheerio');
const request = require('request');
const fs = require('fs');

// Page passed to downloadImages() to start the crawl.
const url = 'https://nu.edu.pk/';

/**
 * Resolve `ref` (optionally relative to `base`) to an http(s) URL.
 *
 * @param {string|undefined} ref - Raw URL, `href` or `src` value.
 * @param {string|URL} [base] - Page the reference was found on.
 * @returns {URL|null} Parsed URL without fragment, or null when `ref` is
 *   missing, malformed, relative without a base, or not http(s).
 */
function toHttpUrl(ref, base) {
  if (!ref) {
    return null;
  }
  try {
    const parsed = new URL(ref, base);
    if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
      return null;
    }
    parsed.hash = '';
    return parsed;
  } catch (err) {
    // new URL() throws on malformed input; the caller just skips the link.
    return null;
  }
}

/**
 * Download every image on `url`, then recurse into every same-origin link.
 *
 * @param {string} url - Absolute http(s) URL of the page to crawl.
 * @param {Set<string>} [visited] - Page and image URLs already handled;
 *   shared across the recursion so cyclic links terminate and each image is
 *   fetched only once.
 * @returns {Promise<void>} Resolves once the page request has been issued
 *   (the crawl itself continues in callbacks). Never rejects for bad URLs.
 */
async function downloadImages(url, visited = new Set()) {
  const pageUrl = toHttpUrl(url);
  if (pageUrl === null || visited.has(pageUrl.href)) {
    return;
  }
  visited.add(pageUrl.href);

  request(pageUrl.href, (error, response, html) => {
    if (error || response.statusCode !== 200) {
      console.error(`Skipping ${pageUrl.href}:`, error ? error.message : response.statusCode);
      return;
    }
    const $ = cheerio.load(html);

    $('img').each((i, image) => {
      const src = $(image).attr('src');
      console.log({ src });
      // Resolve relative to the current page instead of gluing strings.
      const imgUrl = toHttpUrl(src, pageUrl);
      if (imgUrl === null || visited.has(imgUrl.href)) {
        return;
      }
      visited.add(imgUrl.href);

      // Keep the real file name (and extension); the running counter makes
      // the name unique so images from different pages do not overwrite
      // each other.
      const lastSegment = imgUrl.pathname.split('/').pop() || 'image';
      const fileName = `IMAGE_NAME-${visited.size}-${lastSegment.replace(/[^\w.-]/g, '_')}`;
      const imgPath = imgUrl.href;
      console.log({ imgPath });

      // Unhandled 'error' events on either stream would crash the process.
      request(imgPath)
        .on('error', (err) => console.error(`Download failed for ${imgPath}:`, err.message))
        .pipe(fs.createWriteStream(fileName))
        .on('error', (err) => console.error(`Write failed for ${fileName}:`, err.message));
    });

    $('a').each((i, link) => {
      const nextUrl = toHttpUrl($(link).attr('href'), pageUrl);
      // Stay on the starting site; following external links never ends.
      if (nextUrl !== null && nextUrl.origin === pageUrl.origin) {
        downloadImages(nextUrl.href, visited);
      }
    });
  });
}

// Start the crawl at the root URL.
downloadImages(url);

赞(0）回复(0）举报 2022-12-26

我来回答

NodeJS 使用 Node.js 进行网页抓取（web scraping）[已关闭]

2条答案

相关问题

热门标签

最新问答