edu.uci.ics.crawler4j.url.WebURL.setDocid()方法的使用及代码示例

x33g5p2x  于2022-02-03 转载在 其他  
字(3.6k)|赞(0)|评价(0)|浏览(110)

本文整理了Java中edu.uci.ics.crawler4j.url.WebURL.setDocid()方法的一些代码示例,展示了WebURL.setDocid()的具体用法。这些代码示例主要来源于GitHub/Stack Overflow/Maven等平台,是从一些精选项目中提取出来的代码,具有较强的参考意义,能在一定程度上帮助到你。WebURL.setDocid()方法的具体详情如下:
包路径:edu.uci.ics.crawler4j.url.WebURL
类名称:WebURL
方法名:setDocid

WebURL.setDocid介绍

暂无

代码示例

代码示例来源:origin: yasserg/crawler4j

webUrl.setTldList(tldList);
webUrl.setURL(canonicalUrl);
webUrl.setDocid(docId);
webUrl.setDepth((short) 0);
if (robotstxtServer.allows(webUrl)) {

代码示例来源:origin: yasserg/crawler4j

/**
 * Rebuilds a {@link WebURL} from the given tuple input.
 *
 * <p>Fields are consumed from {@code input} in a fixed order: URL (string), doc id (int),
 * parent doc id (int), parent URL (string), depth (short), priority (byte), anchor (string).
 * NOTE(review): this order presumably has to mirror the matching write-side method, which is
 * not visible here; confirm before reordering anything.
 *
 * @param input tuple input positioned at the start of a serialized entry
 * @return a new {@code WebURL} populated with the values read
 */
@Override
public WebURL entryToObject(TupleInput input) {
  final WebURL decoded = new WebURL();

  // Pull every field off the input first, strictly in the serialized order.
  final String url = input.readString();
  final int docId = input.readInt();
  final int parentDocId = input.readInt();
  final String parentUrl = input.readString();
  final short depth = input.readShort();
  final byte priority = input.readByte();
  final String anchor = input.readString();

  // Apply the values through the setters in the same order as they were read.
  decoded.setURL(url);
  decoded.setDocid(docId);
  decoded.setParentDocid(parentDocId);
  decoded.setParentUrl(parentUrl);
  decoded.setDepth(depth);
  decoded.setPriority(priority);
  decoded.setAnchor(anchor);
  return decoded;
}

代码示例来源:origin: yasserg/crawler4j

webURL.setParentUrl(curURL.getParentUrl());
  webURL.setDepth(curURL.getDepth());
  webURL.setDocid(-1);
  webURL.setAnchor(curURL.getAnchor());
  if (shouldVisit(page, webURL)) {
    if (!shouldFollowLinksIn(webURL) || robotstxtServer.allows(webURL)) {
      webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
      frontier.schedule(webURL);
    } else {
curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl()));
    webURL.setDocid(newdocid);
  } else {
    webURL.setDocid(-1);
    webURL.setDepth((short) (curURL.getDepth() + 1));
    if ((maxCrawlDepth == -1) || (curURL.getDepth() < maxCrawlDepth)) {
      if (shouldVisit(page, webURL)) {
        if (robotstxtServer.allows(webURL)) {
          webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
          toSchedule.add(webURL);
        } else {

代码示例来源:origin: stackoverflow.com

/**
 * Adds a seed URL and, if robots.txt allows it, schedules it on the frontier.
 *
 * <p>The URL is canonicalized first; when canonicalization yields {@code null} the seed is
 * logged as invalid and dropped.
 *
 * <ul>
 *   <li>If {@code docId} is negative, the doc id server is asked for an existing id. A positive
 *       result means the URL was already seen and the method returns without scheduling;
 *       otherwise a new id is requested.
 *   <li>If {@code docId} is zero or positive, the URL/id pair is registered with the doc id
 *       server. A failure there is logged together with the exception, and processing
 *       continues on a best-effort basis.
 * </ul>
 *
 * @param pageUrl the seed URL as supplied by the caller (not yet canonicalized)
 * @param docId   the doc id to use for the seed, or a negative value to have one assigned
 */
public void addSeed(String pageUrl, int docId) {
  String canonicalUrl = URLCanonicalizer.getCanonicalURL(pageUrl);
  if (canonicalUrl == null) {
    logger.error("Invalid seed URL: " + pageUrl);
    return;
  }
  if (docId < 0) {
    docId = docIdServer.getDocId(canonicalUrl);
    if (docId > 0) {
      // This URL is already seen.
      return;
    }
    docId = docIdServer.getNewDocID(canonicalUrl);
  } else {
    try {
      docIdServer.addUrlAndDocId(canonicalUrl, docId);
    } catch (Exception e) {
      // Pass the exception itself so its type and stack trace are not lost; the message
      // text is unchanged. The seed is still processed below, as before.
      logger.error("Could not add seed: " + e.getMessage(), e);
    }
  }
  WebURL webUrl = new WebURL();
  webUrl.setURL(canonicalUrl);
  webUrl.setDocid(docId);
  webUrl.setDepth((short) 0); // seeds are scheduled at depth 0
  if (!robotstxtServer.allows(webUrl)) {
    logger.info("Robots.txt does not allow this seed: " + pageUrl);
  } else {
    frontier.schedule(webUrl); // method that adds URL to the frontier at run time
  }
}

代码示例来源:origin: edu.uci.ics/crawler4j

webUrl.setDocid(docId);
webUrl.setDepth((short) 0);
if (robotstxtServer.allows(webUrl)) {

代码示例来源:origin: edu.uci.ics/crawler4j

/**
 * Rebuilds a {@link WebURL} from the given tuple input.
 *
 * <p>Fields are consumed from {@code input} in a fixed order: URL (string), doc id (int),
 * parent doc id (int), parent URL (string), depth (short), priority (byte), anchor (string).
 * NOTE(review): this order presumably has to mirror the matching write-side method, which is
 * not visible here; confirm before reordering anything.
 *
 * @param input tuple input positioned at the start of a serialized entry
 * @return a new {@code WebURL} populated with the values read
 */
@Override
public WebURL entryToObject(TupleInput input) {
  final WebURL decoded = new WebURL();

  // Pull every field off the input first, strictly in the serialized order.
  final String url = input.readString();
  final int docId = input.readInt();
  final int parentDocId = input.readInt();
  final String parentUrl = input.readString();
  final short depth = input.readShort();
  final byte priority = input.readByte();
  final String anchor = input.readString();

  // Apply the values through the setters in the same order as they were read.
  decoded.setURL(url);
  decoded.setDocid(docId);
  decoded.setParentDocid(parentDocId);
  decoded.setParentUrl(parentUrl);
  decoded.setDepth(depth);
  decoded.setPriority(priority);
  decoded.setAnchor(anchor);
  return decoded;
}

代码示例来源:origin: edu.uci.ics/crawler4j

webURL.setParentUrl(curURL.getParentUrl());
  webURL.setDepth(curURL.getDepth());
  webURL.setDocid(-1);
  webURL.setAnchor(curURL.getAnchor());
  if (shouldVisit(page, webURL)) {
    if (!shouldFollowLinksIn(webURL) || robotstxtServer.allows(webURL)) {
      webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
      frontier.schedule(webURL);
    } else {
curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl()));
    webURL.setDocid(newdocid);
  } else {
    webURL.setDocid(-1);
    webURL.setDepth((short) (curURL.getDepth() + 1));
    if ((maxCrawlDepth == -1) || (curURL.getDepth() < maxCrawlDepth)) {
      if (shouldVisit(page, webURL)) {
        if (robotstxtServer.allows(webURL)) {
          webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
          toSchedule.add(webURL);
        } else {

相关文章