本文整理了Java中edu.uci.ics.crawler4j.url.WebURL.<init>()方法的一些代码示例，展示了WebURL.<init>()的具体用法。这些代码示例主要来源于Github/Stackoverflow/Maven等平台，是从一些精选项目中提取出来的代码，具有较强的参考意义，能在一定程度上帮助到你。WebURL.<init>()方法的具体详情如下：
包路径：edu.uci.ics.crawler4j.url.WebURL
类名称：WebURL
方法名：<init>

WebURL.<init>介绍

暂无

代码示例

代码示例来源：origin: yasserg/crawler4j

/**
 * Builds the set of outgoing links found in this object's text content.
 *
 * <p>Each URL string returned by {@code extractUrlInCssText} is first made relative to the
 * referring page's path, then resolved to an absolute URL against the canonical form of the
 * referring page's URL, and finally wrapped in a new {@code WebURL}.
 *
 * @param referringPage the page whose path and URL are used to resolve the extracted links
 * @return a new mutable set containing a {@code WebURL} for each extracted link
 * @throws UnsupportedEncodingException declared by this method; presumably raised by one of the
 *     URL helpers it calls (TODO confirm which)
 */
private Set<WebURL> parseOutgoingUrls(WebURL referringPage) throws UnsupportedEncodingException {
  Set<String> extractedUrls = extractUrlInCssText(this.getTextContent());
  final String pagePath = referringPage.getPath();
  // The canonical base URL is the same for every extracted link, so compute it once
  // instead of re-canonicalizing on each loop iteration.
  final String canonicalPageUrl = URLCanonicalizer.getCanonicalURL(referringPage.getURL());
  Set<WebURL> outgoingUrls = new HashSet<>();
  for (String url : extractedUrls) {
    String relative = getLinkRelativeTo(pagePath, url);
    String absolute = getAbsoluteUrlFrom(canonicalPageUrl, relative);
    WebURL webURL = new WebURL();
    webURL.setURL(absolute);
    outgoingUrls.add(webURL);
  }
  return outgoingUrls;
}

代码示例来源：origin: yasserg/crawler4j

// Canonicalize the link using its context URL and charset; only a non-null result
// is turned into a WebURL.
String url = URLCanonicalizer.getCanonicalURL(href, contextURL, hrefCharset);
if (url != null) {
  WebURL webURL = new WebURL();
  // The TLD list is supplied before the URL string is assigned.
  // NOTE(review): presumably setURL consults the TLD list when parsing — confirm.
  webURL.setTldList(tldList);
  webURL.setURL(url);

代码示例来源：origin: yasserg/crawler4j

// Build a WebURL for canonicalUrl; the TLD list is set before the URL string.
WebURL webUrl = new WebURL();
webUrl.setTldList(tldList);
webUrl.setURL(canonicalUrl);

代码示例来源：origin: yasserg/crawler4j

/**
 * Rebuilds a {@code WebURL} from its tuple form.
 *
 * <p>Values are consumed from {@code input} in a fixed order: URL (string), doc id (int),
 * parent doc id (int), parent URL (string), depth (short), priority (byte), anchor (string).
 * The order of the reads below must not change.
 *
 * @param input the tuple to read the fields from
 * @return a new {@code WebURL} populated with the values read
 */
@Override
public WebURL entryToObject(TupleInput input) {
  final WebURL restored = new WebURL();
  restored.setURL(input.readString());
  restored.setDocid(input.readInt());
  restored.setParentDocid(input.readInt());
  restored.setParentUrl(input.readString());
  restored.setDepth(input.readShort());
  restored.setPriority(input.readByte());
  restored.setAnchor(input.readString());
  return restored;
}

代码示例来源：origin: yasserg/crawler4j

// Wrap movedToUrl in a WebURL, using the TLD list obtained from the controller.
// NOTE(review): movedToUrl is presumably a redirect target — confirm against the caller.
WebURL webURL = new WebURL();
webURL.setTldList(myController.getTldList());
webURL.setURL(movedToUrl);

代码示例来源：origin: edu.uci.ics/crawler4j

/**
 * Finds every match of {@code pattern} in the given text and wraps each one in a
 * {@code WebURL}.
 *
 * <p>A match that does not already begin with {@code http://} or {@code https://} (compared
 * case-insensitively) is prefixed with {@code "http://"}. The full scheme prefix is checked,
 * not just the leading letters "http", so that scheme-less hosts such as {@code httpbin.org}
 * are prefixed as well.
 *
 * @param input text to scan; may be {@code null}
 * @return a new set of URLs; empty when {@code input} is {@code null} or nothing matches
 */
public static Set<WebURL> extractUrls(String input) {
  Set<WebURL> extractedUrls = new HashSet<>();
  if (input == null) {
    return extractedUrls;
  }
  Matcher matcher = pattern.matcher(input);
  while (matcher.find()) {
    String urlStr = matcher.group();
    boolean hasHttpScheme = urlStr.regionMatches(true, 0, "http://", 0, 7)
        || urlStr.regionMatches(true, 0, "https://", 0, 8);
    if (!hasHttpScheme) {
      urlStr = "http://" + urlStr;
    }
    WebURL webURL = new WebURL();
    webURL.setURL(urlStr);
    extractedUrls.add(webURL);
  }
  return extractedUrls;
}

代码示例来源：origin: tim232385/WebVideoBot

/**
 * Rewrites a URL to its embed form before it is processed.
 *
 * <p>When {@code getViewkey} yields a key, a new {@code WebURL} pointing at the embed address
 * for that key is returned. Otherwise the superclass implementation decides the result.
 *
 * @param webURL the URL about to be processed
 * @return the embed URL when a view key is present, else the result of the superclass hook
 */
@Override
protected WebURL handleUrlBeforeProcess(WebURL webURL) {
  return getViewkey(webURL)
      .map(key -> "https://www.pornhub.com/embed/" + key)
      .map(url -> {
        WebURL newUrl = new WebURL();
        newUrl.setURL(url);
        return newUrl;
      })
      // orElseGet defers the superclass call so it only runs when no view key was found;
      // orElse would evaluate it on every invocation.
      .orElseGet(() -> super.handleUrlBeforeProcess(webURL));
}

代码示例来源：origin: tim232385/WebVideoBot

/**
 * Fetches {@code url} and, when the response status is 200, writes the response body to
 * {@code file}.
 *
 * <p>Any other status is logged and skipped. A {@code PageBiggerThanMaxSizeException} is
 * logged and swallowed, so an oversized page results in no file being written rather than
 * an error.
 *
 * @param config crawl configuration used to build the page fetcher for this call
 * @param url the address to download
 * @param file destination for the downloaded content
 * @throws InterruptedException if thrown while fetching the page
 * @throws IOException if fetching the page or writing the file fails
 */
public void download(CrawlConfig config, String url, File file) throws InterruptedException, IOException {
  PageFetcher pageFetcher = new PageFetcher(config);
  WebURL curURL = new WebURL();
  curURL.setURL(url);
  PageFetchResult fetchResult = null;
  try {
    fetchResult = pageFetcher.fetchPage(curURL);
    if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
      logger.info("Start download filePath:[{}]", file);
      FileUtils.copyInputStreamToFile(fetchResult.getEntity().getContent(), file);
      logger.info("Download Finish filePath:[{}].", file);
    } else {
      logger.info("Skip download url:[{}], HttpStatus:[{}]", url, fetchResult.getStatusCode());
    }
  } catch (PageBiggerThanMaxSizeException e) {
    logger.debug("PageBiggerThanMaxSizeException", e);
    logger.info("Skip download url:[{}], Out of  MaxDownloadSize", url);
  } finally {
    try {
      if (fetchResult != null) {
        fetchResult.discardContentIfNotConsumed();
      }
    } finally {
      // The fetcher is created per call, so release its resources here; otherwise every
      // download leaves them behind.
      // NOTE(review): relies on crawler4j's PageFetcher.shutDown() — confirm it exists in
      // the crawler4j version this project uses.
      pageFetcher.shutDown();
    }
  }
}

代码示例来源：origin: stackoverflow.com

/**
 * Canonicalizes {@code pageUrl}, assigns or registers its doc id, and schedules it on the
 * frontier at depth 0 unless robots.txt disallows it.
 *
 * <p>When {@code docId} is negative, an existing id is looked up: a positive result means the
 * URL has already been seen and the method returns without scheduling; otherwise a new id is
 * allocated. When {@code docId} is non-negative, the URL/id pair is registered; a failure
 * there is logged and the seed is still scheduled.
 *
 * @param pageUrl the seed URL; logged and ignored when it cannot be canonicalized
 * @param docId the doc id to use, or a negative value to have one assigned
 */
public void addSeed(String pageUrl, int docId) {
   String canonicalUrl = URLCanonicalizer.getCanonicalURL(pageUrl);
   if (canonicalUrl == null) {
     logger.error("Invalid seed URL: " + pageUrl);
     return;
   }
   if (docId < 0) {
     docId = docIdServer.getDocId(canonicalUrl);
     if (docId > 0) {
       // This URL is already seen.
       return;
     }
     docId = docIdServer.getNewDocID(canonicalUrl);
   } else {
     try {
       docIdServer.addUrlAndDocId(canonicalUrl, docId);
     } catch (Exception e) {
       // Pass the exception itself so the stack trace is kept, not just its message.
       logger.error("Could not add seed: " + e.getMessage(), e);
     }
   }
   WebURL webUrl = new WebURL();
   webUrl.setURL(canonicalUrl);
   webUrl.setDocid(docId);
   webUrl.setDepth((short) 0);
   if (!robotstxtServer.allows(webUrl)) {
     logger.info("Robots.txt does not allow this seed: " + pageUrl);
   } else {
     frontier.schedule(webUrl); //method that adds URL to the frontier at run time
   }
}

代码示例来源：origin: edu.uci.ics/crawler4j

// Canonicalize the link; only a non-null result is turned into a WebURL.
String url = URLCanonicalizer.getCanonicalURL(href, contextURL, hrefCharset);
if (url != null) {
  WebURL webURL = new WebURL();
  webURL.setURL(url);
  // Copy the tag from the URL/anchor pair onto the new WebURL.
  webURL.setTag(urlAnchorPair.getTag());

代码示例来源：origin: edu.uci.ics/crawler4j

// Build a WebURL for canonicalUrl and attach its doc id.
WebURL webUrl = new WebURL();
webUrl.setURL(canonicalUrl);
webUrl.setDocid(docId);

代码示例来源：origin: biezhi/java-library-examples

/**
 * Fetches and parses the page at {@code url}.
 *
 * <p>Returns the parsed page only when the fetch status is 200. For any other status, or
 * when any exception occurs (which is logged), the result is {@code null}. Unconsumed
 * response content is always discarded before returning.
 *
 * @param url address of the page to fetch
 * @return the fetched and parsed page, or {@code null} on a non-200 status or an error
 */
private Page download(String url) {
  final WebURL target = new WebURL();
  target.setURL(url);

  Page fetched = null;
  PageFetchResult response = null;
  try {
    response = pageFetcher.fetchPage(target);
    if (response.getStatusCode() == HttpStatus.SC_OK) {
      final Page candidate = new Page(target);
      response.fetchContent(candidate, pageFetcher.getConfig().getMaxDownloadSize());
      parser.parse(candidate, target.getURL());
      // Only publish the page once both fetching and parsing have succeeded.
      fetched = candidate;
    }
  } catch (Exception e) {
    logger.error("Error occurred while fetching url: " + target.getURL(), e);
  } finally {
    if (response != null) {
      response.discardContentIfNotConsumed();
    }
  }
  return fetched;
}
}

代码示例来源：origin: tjake/stormscraper

// Wrap the canonical forms of currentUrl and startUrl in WebURL objects.
// NOTE(review): the result of getCanonicalURL is passed to setURL without a null check —
// confirm it cannot be null for these inputs.
WebURL curURL = new WebURL();
curURL.setURL(URLCanonicalizer.getCanonicalURL(currentUrl));
WebURL baseURL = new WebURL();
baseURL.setURL(URLCanonicalizer.getCanonicalURL(startUrl));

代码示例来源：origin: edu.uci.ics/crawler4j

/**
 * Rebuilds a {@code WebURL} from its tuple form.
 *
 * <p>Values are consumed from {@code input} in a fixed order: URL (string), doc id (int),
 * parent doc id (int), parent URL (string), depth (short), priority (byte), anchor (string).
 * The order of the reads below must not change.
 *
 * @param input the tuple to read the fields from
 * @return a new {@code WebURL} populated with the values read
 */
@Override
public WebURL entryToObject(TupleInput input) {
  final WebURL restored = new WebURL();
  restored.setURL(input.readString());
  restored.setDocid(input.readInt());
  restored.setParentDocid(input.readInt());
  restored.setParentUrl(input.readString());
  restored.setDepth(input.readShort());
  restored.setPriority(input.readByte());
  restored.setAnchor(input.readString());
  return restored;
}

代码示例来源：origin: edu.uci.ics/crawler4j

// Build a WebURL for movedToUrl that carries over curURL's parent doc id.
WebURL webURL = new WebURL();
webURL.setURL(movedToUrl);
webURL.setParentDocid(curURL.getParentDocid());

代码示例来源：origin: biezhi/java-library-examples

// Creates a WebURL with the no-argument constructor; no URL is assigned in this excerpt.
WebURL url = new WebURL();

edu.uci.ics.crawler4j.url.WebURL.<init>()方法的使用及代码示例

WebURL.<init>介绍

代码示例

相关文章

热门标签

最新文章

WebURL类方法