edu.uci.ics.crawler4j.url.WebURL.getDocid()方法的使用及代码示例

x33g5p2x  于2022-02-03 转载在 其他  
字(3.9k)|赞(0)|评价(0)|浏览(110)

本文整理了Java中edu.uci.ics.crawler4j.url.WebURL.getDocid()方法的一些代码示例,展示了WebURL.getDocid()的具体用法。这些代码示例主要来源于Github/Stackoverflow/Maven等平台,是从一些精选项目中提取出来的代码,具有较强的参考意义,能在一定程度上帮助到你。WebURL.getDocid()方法的具体详情如下:
包路径:edu.uci.ics.crawler4j.url.WebURL
类名称:WebURL
方法名:getDocid

WebURL.getDocid介绍

getDocid() 返回该 WebURL 对应的文档 ID(int 类型)。从下面的示例可以看出,它由爬虫框架为每个 URL 分配,常与 getParentDocid() 配合使用以记录页面之间的父子关系。

代码示例

代码示例来源:origin: yasserg/crawler4j

  // Packs a WebURL into a fixed 6-byte key for the crawl-frontier database
  // (DatabaseEntry is presumably Berkeley DB JE — confirm against imports):
  // byte 0 = priority, byte 1 = depth, bytes 2-5 = docid.
  1. protected static DatabaseEntry getDatabaseEntryKey(WebURL url) {
  2. byte[] keyData = new byte[6];
  3. keyData[0] = url.getPriority(); // byte 0: crawl priority
  4. keyData[1] = ((url.getDepth() > Byte.MAX_VALUE) ? Byte.MAX_VALUE : (byte) url.getDepth()); // byte 1: depth, clamped to 127 so the narrowing cast cannot overflow
  5. Util.putIntInByteArray(url.getDocid(), keyData, 2); // bytes 2-5: the 4-byte docid
  6. return new DatabaseEntry(keyData);
  7. }

代码示例来源:origin: yasserg/crawler4j

  // Serializes a WebURL into a TupleOutput in a fixed field order:
  // URL, docid, parent docid, parent URL, depth, priority, anchor.
  // A matching entryToObject (not shown here) must read the fields back
  // in exactly this order.
  1. @Override
  2. public void objectToEntry(WebURL url, TupleOutput output) {
  3. output.writeString(url.getURL());
  4. output.writeInt(url.getDocid()); // the document id this article focuses on
  5. output.writeInt(url.getParentDocid());
  6. output.writeString(url.getParentUrl());
  7. output.writeShort(url.getDepth());
  8. output.writeByte(url.getPriority());
  9. output.writeString(url.getAnchor());
  10. }
  11. } // closes the enclosing class (presumably a TupleBinding — its opening line is outside this excerpt)

代码示例来源:origin: yasserg/crawler4j

  1. int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
  2. for (WebURL webURL : parseData.getOutgoingUrls()) {
  3. webURL.setParentDocid(curURL.getDocid());
  4. webURL.setParentUrl(curURL.getURL());
  5. int newdocid = docIdServer.getDocId(webURL.getURL());

代码示例来源:origin: biezhi/java-library-examples

  // Crawler callback invoked for each successfully fetched page:
  // logs the page's docid, URL and parent docid, and, when the page
  // parsed as HTML, logs basic size statistics of the parsed content.
  1. @Override
  2. public void visit(Page page) {
  3. int docid = page.getWebURL().getDocid(); // id assigned to this page's URL by the framework
  4. String url = page.getWebURL().getURL();
  5. int parentDocid = page.getWebURL().getParentDocid(); // docid of the page that linked here
  6. logger.debug("Docid: {}", docid);
  7. logger.info("URL: {}", url);
  8. logger.debug("Docid of parent page: {}", parentDocid);
  9. if (page.getParseData() instanceof HtmlParseData) { // only HTML pages carry text/html/link data
  10. HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
  11. String text = htmlParseData.getText();
  12. String html = htmlParseData.getHtml();
  13. Set<WebURL> links = htmlParseData.getOutgoingUrls();
  14. logger.debug("Text length: {}", text.length());
  15. logger.debug("Html length: {}", html.length());
  16. logger.debug("Number of outgoing links: {}", links.size());
  17. }
  18. logger.debug("=============");
  19. }
  20. } // closes the enclosing crawler class (opening line not shown in this excerpt)

代码示例来源:origin: biezhi/java-library-examples

  // Crawler callback invoked for each successfully fetched page:
  // logs the page's docid, URL and parent docid, and, when the page
  // parsed as HTML, logs basic size statistics of the parsed content.
  1. @Override
  2. public void visit(Page page) {
  3. int docid = page.getWebURL().getDocid(); // id assigned to this page's URL by the framework
  4. String url = page.getWebURL().getURL();
  5. int parentDocid = page.getWebURL().getParentDocid(); // docid of the page that linked here
  6. logger.debug("Docid: {}", docid);
  7. logger.info("URL: {}", url);
  8. logger.debug("Docid of parent page: {}", parentDocid);
  9. if (page.getParseData() instanceof HtmlParseData) { // only HTML pages carry text/html/link data
  10. HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
  11. String text = htmlParseData.getText();
  12. String html = htmlParseData.getHtml();
  13. Set<WebURL> links = htmlParseData.getOutgoingUrls();
  14. logger.debug("Text length: {}", text.length());
  15. logger.debug("Html length: {}", html.length());
  16. logger.debug("Number of outgoing links: {}", links.size());
  17. }
  18. logger.debug("=============");
  19. }
  20. } // closes the enclosing crawler class (opening line not shown in this excerpt)

代码示例来源:origin: biezhi/java-library-examples

  1. int docid = page.getWebURL().getDocid();
  2. String url = page.getWebURL().getURL();
  3. String domain = page.getWebURL().getDomain();

代码示例来源:origin: edu.uci.ics/crawler4j

  // Packs a WebURL into a fixed 6-byte key for the crawl-frontier database
  // (DatabaseEntry is presumably Berkeley DB JE — confirm against imports):
  // byte 0 = priority, byte 1 = depth, bytes 2-5 = docid.
  1. protected static DatabaseEntry getDatabaseEntryKey(WebURL url) {
  2. byte[] keyData = new byte[6];
  3. keyData[0] = url.getPriority(); // byte 0: crawl priority
  4. keyData[1] = ((url.getDepth() > Byte.MAX_VALUE) ? Byte.MAX_VALUE : (byte) url.getDepth()); // byte 1: depth, clamped to 127 so the narrowing cast cannot overflow
  5. Util.putIntInByteArray(url.getDocid(), keyData, 2); // bytes 2-5: the 4-byte docid
  6. return new DatabaseEntry(keyData);
  7. }

代码示例来源:origin: edu.uci.ics/crawler4j

  // Serializes a WebURL into a TupleOutput in a fixed field order:
  // URL, docid, parent docid, parent URL, depth, priority, anchor.
  // A matching entryToObject (not shown here) must read the fields back
  // in exactly this order.
  1. @Override
  2. public void objectToEntry(WebURL url, TupleOutput output) {
  3. output.writeString(url.getURL());
  4. output.writeInt(url.getDocid()); // the document id this article focuses on
  5. output.writeInt(url.getParentDocid());
  6. output.writeString(url.getParentUrl());
  7. output.writeShort(url.getDepth());
  8. output.writeByte(url.getPriority());
  9. output.writeString(url.getAnchor());
  10. }
  11. } // closes the enclosing class (presumably a TupleBinding — its opening line is outside this excerpt)

代码示例来源:origin: edu.uci.ics/crawler4j

  1. int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
  2. for (WebURL webURL : parseData.getOutgoingUrls()) {
  3. webURL.setParentDocid(curURL.getDocid());
  4. webURL.setParentUrl(curURL.getURL());
  5. int newdocid = docIdServer.getDocId(webURL.getURL());

相关文章