本文整理了Java中edu.uci.ics.crawler4j.url.WebURL.getDocid()方法的一些代码示例,展示了WebURL.getDocid()的具体用法。这些代码示例主要来源于GitHub/Stack Overflow/Maven等平台,是从一些精选项目中提取出来的代码,具有较强的参考意义,能在一定程度上帮助到你。WebURL.getDocid()方法的具体详情如下:
包路径:edu.uci.ics.crawler4j.url.WebURL
类名称:WebURL
方法名:getDocid
方法说明:暂无
代码示例来源:origin: yasserg/crawler4j
/**
 * Builds the six-byte database key for the given URL.
 *
 * <p>Layout: byte 0 is the URL's priority, byte 1 is its depth capped at
 * {@link Byte#MAX_VALUE}, and the doc id is written starting at offset 2 by
 * {@code Util.putIntInByteArray} (presumably filling the remaining four
 * bytes; confirm against {@code Util}).
 *
 * @param url the URL whose priority, depth and doc id form the key
 * @return a {@code DatabaseEntry} wrapping the key bytes
 */
protected static DatabaseEntry getDatabaseEntryKey(WebURL url) {
    final byte[] key = new byte[6];
    key[0] = url.getPriority();
    // Depth is stored in a single byte, so anything larger is clamped.
    if (url.getDepth() > Byte.MAX_VALUE) {
        key[1] = Byte.MAX_VALUE;
    } else {
        key[1] = (byte) url.getDepth();
    }
    Util.putIntInByteArray(url.getDocid(), key, 2);
    return new DatabaseEntry(key);
}
代码示例来源:origin: yasserg/crawler4j
/**
 * Writes the fields of a {@code WebURL} to the tuple output.
 *
 * <p>Fields are written in this fixed order: URL, doc id, parent doc id,
 * parent URL, depth (as a short), priority (as a byte), anchor. The order
 * defines the stored layout, so it must not be changed here without
 * changing whatever code reads the entry back.
 *
 * @param url    the URL to serialize
 * @param output the tuple output receiving the fields
 */
@Override
public void objectToEntry(WebURL url, TupleOutput output) {
    output.writeString(url.getURL())
        .writeInt(url.getDocid())
        .writeInt(url.getParentDocid())
        .writeString(url.getParentUrl())
        .writeShort(url.getDepth())
        .writeByte(url.getPriority())
        .writeString(url.getAnchor());
}
}
代码示例来源:origin: yasserg/crawler4j
int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
for (WebURL webURL : parseData.getOutgoingUrls()) {
webURL.setParentDocid(curURL.getDocid());
webURL.setParentUrl(curURL.getURL());
int newdocid = docIdServer.getDocId(webURL.getURL());
代码示例来源:origin: biezhi/java-library-examples
/**
 * Logs basic information about a visited page: its doc id, URL and parent
 * doc id, and, when the page's parse data is {@code HtmlParseData}, the
 * length of its text and HTML and the number of outgoing links.
 *
 * @param page the page that was visited
 */
@Override
public void visit(Page page) {
    WebURL webUrl = page.getWebURL();
    int docId = webUrl.getDocid();
    String address = webUrl.getURL();
    int parentId = webUrl.getParentDocid();

    logger.debug("Docid: {}", docId);
    logger.info("URL: {}", address);
    logger.debug("Docid of parent page: {}", parentId);

    // Only HTML parse data exposes text, markup and outgoing links.
    if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData parsed = (HtmlParseData) page.getParseData();
        String plainText = parsed.getText();
        String markup = parsed.getHtml();
        Set<WebURL> outgoing = parsed.getOutgoingUrls();

        logger.debug("Text length: {}", plainText.length());
        logger.debug("Html length: {}", markup.length());
        logger.debug("Number of outgoing links: {}", outgoing.size());
    }

    logger.debug("=============");
}
}
代码示例来源:origin: biezhi/java-library-examples
/**
 * Reports details of a visited page through the logger.
 *
 * <p>The doc id and parent doc id go to debug level and the URL to info
 * level. If the parse data is {@code HtmlParseData}, the text length, HTML
 * length and outgoing-link count are also logged at debug level. A
 * separator line is logged last in every case.
 *
 * @param page the page that was visited
 */
@Override
public void visit(Page page) {
    final WebURL pageUrl = page.getWebURL();
    logger.debug("Docid: {}", pageUrl.getDocid());
    logger.info("URL: {}", pageUrl.getURL());
    logger.debug("Docid of parent page: {}", pageUrl.getParentDocid());

    if (page.getParseData() instanceof HtmlParseData) {
        final HtmlParseData htmlData = (HtmlParseData) page.getParseData();
        // Fetch all three values before logging any of them.
        final String bodyText = htmlData.getText();
        final String rawHtml = htmlData.getHtml();
        final Set<WebURL> outLinks = htmlData.getOutgoingUrls();
        logger.debug("Text length: {}", bodyText.length());
        logger.debug("Html length: {}", rawHtml.length());
        logger.debug("Number of outgoing links: {}", outLinks.size());
    }

    logger.debug("=============");
}
}
代码示例来源:origin: biezhi/java-library-examples
int docid = page.getWebURL().getDocid();
String url = page.getWebURL().getURL();
String domain = page.getWebURL().getDomain();
代码示例来源:origin: edu.uci.ics/crawler4j
/**
 * Creates the database key under which a URL is stored.
 *
 * <p>The key is six bytes long: the priority at index 0, the depth (clamped
 * to {@link Byte#MAX_VALUE}) at index 1, and the doc id written from index 2
 * onward by {@code Util.putIntInByteArray}.
 *
 * @param url the URL supplying priority, depth and doc id
 * @return a {@code DatabaseEntry} backed by the key bytes
 */
protected static DatabaseEntry getDatabaseEntryKey(WebURL url) {
    final byte[] bytes = new byte[6];
    bytes[0] = url.getPriority();
    // Clamp the depth so it fits into one signed byte.
    bytes[1] = (byte) Math.min(url.getDepth(), Byte.MAX_VALUE);
    Util.putIntInByteArray(url.getDocid(), bytes, 2);
    return new DatabaseEntry(bytes);
}
代码示例来源:origin: edu.uci.ics/crawler4j
/**
 * Serializes a {@code WebURL} into the given tuple output.
 *
 * <p>Write order: URL string, doc id (int), parent doc id (int), parent URL
 * string, depth (short), priority (byte), anchor string.
 *
 * @param url    the URL whose fields are written
 * @param output the destination tuple output
 */
@Override
public void objectToEntry(WebURL url, TupleOutput output) {
    // The write order is the stored layout; keep it stable.
    output
        .writeString(url.getURL())
        .writeInt(url.getDocid())
        .writeInt(url.getParentDocid())
        .writeString(url.getParentUrl())
        .writeShort(url.getDepth())
        .writeByte(url.getPriority())
        .writeString(url.getAnchor());
}
}
代码示例来源:origin: edu.uci.ics/crawler4j
int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
for (WebURL webURL : parseData.getOutgoingUrls()) {
webURL.setParentDocid(curURL.getDocid());
webURL.setParentUrl(curURL.getURL());
int newdocid = docIdServer.getDocId(webURL.getURL());
内容来源于网络,如有侵权,请联系作者删除!