本文整理了Java中edu.uci.ics.crawler4j.url.WebURL.setDocid()方法的一些代码示例,展示了WebURL.setDocid()的具体用法。这些代码示例主要来源于GitHub、Stack Overflow、Maven等平台,是从一些精选项目中提取出来的代码,具有较强的参考意义,能在一定程度上帮助到你。WebURL.setDocid()方法的具体详情如下:
包路径:edu.uci.ics.crawler4j.url.WebURL
类名称:WebURL
方法名:setDocid
方法说明:暂无
代码示例来源:origin: yasserg/crawler4j
webUrl.setTldList(tldList);
webUrl.setURL(canonicalUrl);
webUrl.setDocid(docId);
webUrl.setDepth((short) 0);
if (robotstxtServer.allows(webUrl)) {
代码示例来源:origin: yasserg/crawler4j
/**
 * Rebuilds a {@link WebURL} from its serialized tuple form.
 *
 * <p>Fields are consumed from {@code input} in a fixed order: URL, doc id,
 * parent doc id, parent URL, depth, priority, anchor. Each value is applied
 * to the new object right after it is read, so the read order and the setter
 * order stay in lock-step.
 *
 * @param input the tuple to read the field values from
 * @return a newly created {@code WebURL} populated with the values read
 */
@Override
public WebURL entryToObject(TupleInput input) {
    final WebURL restored = new WebURL();

    final String url = input.readString();
    restored.setURL(url);

    final int docId = input.readInt();
    restored.setDocid(docId);

    final int parentDocId = input.readInt();
    restored.setParentDocid(parentDocId);

    final String parentUrl = input.readString();
    restored.setParentUrl(parentUrl);

    final short depth = input.readShort();
    restored.setDepth(depth);

    final byte priority = input.readByte();
    restored.setPriority(priority);

    final String anchor = input.readString();
    restored.setAnchor(anchor);

    return restored;
}
代码示例来源:origin: yasserg/crawler4j
webURL.setParentUrl(curURL.getParentUrl());
webURL.setDepth(curURL.getDepth());
webURL.setDocid(-1);
webURL.setAnchor(curURL.getAnchor());
if (shouldVisit(page, webURL)) {
if (!shouldFollowLinksIn(webURL) || robotstxtServer.allows(webURL)) {
webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
frontier.schedule(webURL);
} else {
curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl()));
webURL.setDocid(newdocid);
} else {
webURL.setDocid(-1);
webURL.setDepth((short) (curURL.getDepth() + 1));
if ((maxCrawlDepth == -1) || (curURL.getDepth() < maxCrawlDepth)) {
if (shouldVisit(page, webURL)) {
if (robotstxtServer.allows(webURL)) {
webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
toSchedule.add(webURL);
} else {
代码示例来源:origin: stackoverflow.com
/**
 * Adds a seed URL to the crawl and schedules it on the frontier.
 *
 * <p>The URL is canonicalized first; if canonicalization yields {@code null}
 * an error is logged and nothing is scheduled.
 *
 * <p>Doc id handling:
 * <ul>
 *   <li>If {@code docId} is negative, the doc id server is asked for an
 *       existing id. A positive result means the URL is already known and the
 *       method returns without scheduling; otherwise a new id is requested.</li>
 *   <li>If {@code docId} is non-negative, the (URL, docId) pair is registered
 *       with the doc id server. A failure to register is logged and the seed
 *       is still scheduled with the supplied id.</li>
 * </ul>
 *
 * <p>The seed is scheduled at depth 0 only if the robots.txt server allows it;
 * otherwise an info message is logged.
 *
 * @param pageUrl the seed URL as given by the caller
 * @param docId   the doc id to assign to the seed, or a negative value to have
 *                one looked up / generated
 */
public void addSeed(String pageUrl, int docId) {
    String canonicalUrl = URLCanonicalizer.getCanonicalURL(pageUrl);
    if (canonicalUrl == null) {
        logger.error("Invalid seed URL: " + pageUrl);
        return;
    }

    if (docId < 0) {
        docId = docIdServer.getDocId(canonicalUrl);
        if (docId > 0) {
            // This URL is already seen.
            return;
        }
        docId = docIdServer.getNewDocID(canonicalUrl);
    } else {
        try {
            docIdServer.addUrlAndDocId(canonicalUrl, docId);
        } catch (Exception e) {
            // Hand the exception to the logger as well as its message, so the
            // exception type and stack trace are not lost. The seed is still
            // scheduled below, exactly as before.
            logger.error("Could not add seed: " + e.getMessage(), e);
        }
    }

    WebURL webUrl = new WebURL();
    webUrl.setURL(canonicalUrl);
    webUrl.setDocid(docId);
    webUrl.setDepth((short) 0);

    if (!robotstxtServer.allows(webUrl)) {
        logger.info("Robots.txt does not allow this seed: " + pageUrl);
    } else {
        frontier.schedule(webUrl); // adds the URL to the frontier at run time
    }
}
代码示例来源:origin: edu.uci.ics/crawler4j
webUrl.setDocid(docId);
webUrl.setDepth((short) 0);
if (robotstxtServer.allows(webUrl)) {
代码示例来源:origin: edu.uci.ics/crawler4j
/**
 * Deserializes a {@link WebURL} from the given tuple.
 *
 * <p>Values are read from {@code input} sequentially — URL, doc id, parent
 * doc id, parent URL, depth, priority, anchor — and each one is set on the
 * new object immediately after it is read.
 *
 * @param input the tuple supplying the serialized field values
 * @return a new {@code WebURL} carrying the values read from {@code input}
 */
@Override
public WebURL entryToObject(TupleInput input) {
    final WebURL decoded = new WebURL();

    final String url = input.readString();
    decoded.setURL(url);

    final int docId = input.readInt();
    decoded.setDocid(docId);

    final int parentDocId = input.readInt();
    decoded.setParentDocid(parentDocId);

    final String parentUrl = input.readString();
    decoded.setParentUrl(parentUrl);

    final short depth = input.readShort();
    decoded.setDepth(depth);

    final byte priority = input.readByte();
    decoded.setPriority(priority);

    final String anchor = input.readString();
    decoded.setAnchor(anchor);

    return decoded;
}
代码示例来源:origin: edu.uci.ics/crawler4j
webURL.setParentUrl(curURL.getParentUrl());
webURL.setDepth(curURL.getDepth());
webURL.setDocid(-1);
webURL.setAnchor(curURL.getAnchor());
if (shouldVisit(page, webURL)) {
if (!shouldFollowLinksIn(webURL) || robotstxtServer.allows(webURL)) {
webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
frontier.schedule(webURL);
} else {
curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl()));
webURL.setDocid(newdocid);
} else {
webURL.setDocid(-1);
webURL.setDepth((short) (curURL.getDepth() + 1));
if ((maxCrawlDepth == -1) || (curURL.getDepth() < maxCrawlDepth)) {
if (shouldVisit(page, webURL)) {
if (robotstxtServer.allows(webURL)) {
webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
toSchedule.add(webURL);
} else {
内容来源于网络,如有侵权,请联系作者删除!