本文整理了Java中edu.uci.ics.crawler4j.url.WebURL类的一些代码示例,展示了WebURL类的具体用法。这些代码示例主要来源于Github/Stackoverflow/Maven等平台,是从一些精选项目中提取出来的代码,具有较强的参考意义,能在一定程度上帮助到你。WebURL类的具体详情如下:
包路径:edu.uci.ics.crawler4j.url.WebURL
类名称:WebURL
暂无
代码示例来源:origin: yasserg/crawler4j
/**
 * Hook invoked when the crawler is redirected to an invalid Location.
 * This implementation only logs a warning that names the URL of the given page.
 *
 * @param page the page whose redirect led to an invalid location
 */
protected void onRedirectedToInvalidUrl(Page page) {
    final String sourceUrl = page.url.getURL();
    logger.warn("Unexpected error, URL: {} is redirected to NOTHING", sourceUrl);
}
代码示例来源:origin: yasserg/crawler4j
WebURL webURL = new WebURL();
webURL.setTldList(myController.getTldList());
webURL.setURL(movedToUrl);
webURL.setParentDocid(curURL.getParentDocid());
webURL.setParentUrl(curURL.getParentUrl());
webURL.setDepth(curURL.getDepth());
webURL.setDocid(-1);
webURL.setAnchor(curURL.getAnchor());
if (shouldVisit(page, webURL)) {
if (!shouldFollowLinksIn(webURL) || robotstxtServer.allows(webURL)) {
webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
frontier.schedule(webURL);
} else {
logger.debug(
"Not visiting: {} as per the server's \"robots.txt\" policy",
webURL.getURL());
webURL.getURL());
fetchResult.getEntity().getContentType() == null ? "" :
fetchResult.getEntity().getContentType().getValue();
onUnexpectedStatusCode(curURL.getURL(), fetchResult.getStatusCode(),
contentType, description);
if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) {
if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) {
logger.debug("Redirect page: {} has already been seen", curURL);
return;
代码示例来源:origin: yasserg/crawler4j
/**
 * Collects the URLs referenced from this object's text content and resolves them
 * against the referring page.
 *
 * <p>Each URL returned by {@code extractUrlInCssText} is first made relative to the
 * referring page's path, then turned into an absolute URL based on the canonical form
 * of the referring page's URL.
 *
 * @param referringPage the page that supplies the base path and base URL
 * @return a set with one {@code WebURL} per distinct resolved URL; empty when nothing
 *         was extracted
 * @throws UnsupportedEncodingException propagated from the URL helper methods
 */
private Set<WebURL> parseOutgoingUrls(WebURL referringPage) throws UnsupportedEncodingException {
    final Set<WebURL> outgoingUrls = new HashSet<>();
    final Set<String> extractedUrls = extractUrlInCssText(this.getTextContent());
    if (extractedUrls.isEmpty()) {
        // Nothing to resolve; skip canonicalization entirely, as the original loop did.
        return outgoingUrls;
    }

    final String pagePath = referringPage.getPath();
    // The canonical base URL does not depend on the loop variable, so compute it once
    // instead of re-canonicalizing the same page URL for every extracted link.
    final String canonicalPageUrl = URLCanonicalizer.getCanonicalURL(referringPage.getURL());

    for (String url : extractedUrls) {
        String relative = getLinkRelativeTo(pagePath, url);
        String absolute = getAbsoluteUrlFrom(canonicalPageUrl, relative);
        WebURL webURL = new WebURL();
        webURL.setURL(absolute);
        outgoingUrls.add(webURL);
    }
    return outgoingUrls;
}
代码示例来源:origin: yasserg/crawler4j
/**
 * Builds the six-byte database key for a URL.
 *
 * <p>Byte 0 holds the priority, byte 1 holds the depth capped at
 * {@link Byte#MAX_VALUE}, and the doc id is handed to
 * {@code Util.putIntInByteArray} with offset 2 (presumably filling the remaining
 * four bytes; verify against {@code Util}).
 *
 * @param url the URL whose priority, depth and doc id form the key
 * @return a {@code DatabaseEntry} wrapping the key bytes
 */
protected static DatabaseEntry getDatabaseEntryKey(WebURL url) {
    final byte[] key = new byte[6];
    key[0] = url.getPriority();
    // Depths above Byte.MAX_VALUE are clamped so they still fit into a single byte.
    key[1] = (byte) Math.min(url.getDepth(), Byte.MAX_VALUE);
    Util.putIntInByteArray(url.getDocid(), key, 2);
    return new DatabaseEntry(key);
}
代码示例来源:origin: yasserg/crawler4j
/**
 * Rebuilds a {@code WebURL} from its tuple form.
 *
 * <p>Fields are consumed from the input in this fixed order: URL, doc id, parent doc id,
 * parent URL, depth, priority, anchor.
 *
 * @param input tuple input positioned at the start of a serialized URL
 * @return a new {@code WebURL} populated with the values read
 */
@Override
public WebURL entryToObject(TupleInput input) {
    final WebURL restored = new WebURL();

    // Read everything first, strictly in the serialized order.
    final String url = input.readString();
    final int docId = input.readInt();
    final int parentDocId = input.readInt();
    final String parentUrl = input.readString();
    final short depth = input.readShort();
    final byte priority = input.readByte();
    final String anchor = input.readString();

    restored.setURL(url);
    restored.setDocid(docId);
    restored.setParentDocid(parentDocId);
    restored.setParentUrl(parentUrl);
    restored.setDepth(depth);
    restored.setPriority(priority);
    restored.setAnchor(anchor);
    return restored;
}
代码示例来源:origin: yasserg/crawler4j
/**
 * Serializes a {@code WebURL} into the tuple output.
 *
 * <p>Fields are written in this fixed order: URL, doc id, parent doc id, parent URL,
 * depth, priority, anchor. A reader has to consume them in the same order.
 *
 * @param url    the URL to serialize
 * @param output the tuple output that receives the fields
 */
@Override
public void objectToEntry(WebURL url, TupleOutput output) {
    final String address = url.getURL();
    final int docId = url.getDocid();
    final int parentDocId = url.getParentDocid();
    final String parentUrl = url.getParentUrl();
    final short depth = url.getDepth();
    final byte priority = url.getPriority();
    final String anchor = url.getAnchor();

    output.writeString(address);
    output.writeInt(docId);
    output.writeInt(parentDocId);
    output.writeString(parentUrl);
    output.writeShort(depth);
    output.writeByte(priority);
    output.writeString(anchor);
}
}
代码示例来源:origin: yasserg/crawler4j
WebURL webUrl = new WebURL();
webUrl.setTldList(tldList);
webUrl.setURL(canonicalUrl);
webUrl.setDocid(docId);
webUrl.setDepth((short) 0);
if (robotstxtServer.allows(webUrl)) {
frontier.schedule(webUrl);
代码示例来源:origin: biezhi/java-library-examples
// Pull the individual attributes of the page's WebURL into locals.
// Identity of the page: its doc id and full URL.
int docid = page.getWebURL().getDocid();
String url = page.getWebURL().getURL();
// Components of the URL as exposed by WebURL: domain, path and sub-domain.
String domain = page.getWebURL().getDomain();
String path = page.getWebURL().getPath();
String subDomain = page.getWebURL().getSubDomain();
// Where the link came from: the parent page's URL and the link's anchor value.
String parentUrl = page.getWebURL().getParentUrl();
String anchor = page.getWebURL().getAnchor();
代码示例来源:origin: biezhi/java-library-examples
/**
 * Fetches and parses a single URL.
 *
 * @param url the address to download
 * @return the parsed page when the fetch status is {@code HttpStatus.SC_OK};
 *         {@code null} for any other status or when an exception is thrown
 */
private Page download(String url) {
    final WebURL curURL = new WebURL();
    curURL.setURL(url);

    PageFetchResult fetchResult = null;
    try {
        fetchResult = pageFetcher.fetchPage(curURL);
        if (fetchResult.getStatusCode() != HttpStatus.SC_OK) {
            return null;
        }
        final Page fetched = new Page(curURL);
        fetchResult.fetchContent(fetched, pageFetcher.getConfig().getMaxDownloadSize());
        parser.parse(fetched, curURL.getURL());
        return fetched;
    } catch (Exception e) {
        logger.error("Error occurred while fetching url: " + curURL.getURL(), e);
        return null;
    } finally {
        // Runs on every exit path, including the early returns above.
        if (fetchResult != null) {
            fetchResult.discardContentIfNotConsumed();
        }
    }
}
}
代码示例来源:origin: edu.uci.ics/crawler4j
/**
 * Finds every match of the class-level URL pattern in the given text and wraps each
 * one in a {@code WebURL}.
 *
 * <p>A match that does not start with {@code "http"} is prefixed with
 * {@code "http://"} before being stored.
 *
 * @param input text to scan; may be {@code null}
 * @return the extracted URLs, or an empty set when {@code input} is {@code null} or
 *         contains no match
 */
public static Set<WebURL> extractUrls(String input) {
    final Set<WebURL> found = new HashSet<>();
    if (input == null) {
        return found;
    }
    for (Matcher hits = pattern.matcher(input); hits.find(); ) {
        final String candidate = hits.group();
        final String withScheme = candidate.startsWith("http") ? candidate : "http://" + candidate;
        final WebURL webURL = new WebURL();
        webURL.setURL(withScheme);
        found.add(webURL);
    }
    return found;
}
代码示例来源:origin: yasserg/crawler4j
String url = URLCanonicalizer.getCanonicalURL(href, contextURL, hrefCharset);
if (url != null) {
WebURL webURL = new WebURL();
webURL.setTldList(tldList);
webURL.setURL(url);
webURL.setTag(urlAnchorPair.getTag());
webURL.setAnchor(urlAnchorPair.getAnchor());
webURL.setAttributes(urlAnchorPair.getAttributes());
outgoingUrls.add(webURL);
urlCount++;
代码示例来源:origin: biezhi/java-library-examples
/**
 * Logs basic facts about a visited page: its doc id, URL and parent doc id, and, when
 * the parse data is {@code HtmlParseData}, the text length, HTML length and number of
 * outgoing links. A separator line is logged at the end.
 *
 * @param page the page that was visited
 */
@Override
public void visit(Page page) {
    final WebURL webURL = page.getWebURL();
    final int docid = webURL.getDocid();
    final String url = webURL.getURL();
    final int parentDocid = webURL.getParentDocid();

    logger.debug("Docid: {}", docid);
    logger.info("URL: {}", url);
    logger.debug("Docid of parent page: {}", parentDocid);

    final Object parseData = page.getParseData();
    if (parseData instanceof HtmlParseData) {
        final HtmlParseData htmlParseData = (HtmlParseData) parseData;
        final String text = htmlParseData.getText();
        final String html = htmlParseData.getHtml();
        final Set<WebURL> links = htmlParseData.getOutgoingUrls();

        logger.debug("Text length: {}", text.length());
        logger.debug("Html length: {}", html.length());
        logger.debug("Number of outgoing links: {}", links.size());
    }

    logger.debug("=============");
}
}
代码示例来源:origin: biezhi/java-library-examples
/**
 * Logs a warning for every fetch that did not return {@code HttpStatus.SC_OK}.
 *
 * <p>{@code HttpStatus.SC_NOT_FOUND} is reported as a broken link together with the
 * parent URL; any other non-OK status is reported with its code and description.
 *
 * @param webUrl            the URL that was fetched
 * @param statusCode        the HTTP status code of the response
 * @param statusDescription the textual description of the status
 */
@Override
protected void handlePageStatusCode(WebURL webUrl, int statusCode, String statusDescription) {
    if (statusCode == HttpStatus.SC_OK) {
        return;
    }
    if (statusCode == HttpStatus.SC_NOT_FOUND) {
        logger.warn("Broken link: {}, this link was found in page: {}", webUrl.getURL(),
                webUrl.getParentUrl());
    } else {
        // The original format string had only two placeholders for three arguments,
        // so the description was never printed; the third "{}" fixes that.
        logger.warn("Non success status for link: {} status code: {}, description: {}",
                webUrl.getURL(), statusCode, statusDescription);
    }
}
}
代码示例来源:origin: tim232385/WebVideoBot
/**
 * Extracts the part of the URL path that follows a leading {@code /embed/}.
 *
 * @param webURL the URL whose path is inspected
 * @return everything after {@code /embed/} when the whole path has that form,
 *         otherwise an empty string
 */
public String getEmbedKey(WebURL webURL) {
    final Pattern embedPattern = Pattern.compile("(\\/embed\\/)(.*)");
    // Match once and read the capture group directly, instead of running a second
    // matcher with replaceAll over the same path. The type is fully qualified so the
    // block does not depend on an extra import.
    final java.util.regex.Matcher matcher = embedPattern.matcher(webURL.getPath());
    return matcher.matches() ? matcher.group(2) : "";
}
代码示例来源:origin: edu.uci.ics/crawler4j
/**
 * Writes the persistent fields of a {@code WebURL} to the tuple output.
 *
 * <p>Write order: URL, doc id, parent doc id, parent URL, depth, priority, anchor.
 *
 * @param url    the URL being stored
 * @param output destination of the serialized fields
 */
@Override
public void objectToEntry(WebURL url, TupleOutput output) {
    // Location and identity of the URL itself.
    final String location = url.getURL();
    final int id = url.getDocid();
    output.writeString(location);
    output.writeInt(id);

    // Parent reference.
    final int parentId = url.getParentDocid();
    final String parentLocation = url.getParentUrl();
    output.writeInt(parentId);
    output.writeString(parentLocation);

    // Depth, priority and anchor.
    final short depth = url.getDepth();
    final byte priority = url.getPriority();
    final String anchor = url.getAnchor();
    output.writeShort(depth);
    output.writeByte(priority);
    output.writeString(anchor);
}
}
代码示例来源:origin: stackoverflow.com
/**
 * Adds a seed URL to the frontier.
 *
 * <p>The URL is canonicalized first; an invalid URL is logged and ignored. When
 * {@code docId} is negative, the doc id server is asked for an existing id: a positive
 * result means the URL was already seen and nothing is scheduled, otherwise a new id
 * is assigned. When {@code docId} is non-negative it is registered for the URL as
 * given. The seed is scheduled at depth 0 only if robots.txt allows it.
 *
 * @param pageUrl the seed URL as supplied by the caller
 * @param docId   the doc id to use, or a negative value to have one assigned
 */
public void addSeed(String pageUrl, int docId) {
    String canonicalUrl = URLCanonicalizer.getCanonicalURL(pageUrl);
    if (canonicalUrl == null) {
        logger.error("Invalid seed URL: " + pageUrl);
        return;
    }
    if (docId < 0) {
        docId = docIdServer.getDocId(canonicalUrl);
        if (docId > 0) {
            // This URL is already seen.
            return;
        }
        docId = docIdServer.getNewDocID(canonicalUrl);
    } else {
        try {
            docIdServer.addUrlAndDocId(canonicalUrl, docId);
        } catch (Exception e) {
            // Pass the exception itself so the stack trace and cause are logged,
            // not just the message. Scheduling still proceeds, as before.
            logger.error("Could not add seed: " + e.getMessage(), e);
        }
    }
    WebURL webUrl = new WebURL();
    webUrl.setURL(canonicalUrl);
    webUrl.setDocid(docId);
    webUrl.setDepth((short) 0);
    if (!robotstxtServer.allows(webUrl)) {
        logger.info("Robots.txt does not allow this seed: " + pageUrl);
    } else {
        frontier.schedule(webUrl); //method that adds URL to the frontier at run time
    }
}
代码示例来源:origin: edu.uci.ics/crawler4j
/**
 * Deserializes a {@code WebURL} from the tuple input.
 *
 * <p>Read order: URL, doc id, parent doc id, parent URL, depth, priority, anchor.
 *
 * @param input source of the serialized fields
 * @return the reconstructed {@code WebURL}
 */
@Override
public WebURL entryToObject(TupleInput input) {
    final WebURL decoded = new WebURL();

    final String location = input.readString();
    final int id = input.readInt();
    final int parentId = input.readInt();
    final String parentLocation = input.readString();
    final short depth = input.readShort();
    final byte priority = input.readByte();
    final String anchor = input.readString();

    decoded.setURL(location);
    decoded.setDocid(id);
    decoded.setParentDocid(parentId);
    decoded.setParentUrl(parentLocation);
    decoded.setDepth(depth);
    decoded.setPriority(priority);
    decoded.setAnchor(anchor);
    return decoded;
}
代码示例来源:origin: tim232385/WebVideoBot
/**
 * Rewrites a URL to its embed form before it is processed.
 *
 * <p>When {@code getViewkey} yields a key, a new {@code WebURL} pointing at the embed
 * address for that key is returned. Otherwise the superclass implementation decides.
 *
 * @param webURL the URL about to be processed
 * @return the embed URL when a key is available, else the superclass result
 */
@Override
protected WebURL handleUrlBeforeProcess(WebURL webURL) {
    return getViewkey(webURL)
            .map(key -> "https://www.pornhub.com/embed/" + key)
            .map(url -> {
                WebURL newUrl = new WebURL();
                newUrl.setURL(url);
                return newUrl;
            })
            // orElseGet defers the superclass call to the case where no key was found;
            // orElse evaluated it on every invocation, even when its result was discarded.
            .orElseGet(() -> super.handleUrlBeforeProcess(webURL));
}
代码示例来源:origin: edu.uci.ics/crawler4j
String url = URLCanonicalizer.getCanonicalURL(href, contextURL, hrefCharset);
if (url != null) {
WebURL webURL = new WebURL();
webURL.setURL(url);
webURL.setTag(urlAnchorPair.getTag());
webURL.setAnchor(urlAnchorPair.getAnchor());
webURL.setAttributes(urlAnchorPair.getAttributes());
outgoingUrls.add(webURL);
urlCount++;
代码示例来源:origin: biezhi/java-library-examples
/**
 * Reports a visited page to the log.
 *
 * <p>The doc id, URL and parent doc id are always logged. When the page's parse data is
 * {@code HtmlParseData}, the text length, HTML length and outgoing link count are
 * logged as well. A separator line is logged at the end.
 *
 * @param page the page that was visited
 */
@Override
public void visit(Page page) {
    final WebURL visited = page.getWebURL();
    final int docid = visited.getDocid();
    final String url = visited.getURL();
    final int parentDocid = visited.getParentDocid();

    logger.debug("Docid: {}", docid);
    logger.info("URL: {}", url);
    logger.debug("Docid of parent page: {}", parentDocid);

    if (page.getParseData() instanceof HtmlParseData) {
        final HtmlParseData parsed = (HtmlParseData) page.getParseData();
        final int textLength = parsed.getText().length();
        final int htmlLength = parsed.getHtml().length();
        final Set<WebURL> links = parsed.getOutgoingUrls();

        logger.debug("Text length: {}", textLength);
        logger.debug("Html length: {}", htmlLength);
        logger.debug("Number of outgoing links: {}", links.size());
    }

    logger.debug("=============");
}
}
内容来源于网络,如有侵权,请联系作者删除!