This article collects code examples of the Java method edu.uci.ics.crawler4j.url.WebURL.getURL() and shows how WebURL.getURL() is used in practice. The examples are drawn from selected projects on GitHub, Stack Overflow, Maven, and similar platforms, so they should serve as solid references. Details of the WebURL.getURL() method:

Package: edu.uci.ics.crawler4j.url.WebURL
Class: WebURL
Method: getURL
Description: none available
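
Before the collected examples, here is a minimal usage sketch. It assumes only the setURL()/getURL() pair that appears in the examples below; the class name and sample URL are illustrative:

import edu.uci.ics.crawler4j.url.WebURL;

public class WebUrlGetUrlDemo {
    public static void main(String[] args) {
        WebURL webUrl = new WebURL();
        webUrl.setURL("http://www.ics.uci.edu/");   // store a URL string
        System.out.println(webUrl.getURL());        // getURL() returns the stored string
    }
}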
Code example source: yasserg/crawler4j

/**
 * Emitted when the crawler is redirected to an invalid Location.
 * @param page the page whose redirect target was invalid
 */
protected void onRedirectedToInvalidUrl(Page page) {
    logger.warn("Unexpected error, URL: {} is redirected to NOTHING",
                page.url.getURL());
}
Code example source: yasserg/crawler4j

/**
 * This function is called if there has been an error in parsing the content.
 *
 * @param webUrl URL which failed on parsing
 */
@Deprecated
protected void onParseError(WebURL webUrl) {
    logger.warn("Parsing error of: {}", webUrl.getURL());
    // Do nothing by default (except logging)
    // Subclasses can override this to add their custom functionality
}
Code example source: yasserg/crawler4j

/**
 * This function is called if the content of a url could not be fetched.
 *
 * @param webUrl URL whose content failed to be fetched
 *
 * @deprecated use {@link #onContentFetchError(Page)}
 */
@Deprecated
protected void onContentFetchError(WebURL webUrl) {
    logger.warn("Can't fetch content of: {}", webUrl.getURL());
    // Do nothing by default (except basic logging)
    // Subclasses can override this to add their custom functionality
}
Code example source: yasserg/crawler4j

@Override
public boolean equals(Object o) {
    if (this == o) {
        return true;
    }
    if ((o == null) || (getClass() != o.getClass())) {
        return false;
    }
    WebURL otherUrl = (WebURL) o;
    return (url != null) && url.equals(otherUrl.getURL());
}
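
An equals() override like this should always be paired with a hashCode() that hashes the same field, or hash-based collections will misbehave. A minimal companion sketch consistent with the comparison above (crawler4j's own implementation may differ, e.g. in null handling):

@Override
public int hashCode() {
    // must agree with equals(), which compares only the url field
    return url.hashCode();
}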
Code example source: yasserg/crawler4j

@Override
public boolean shouldVisit(Page referringPage, WebURL url) {
    String href = url.getURL().toLowerCase();
    return !FILE_ENDING_EXCLUSION_PATTERN.matcher(href).matches();
}
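
FILE_ENDING_EXCLUSION_PATTERN is defined elsewhere in that project. A sketch of how such a field is typically declared; the extension list here is illustrative, not the project's actual one (requires java.util.regex.Pattern):

// Hypothetical pattern: reject URLs ending in common binary/media extensions
private static final Pattern FILE_ENDING_EXCLUSION_PATTERN = Pattern.compile(
        ".*(\\.(css|js|gif|jpe?g|png|mp3|mp4|zip|gz|pdf))$");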
Code example source: yasserg/crawler4j

/**
 * This function is called if the content of a url could not be fetched.
 *
 * @param page Partial page object
 */
protected void onContentFetchError(Page page) {
    logger.warn("Can't fetch content of: {}", page.getWebURL().getURL());
    // Do nothing by default (except basic logging)
    // Subclasses can override this to add their custom functionality
}
Code example source: yasserg/crawler4j

/**
 * This function is called when an unhandled exception is encountered during fetching.
 *
 * @param webUrl URL where an unhandled exception occurred
 */
protected void onUnhandledException(WebURL webUrl, Throwable e) {
    if (myController.getConfig().isHaltOnError() && !(e instanceof IOException)) {
        throw new RuntimeException("unhandled exception", e);
    } else {
        String urlStr = (webUrl == null ? "NULL" : webUrl.getURL());
        logger.warn("Unhandled exception while fetching {}: {}", urlStr, e.getMessage());
        logger.info("Stacktrace: ", e);
        // Do nothing by default (except basic logging)
        // Subclasses can override this to add their custom functionality
    }
}
Code example source: yasserg/crawler4j

public void setProcessed(WebURL webURL) {
    counters.increment(Counters.ReservedCounterNames.PROCESSED_PAGES);
    if (inProcessPages != null) {
        if (!inProcessPages.removeURL(webURL)) {
            logger.warn("Could not remove: {} from list of processed pages.", webURL.getURL());
        }
    }
}
Code example source: yasserg/crawler4j

@Override
public void store(Page page) {
    if (page.getParseData() instanceof HtmlParseData) {
        try {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
            insertKeyStatement.setString(1, htmlParseData.getHtml());
            insertKeyStatement.setString(2, htmlParseData.getText());
            insertKeyStatement.setString(3, page.getWebURL().getURL());
            insertKeyStatement.setTimestamp(4, new Timestamp(new java.util.Date().getTime()));
            insertKeyStatement.executeUpdate();
        } catch (SQLException e) {
            logger.error("SQL Exception while storing webpage for url '{}'", page.getWebURL().getURL(), e);
            throw new RuntimeException(e);
        }
    }
}
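
The excerpt relies on an insertKeyStatement prepared elsewhere. A sketch of how it might be created; the table and column names are hypothetical, not taken from the original project:

// Hypothetical table: webpage(html TEXT, text TEXT, url VARCHAR, created_at TIMESTAMP)
// 'connection' is an open java.sql.Connection
PreparedStatement insertKeyStatement = connection.prepareStatement(
        "INSERT INTO webpage (html, text, url, created_at) VALUES (?, ?, ?, ?)");

The parameter indexes 1-4 in store() then line up with these four placeholders in order.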
Code example source: yasserg/crawler4j (fragment; surrounding try/catch context elided at the source)

URL url = new URL(webURL.getURL());
String host = getHost(url);
String path = url.getPath();
logger.error("Bad URL in Robots.txt: " + webURL.getURL(), e);
// note: the format string below has no {} placeholder, so SLF4J ignores the URL argument
logger.warn("RobotstxtServer: default: allow", webURL.getURL());
return true;
Code example source: yasserg/crawler4j

public boolean fetchContent(Page page, int maxBytes) throws SocketTimeoutException, IOException {
    try {
        page.setFetchResponseHeaders(responseHeaders);
        page.load(entity, maxBytes);
        return true;
    } catch (SocketTimeoutException e) {
        throw e;
    } catch (IOException | RuntimeException e) {
        if (haltOnError) {
            throw e;
        } else {
            logger.info("Exception while fetching content for: {} [{}]", page.getWebURL().getURL(),
                        e.getMessage());
        }
    }
    return false;
}
Code example source: yasserg/crawler4j

@Override
public void visit(Page page) {
    String url = page.getWebURL().getURL();
    logger.info("URL: " + url);
    if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        String text = htmlParseData.getText();
        String html = htmlParseData.getHtml();
        Set<WebURL> links = htmlParseData.getOutgoingUrls();
        logger.info("Text length: " + text.length());
        logger.info("Html length: " + html.length());
        logger.info("Number of outgoing links: " + links.size());
        try {
            postgresDBService.store(page);
        } catch (RuntimeException e) {
            logger.error("Storing failed", e);
        }
    }
}
Code example source: yasserg/crawler4j

private Set<WebURL> parseOutgoingUrls(WebURL referringPage) throws UnsupportedEncodingException {
    Set<String> extractedUrls = extractUrlInCssText(this.getTextContent());
    final String pagePath = referringPage.getPath();
    final String pageUrl = referringPage.getURL();
    Set<WebURL> outgoingUrls = new HashSet<>();
    for (String url : extractedUrls) {
        String relative = getLinkRelativeTo(pagePath, url);
        String absolute = getAbsoluteUrlFrom(URLCanonicalizer.getCanonicalURL(pageUrl), relative);
        WebURL webURL = new WebURL();
        webURL.setURL(absolute);
        outgoingUrls.add(webURL);
    }
    return outgoingUrls;
}
Code example source: yasserg/crawler4j (fragment; surrounding context elided at the source)

String toFetchURL = webUrl.getURL();
HttpUriRequest request = null;
try {
Code example source: yasserg/crawler4j (fragment; two separate error-handling paths, elided at the source)

    htmlParser.parse(inputStream, contentHandler, metadata, parseContext);
} catch (Exception e) {
    logger.error("{}, while parsing: {}", e.getMessage(), page.getWebURL().getURL());
    throw new ParseException("could not parse [" + page.getWebURL().getURL() + "]", e);

logger.error("error parsing the html: " + page.getWebURL().getURL(), e);
throw new ParseException("could not parse [" + page.getWebURL().getURL() + "]", e);
Code example source: yasserg/crawler4j (fragment; disjoint excerpts of the crawler's page-processing logic, elided at the source)

logger.debug(
    "Not visiting: {} as per the server's \"robots.txt\" policy",
    webURL.getURL());

String contentType = fetchResult.getEntity().getContentType() == null ? "" :
    fetchResult.getEntity().getContentType().getValue();
onUnexpectedStatusCode(curURL.getURL(), fetchResult.getStatusCode(),
    contentType, description);

if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) {
    if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) {
        logger.debug("Redirect page: {} has already been seen", curURL);

logger.warn(
    "Warning: unknown page size exceeded max-download-size, truncated to: " +
    "({}), at URL: {}",
    myController.getConfig().getMaxDownloadSize(), curURL.getURL());

parser.parse(page, curURL.getURL());

for (WebURL webURL : parseData.getOutgoingUrls()) {
    webURL.setParentDocid(curURL.getDocid());
    webURL.setParentUrl(curURL.getURL());
    int newdocid = docIdServer.getDocId(webURL.getURL());
    if (newdocid > 0) {

if (shouldVisit(page, webURL)) {
    if (robotstxtServer.allows(webURL)) {
Code example source: yasserg/crawler4j

@Override
public void objectToEntry(WebURL url, TupleOutput output) {
    output.writeString(url.getURL());
    output.writeInt(url.getDocid());
    output.writeInt(url.getParentDocid());
    output.writeString(url.getParentUrl());
    output.writeShort(url.getDepth());
    output.writeByte(url.getPriority());
    output.writeString(url.getAnchor());
}
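
objectToEntry() is one half of a Berkeley DB JE TupleBinding; the matching entryToObject() must read the fields back in exactly the order they were written. A sketch, assuming WebURL exposes the corresponding setters (setURL, setDocid, setParentDocid, setParentUrl, setDepth, setPriority, setAnchor):

@Override
public WebURL entryToObject(TupleInput input) {
    WebURL webURL = new WebURL();
    // read fields in the exact order objectToEntry() wrote them
    webURL.setURL(input.readString());
    webURL.setDocid(input.readInt());
    webURL.setParentDocid(input.readInt());
    webURL.setParentUrl(input.readString());
    webURL.setDepth(input.readShort());
    webURL.setPriority(input.readByte());
    webURL.setAnchor(input.readString());
    return webURL;
}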
Code example source: yasserg/crawler4j (fragment; two separate parse-error paths, elided at the source)

    page.setParseData(parseData);
} catch (Exception e) {
    logger.error("{}, while parsing css: {}", e.getMessage(), page.getWebURL().getURL());
    throw new ParseException();

    page.setParseData(parseData);
} catch (Exception e) {
    logger.error("{}, while parsing: {}", e.getMessage(), page.getWebURL().getURL());
    throw new ParseException(e);
Code example source: stackoverflow.com

public boolean shouldVisit(Page page, WebURL url) {
    String href = url.getURL().toLowerCase();
    // prefixes that you want to crawl
    String[] allowedPrefixes = {"http://url1.com", "http://url2.com"};
    for (String allowedPrefix : allowedPrefixes) {
        if (href.startsWith(allowedPrefix)) {
            return true;
        }
    }
    return false;
}
Code example source: biezhi/java-library-examples

@Override
public boolean shouldVisit(Page referringPage, WebURL url) {
    String href = url.getURL().toLowerCase();
    return !FILTERS.matcher(href).matches() && href.startsWith("http://www.ics.uci.edu/");
}
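
The shouldVisit()/visit() overrides above live in WebCrawler subclasses that are launched through a CrawlController. A minimal launch sketch following crawler4j's documented setup; the storage folder, seed URL, crawler count, and the class name MyCrawler are illustrative:

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

public class Launcher {
    public static void main(String[] args) throws Exception {
        CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder("/tmp/crawl");   // hypothetical storage folder
        PageFetcher pageFetcher = new PageFetcher(config);
        RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
        RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
        controller.addSeed("http://www.ics.uci.edu/");
        controller.start(MyCrawler.class, 8);         // MyCrawler: your WebCrawler subclass
    }
}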