本文整理了Java中edu.uci.ics.crawler4j.url.WebURL.setURL()方法的一些代码示例,展示了WebURL.setURL()的具体用法。这些代码示例主要来源于Github/Stackoverflow/Maven等平台,是从一些精选项目中提取出来的代码,具有较强的参考意义,能在一定程度上帮助到你。WebURL.setURL()方法的具体详情如下:
包路径:edu.uci.ics.crawler4j.url.WebURL
类名称:WebURL
方法名:setURL
方法说明:暂无
代码示例来源:origin: yasserg/crawler4j
/**
 * Turns every URL extracted from this object's text content into a {@link WebURL}.
 *
 * <p>Each extracted URL is first made relative to the referring page's path and then
 * resolved to an absolute URL against the canonical form of the referring page's URL.
 *
 * @param referringPage the page whose path and URL serve as the base for resolution
 * @return the set of resolved outgoing URLs (empty when nothing was extracted)
 * @throws UnsupportedEncodingException declared by the signature; propagated from the helpers
 */
private Set<WebURL> parseOutgoingUrls(WebURL referringPage) throws UnsupportedEncodingException {
    final Set<String> cssUrls = extractUrlInCssText(this.getTextContent());
    final String basePath = referringPage.getPath();
    final String baseUrl = referringPage.getURL();

    final Set<WebURL> resolved = new HashSet<>();
    for (String cssUrl : cssUrls) {
        final String relativeLink = getLinkRelativeTo(basePath, cssUrl);
        final String absoluteLink =
            getAbsoluteUrlFrom(URLCanonicalizer.getCanonicalURL(baseUrl), relativeLink);

        final WebURL outgoing = new WebURL();
        outgoing.setURL(absoluteLink);
        resolved.add(outgoing);
    }
    return resolved;
}
代码示例来源:origin: yasserg/crawler4j
// Build a WebURL for a discovered link. The TLD list is assigned before the URL
// (presumably so that setURL can use it when deriving the domain — TODO confirm in WebURL).
WebURL webURL = new WebURL();
webURL.setTldList(tldList);
webURL.setURL(url);
// Copy the tag and anchor from the URL/anchor pair onto the new WebURL.
webURL.setTag(urlAnchorPair.getTag());
webURL.setAnchor(urlAnchorPair.getAnchor());
代码示例来源:origin: yasserg/crawler4j
// Populate the WebURL with the canonical URL and its document id; depth is set to 0.
webUrl.setURL(canonicalUrl);
webUrl.setDocid(docId);
webUrl.setDepth((short) 0);
代码示例来源:origin: yasserg/crawler4j
/**
 * Rebuilds a {@link WebURL} from its tuple form.
 *
 * <p>Values are consumed from {@code input} in a fixed order: URL, doc id, parent doc id,
 * parent URL, depth, priority, anchor. The order of the reads must not be changed.
 *
 * @param input the tuple to read the fields from
 * @return a new WebURL carrying the values read from {@code input}
 */
@Override
public WebURL entryToObject(TupleInput input) {
    final WebURL restored = new WebURL();
    restored.setURL(input.readString());
    restored.setDocid(input.readInt());
    restored.setParentDocid(input.readInt());
    restored.setParentUrl(input.readString());
    restored.setDepth(input.readShort());
    restored.setPriority(input.readByte());
    restored.setAnchor(input.readString());
    return restored;
}
代码示例来源:origin: yasserg/crawler4j
// NOTE(review): this is an excerpt with lines elided. As shown, the statements after
// `return;` would be unreachable; presumably they sit in a different branch in the
// original source — confirm against the full method.
// Point the WebURL at movedToUrl and carry over the current URL's parent doc id and parent URL.
webURL.setURL(movedToUrl);
webURL.setParentDocid(curURL.getParentDocid());
webURL.setParentUrl(curURL.getParentUrl());
return;
// Replace the current URL with the URL reported by the fetch result and give it a new doc id.
curURL.setURL(fetchResult.getFetchedUrl());
curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl()));
代码示例来源:origin: edu.uci.ics/crawler4j
/**
 * Finds every match of the URL pattern in {@code input} and wraps each one in a {@link WebURL}.
 *
 * <p>Matches that do not start with {@code "http"} are prefixed with {@code "http://"}.
 *
 * @param input the text to scan; may be {@code null}
 * @return the extracted URLs; an empty set when {@code input} is {@code null} or has no match
 */
public static Set<WebURL> extractUrls(String input) {
    final Set<WebURL> found = new HashSet<>();
    if (input == null) {
        return found;
    }
    for (Matcher hits = pattern.matcher(input); hits.find(); ) {
        final WebURL link = new WebURL();
        final String candidate = hits.group();
        // Give scheme-less matches a default scheme so they form a complete URL.
        link.setURL(candidate.startsWith("http") ? candidate : "http://" + candidate);
        found.add(link);
    }
    return found;
}
代码示例来源:origin: tim232385/WebVideoBot
/**
 * Rewrites a URL to its embed form when a view key can be extracted from it.
 *
 * <p>If {@code getViewkey} yields a key, a new {@link WebURL} pointing at the embed URL for
 * that key is returned. Otherwise the decision is delegated to the superclass.
 *
 * @param webURL the URL about to be processed
 * @return the embed URL when a view key is present, else the superclass result
 */
@Override
protected WebURL handleUrlBeforeProcess(WebURL webURL) {
    return getViewkey(webURL)
        .map(key -> "https://www.pornhub.com/embed/" + key)
        .map(url -> {
            WebURL newUrl = new WebURL();
            newUrl.setURL(url);
            return newUrl;
        })
        // orElseGet defers the superclass call until it is needed; orElse would evaluate
        // it on every invocation, even when a view key was found.
        .orElseGet(() -> super.handleUrlBeforeProcess(webURL));
}
代码示例来源:origin: tim232385/WebVideoBot
/**
 * Fetches {@code url} and, when the response status is 200 OK, writes the body to {@code file}.
 *
 * <p>Non-OK responses and pages exceeding the maximum download size are skipped and logged.
 * A dedicated {@link PageFetcher} is created for each call and shut down before returning,
 * so that its resources do not accumulate across downloads.
 *
 * @param config crawl configuration used to build the page fetcher
 * @param url the address to download
 * @param file destination file for the response body
 * @throws InterruptedException propagated from the fetch
 * @throws IOException propagated from the fetch or from writing the file
 */
public void download(CrawlConfig config, String url, File file) throws InterruptedException, IOException {
    PageFetcher pageFetcher = new PageFetcher(config);
    try {
        WebURL curURL = new WebURL();
        curURL.setURL(url);
        PageFetchResult fetchResult = null;
        try {
            fetchResult = pageFetcher.fetchPage(curURL);
            if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
                logger.info("Start download filePath:[{}]", file);
                FileUtils.copyInputStreamToFile(fetchResult.getEntity().getContent(), file);
                logger.info("Download Finish filePath:[{}].", file);
            } else {
                logger.info("Skip download url:[{}], HttpStatus:[{}]", url, fetchResult.getStatusCode());
            }
        } catch (PageBiggerThanMaxSizeException e) {
            logger.debug("PageBiggerThanMaxSizeException", e);
            logger.info("Skip download url:[{}], Out of MaxDownloadSize", url);
        } finally {
            if (fetchResult != null) {
                fetchResult.discardContentIfNotConsumed();
            }
        }
    } finally {
        // The fetcher is local to this call; release it even when the fetch or copy throws.
        pageFetcher.shutDown();
    }
}
代码示例来源:origin: stackoverflow.com
/**
 * Adds a seed URL to the frontier.
 *
 * <p>The URL is canonicalized first; an invalid URL is logged and ignored. When
 * {@code docId} is negative, a doc id is looked up and the seed is skipped if the URL has
 * already been seen, otherwise a new doc id is allocated. When {@code docId} is
 * non-negative, the URL/doc id pair is registered; a failure to register is logged and the
 * seed is still processed. The seed is scheduled at depth 0 unless robots.txt disallows it.
 *
 * @param pageUrl the seed URL
 * @param docId the doc id to use, or a negative value to have one assigned
 */
public void addSeed(String pageUrl, int docId) {
    String canonicalUrl = URLCanonicalizer.getCanonicalURL(pageUrl);
    if (canonicalUrl == null) {
        logger.error("Invalid seed URL: " + pageUrl);
        return;
    }
    if (docId < 0) {
        docId = docIdServer.getDocId(canonicalUrl);
        if (docId > 0) {
            // This URL is already seen.
            return;
        }
        docId = docIdServer.getNewDocID(canonicalUrl);
    } else {
        try {
            docIdServer.addUrlAndDocId(canonicalUrl, docId);
        } catch (Exception e) {
            // Pass the exception itself so the stack trace is not lost; the message text is unchanged.
            logger.error("Could not add seed: " + e.getMessage(), e);
        }
    }
    WebURL webUrl = new WebURL();
    webUrl.setURL(canonicalUrl);
    webUrl.setDocid(docId);
    webUrl.setDepth((short) 0);
    if (!robotstxtServer.allows(webUrl)) {
        logger.info("Robots.txt does not allow this seed: " + pageUrl);
    } else {
        frontier.schedule(webUrl); //method that adds URL to the frontier at run time
    }
}
代码示例来源:origin: edu.uci.ics/crawler4j
// Excerpt: the closing brace of this `if` lies outside the lines shown.
// Only build a WebURL when a URL is available.
if (url != null) {
WebURL webURL = new WebURL();
webURL.setURL(url);
// Copy the tag and anchor from the URL/anchor pair onto the new WebURL.
webURL.setTag(urlAnchorPair.getTag());
webURL.setAnchor(urlAnchorPair.getAnchor());
代码示例来源:origin: edu.uci.ics/crawler4j
// Populate the WebURL with the canonical URL and its document id; depth is set to 0.
webUrl.setURL(canonicalUrl);
webUrl.setDocid(docId);
webUrl.setDepth((short) 0);
代码示例来源:origin: biezhi/java-library-examples
/**
 * Fetches and parses a single page.
 *
 * <p>Returns the parsed {@link Page} when the fetch answers with 200 OK. Any other status,
 * or any exception raised while fetching or parsing (which is logged), yields {@code null}.
 * Unconsumed response content is always discarded before returning.
 *
 * @param url the address to fetch
 * @return the parsed page, or {@code null} when it could not be fetched or parsed
 */
private Page download(String url) {
    final WebURL target = new WebURL();
    target.setURL(url);

    PageFetchResult result = null;
    try {
        result = pageFetcher.fetchPage(target);
        if (result.getStatusCode() != HttpStatus.SC_OK) {
            return null;
        }
        final Page fetched = new Page(target);
        result.fetchContent(fetched, pageFetcher.getConfig().getMaxDownloadSize());
        parser.parse(fetched, target.getURL());
        return fetched;
    } catch (Exception e) {
        logger.error("Error occurred while fetching url: " + target.getURL(), e);
        return null;
    } finally {
        if (result != null) {
            result.discardContentIfNotConsumed();
        }
    }
}
}
代码示例来源:origin: tjake/stormscraper
// Store the canonical forms of the current URL and the start URL on their WebURL objects.
curURL.setURL(URLCanonicalizer.getCanonicalURL(currentUrl));
baseURL.setURL(URLCanonicalizer.getCanonicalURL(startUrl));
代码示例来源:origin: edu.uci.ics/crawler4j
/**
 * Rebuilds a {@link WebURL} from its tuple form.
 *
 * <p>Values are consumed from {@code input} in a fixed order: URL, doc id, parent doc id,
 * parent URL, depth, priority, anchor. The order of the reads must not be changed.
 *
 * @param input the tuple to read the fields from
 * @return a new WebURL carrying the values read from {@code input}
 */
@Override
public WebURL entryToObject(TupleInput input) {
    final WebURL restored = new WebURL();
    restored.setURL(input.readString());
    restored.setDocid(input.readInt());
    restored.setParentDocid(input.readInt());
    restored.setParentUrl(input.readString());
    restored.setDepth(input.readShort());
    restored.setPriority(input.readByte());
    restored.setAnchor(input.readString());
    return restored;
}
代码示例来源:origin: edu.uci.ics/crawler4j
// NOTE(review): this is an excerpt with lines elided. As shown, the statements after
// `return;` would be unreachable; presumably they sit in a different branch in the
// original source — confirm against the full method.
// Point the WebURL at movedToUrl and carry over the current URL's parent doc id and parent URL.
webURL.setURL(movedToUrl);
webURL.setParentDocid(curURL.getParentDocid());
webURL.setParentUrl(curURL.getParentUrl());
return;
// Replace the current URL with the URL reported by the fetch result and give it a new doc id.
curURL.setURL(fetchResult.getFetchedUrl());
curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl()));
代码示例来源:origin: biezhi/java-library-examples
// Fetch an HTML page from the local server through PageFetcherHtmlOnly and read its content
// (the second fetchContent argument is presumably a maximum size in bytes — TODO confirm).
WebURL url = new WebURL();
url.setURL("http://localhost:8080/some/index.html");
PageFetcher pf = new PageFetcherHtmlOnly(cfg);
pf.fetchPage(url).fetchContent(new Page(url), 47);
// Check with WireMock that exactly one GET request was made for /some/index.html.
WireMock.verify(1, WireMock.getRequestedFor(WireMock.urlEqualTo("/some/index.html")));
// Reuse the same WebURL for a PDF path, with a fresh fetcher instance.
url.setURL("http://localhost:8080/some/invoice.pdf");
pf = new PageFetcherHtmlOnly(cfg);
pf.fetchPage(url).fetchContent(new Page(url), 4);
内容来源于网络,如有侵权,请联系作者删除!