本文整理了 Java 中 org.jsoup.nodes.Document 类的一些代码示例,展示了 Document 类的具体用法。这些代码示例主要来源于 Github/Stackoverflow/Maven 等平台,是从一些精选项目中提取出来的代码,具有较强的参考意义,能在一定程度上帮助到你。Document 类的具体详情如下:
包路径:org.jsoup.nodes.Document
类名称:Document
[英]An HTML Document.
[中]HTML文档。
代码示例来源:origin: RipMeApp/ripme
/**
 * Fetches the page that the current page's "next" link points to.
 *
 * @param page the page currently being processed
 * @return the document loaded from the resolved "next" link
 * @throws IOException with the message "No more pages" when no {@code li.page_next > a}
 *         element exists; also propagated from resolving or loading the link
 */
@Override
public Document getNextPage(Document page) throws IOException {
    Elements nextLinks = page.select("li.page_next > a");
    // Guard clause: without a "next" anchor there is nothing left to paginate.
    if (nextLinks.isEmpty()) {
        throw new IOException("No more pages");
    }
    // Resolve the href against this ripper's own URL so relative links work.
    String href = nextLinks.first().attr("href");
    URL target = new URL(this.url, href);
    return Http.url(target).get();
}
代码示例来源:origin: JpressProjects/jpress
/**
 * Rewrites root-relative image paths in an HTML fragment to absolute URLs, so that API clients
 * receiving article data can load the images directly.
 *
 * <p>Only {@code src} values that begin with a single {@code /} are prefixed with
 * {@code domain}. Protocol-relative values ({@code //host/path}) already name a host and are
 * left untouched; prefixing them would produce a broken URL.
 *
 * @param html   the HTML fragment to process
 * @param domain the prefix to prepend to root-relative paths; when blank, {@code html} is
 *               returned unchanged
 * @return the serialized children of the parsed body, with image paths rewritten
 */
public static String makeImageSrcToAbsolutePath(String html, String domain) {
    if (StrUtils.isBlank(domain)) {
        return html;
    }
    Document doc = Jsoup.parse(html);
    for (Element img : doc.select("img")) {
        String src = img.attr("src");
        boolean rootRelative = StrUtils.isNotBlank(src)
                && src.startsWith("/")
                && !src.startsWith("//"); // "//cdn.example.com/a.png" is not a local path
        if (rootRelative) {
            img.attr("src", domain + src);
        }
    }
    return doc.body().children().toString();
}
代码示例来源:origin: org.jsoup/jsoup
/**
 * Collapses every element with the given tag into the first such element and makes sure that
 * element is parented by {@code htmlEl}.
 *
 * @param tag    tag name whose elements are merged
 * @param htmlEl element that the surviving (first) element must be a child of
 */
private void normaliseStructure(String tag, Element htmlEl) {
Elements elements = this.getElementsByTag(tag);
Element master = elements.first(); // will always be available as created above if not existent
if (elements.size() > 1) { // dupes, move contents to master
// Children are gathered into a separate list first and appended to master only after all
// duplicates have been removed.
List<Node> toMove = new ArrayList<>();
// Start at index 1: index 0 is master itself.
for (int i = 1; i < elements.size(); i++) {
Node dupe = elements.get(i);
toMove.addAll(dupe.ensureChildNodes());
dupe.remove();
}
for (Node dupe : toMove)
master.appendChild(dupe);
}
// ensure parented by <html>
if (!master.parent().equals(htmlEl)) {
htmlEl.appendChild(master); // includes remove()
}
}
代码示例来源:origin: square/retrofit
/**
 * Parses the response body as HTML and returns a {@code Page} built from the document title
 * and the {@code href} value of every anchor that has one.
 *
 * @param responseBody the raw response to parse
 * @return a page holding the title and a read-only list of link targets
 * @throws IOException if reading the response body fails
 */
@Override
public Page convert(ResponseBody responseBody) throws IOException {
    String html = responseBody.string();
    Document parsed = Jsoup.parse(html);

    List<String> hrefs = new ArrayList<>();
    for (Element anchor : parsed.select("a[href]")) {
        String target = anchor.attr("href");
        hrefs.add(target);
    }

    String title = parsed.title();
    List<String> readOnlyLinks = Collections.unmodifiableList(hrefs);
    return new Page(title, readOnlyLinks);
}
}
代码示例来源:origin: org.jsoup/jsoup
/**
 Builds a new document from an untrusted one, keeping only the elements the whitelist allows.
 The untrusted document itself is left unmodified, and only the contents of its
 <code>body</code> are considered.
 @param dirtyDocument Untrusted base document to clean.
 @return cleaned document.
 */
public Document clean(Document dirtyDocument) {
    Validate.notNull(dirtyDocument);

    Document cleaned = Document.createShell(dirtyDocument.baseUri());
    // Frameset documents have no body; in that case the shell's empty body is returned as-is.
    if (dirtyDocument.body() == null) {
        return cleaned;
    }
    copySafeNodes(dirtyDocument.body(), cleaned.body());
    return cleaned;
}
代码示例来源:origin: RipMeApp/ripme
/**
 * Collects the {@code src} attribute of every image matched by {@code .vung-doc > img}.
 *
 * @param doc the chapter page to scan
 * @return the image sources in document order; empty when nothing matches
 */
private List<String> getURLsFromChap(Document doc) {
    String location = doc.location();
    LOGGER.debug("Getting urls from " + location);

    List<String> sources = new ArrayList<>();
    for (Element image : doc.select(".vung-doc > img")) {
        String src = image.attr("src");
        sources.add(src);
    }
    return sources;
}
代码示例来源:origin: RipMeApp/ripme
/**
 * Builds the API URL for the given page and loads it as JSON.
 *
 * <p>The endpoint is requested exactly once; the response text is both logged and parsed.
 *
 * @param page   page value handed to {@code apiURLBuilder}
 * @param apiKey API key handed to {@code apiURLBuilder}
 * @return the parsed response, or {@code null} when the API URL is malformed or the request
 *         fails
 */
private JSONObject getJSON(String page, String apiKey) {
    String apiURL = null;
    URL pageURL;
    try {
        apiURL = apiURLBuilder(getPhotosetID(url.toExternalForm()), page, apiKey);
        pageURL = new URL(apiURL);
    } catch (MalformedURLException e) {
        LOGGER.error("Unable to get api link " + apiURL + " is malformed");
        // Without a valid URL there is nothing to request; do not continue with a null URL.
        return null;
    }
    try {
        // Fetch once and reuse the body for both the log line and the parse.
        String body = Http.url(pageURL).ignoreContentType().get().text();
        LOGGER.info(body);
        return new JSONObject(body);
    } catch (IOException e) {
        // This is a load failure, not a malformed URL; keep the cause for diagnosis.
        LOGGER.error("Unable to load api link " + apiURL, e);
        return null;
    }
}
代码示例来源:origin: RipMeApp/ripme
// NOTE(review): this excerpt appears abridged. `metaTag` is used without a visible declaration
// (presumably a loop over `metaTags`), `break` has no visible enclosing loop, and the `catch`
// has no visible `try`. The enclosing method signature is also not shown — confirm against the
// original project before reusing.
Document doc = Http.url(url).get();
Elements metaTags = doc.getElementsByTag("meta");
if (metaTag.attr("property").equals("og:image")) {
imgsrc = metaTag.attr("content");
LOGGER.info("Found URL " + imgsrc);
break;//only one (useful) image possible for an "image page".
LOGGER.warn("Image not found at " + this.url);
return;
addURLToDownload(new URL(imgsrc), prefix);
} catch (IOException e) {
LOGGER.error("[!] Exception while loading/parsing " + this.url, e);
代码示例来源:origin: RipMeApp/ripme
/**
 * Loads this ripper's page, reads the video stream URL from the
 * {@code twitter:player:stream} meta tag and queues it for download.
 *
 * @throws IOException if the page cannot be loaded or carries no such meta tag
 */
@Override
public void rip() throws IOException {
    LOGGER.info(" Retrieving " + this.url.toExternalForm());
    Document doc = Http.url(this.url).get();
    Elements videos = doc.select("meta[name=twitter:player:stream]");
    if (videos.isEmpty()) {
        throw new IOException("Could not find twitter:player:stream at " + url);
    }
    String vidUrl = videos.first().attr("content");
    // Undo any remaining HTML-escaped ampersands in the query string. The previous
    // replaceAll("&", "&") replaced each ampersand with itself and therefore did nothing.
    vidUrl = vidUrl.replace("&amp;", "&");
    addURLToDownload(new URL(vidUrl), HOST + "_" + getGID(this.url));
    waitForThreads();
}
}
代码示例来源:origin: RipMeApp/ripme
/**
 * Follows the page's {@code iframe} to the embedded player page and returns the video URL
 * found in its {@code source} element.
 *
 * @param doc the page containing the player iframe
 * @return a list holding the video URL, or an empty list when the iframe URL is invalid, the
 *         player page cannot be loaded, or it exposes no video source
 */
@Override
public List<String> getURLsFromPage(Document doc) {
    List<String> results = new ArrayList<>();
    String duckMoviesUrl = doc.select("iframe").attr("src");
    try {
        Document duckDoc = Http.url(new URL(duckMoviesUrl)).get();
        String videoURL = duckDoc.select("source").attr("src");
        if (videoURL.isEmpty()) {
            // An empty string is not a downloadable URL; report it instead of returning it.
            LOGGER.warn("No video source found at " + duckMoviesUrl);
        } else {
            // remove any white spaces so we can download the movie without a 400 error
            results.add(videoURL.replace(" ", "%20"));
        }
    } catch (MalformedURLException e) {
        LOGGER.error(duckMoviesUrl + " is not a valid url");
    } catch (IOException e) {
        // Log through the logger (with the cause) rather than printing the stack trace.
        LOGGER.error("Unable to load page " + duckMoviesUrl, e);
    }
    return results;
}
代码示例来源:origin: RipMeApp/ripme
/**
 * Loads this worker's page, locates the first image under {@code #photoImageSection} and
 * queues it for download.
 *
 * <p>Load and parse failures are logged rather than thrown. A page without a matching image
 * is logged as a warning and skipped.
 */
private void fetchImage() {
    try {
        Document doc = Http.url(this.url)
                .referrer(this.url)
                .get();
        // Find image
        Element image = doc.select("#photoImageSection img").first();
        if (image == null) {
            // first() yields null on an empty selection; bail out instead of hitting an NPE.
            LOGGER.warn("Image not found at " + this.url);
            return;
        }
        String imgsrc = image.attr("src");
        LOGGER.info("Found URL " + imgsrc + " via " + image);
        // Provide prefix and let the AbstractRipper "guess" the filename
        String prefix = "";
        if (Utils.getConfigBoolean("download.save_order", true)) {
            prefix = String.format("%03d_", index);
        }
        // Resolve against the page URL so relative image sources work.
        URL imgurl = new URL(url, imgsrc);
        addURLToDownload(imgurl, prefix);
    } catch (IOException e) {
        LOGGER.error("[!] Exception while loading/parsing " + this.url, e);
    }
}
}
代码示例来源:origin: RipMeApp/ripme
/**
 * Hands every image (or the single video) found on the page to {@code downloadFile}.
 *
 * <p>The returned list is never populated by this method.
 *
 * @param doc the listing page or video page
 * @return an empty list
 */
@Override
public List<String> getURLsFromPage(Document doc) {
    LOGGER.debug("Checking for urls");
    List<String> result = new ArrayList<>();

    // Video pages carry a single link inside the player container.
    if (isVideoUrl(url)) {
        String videoLink = doc.select("div.player-container > a").attr("href");
        downloadFile(videoLink);
        return result;
    }

    // Listing pages: each item links to a page that holds the actual image.
    for (Element item : doc.select("div.items > div.item-container > a.item")) {
        String itemPageUrl = item.attr("href");
        try {
            Document itemPage = Http.url(new URL(itemPageUrl)).get();
            String imageSrc = itemPage.select("div.picture_container > a > img").attr("src");
            downloadFile(imageSrc);
        } catch (IOException e) {
            LOGGER.error("Was unable to load page " + itemPageUrl);
        }
    }
    return result;
}
代码示例来源:origin: RipMeApp/ripme
/**
 * Loads an image page and returns the URL of the gallery it links to.
 *
 * @param url the image page URL
 * @return the gallery URL built from the first matching {@code gallery.php} link
 * @throws IOException if the page cannot be loaded or contains no gallery link
 */
private URL getGalleryFromImage(URL url) throws IOException {
    Document imagePage = Http.url(url).get();
    for (Element anchor : imagePage.select("a[href~=^gallery\\.php.*$]")) {
        LOGGER.info("LINK: " + anchor.toString());
        boolean pointsToGallery = anchor.hasAttr("href")
                && anchor.attr("href").contains("gallery.php");
        if (!pointsToGallery) {
            continue;
        }
        URL gallery = new URL("http://imagearn.com/" + anchor.attr("href"));
        LOGGER.info("[!] Found gallery from given link: " + gallery);
        return gallery;
    }
    throw new IOException("Failed to find gallery at URL " + url);
}
代码示例来源:origin: RipMeApp/ripme
/**
 * Loads this ripper's page and queues a download for the first {@code "source":"..."} value
 * found in each {@code script} element, naming files after the page title.
 *
 * @throws IOException if the page cannot be loaded or contains no script elements
 */
@Override
public void rip() throws IOException {
    LOGGER.info("Retrieving " + this.url);
    Document doc = Http.url(url).get();

    // The page title doubles as a user-friendly file name.
    String title = doc.title();

    Elements scripts = doc.select("script");
    if (scripts.isEmpty()) {
        throw new IOException("Could not find script code at " + url);
    }

    // The pattern relies on the highest-quality source being listed first.
    Pattern sourcePattern = Pattern.compile("\"source\":\"(.*?)\"");
    for (Element scriptElement : scripts) {
        Matcher match = sourcePattern.matcher(scriptElement.data());
        if (!match.find()) {
            continue;
        }
        URL video = new URL(match.group(1));
        addURLToDownload(video, HOST + "_" + title);
    }
    waitForThreads();
}
}
代码示例来源:origin: RipMeApp/ripme
/**
 * Loads this ripper's page and queues the video referenced by
 * {@code .wp-video > video > source} for download.
 *
 * @throws IOException if the page cannot be loaded or has no matching source element
 */
@Override
public void rip() throws IOException {
    LOGGER.info("Retrieving " + this.url);
    Document page = Http.url(url).get();

    Elements sources = page.select(".wp-video > video > source");
    if (sources.isEmpty()) {
        throw new IOException("Could not find Embed code at " + url);
    }

    URL video = new URL(sources.attr("src"));
    String prefix = HOST + "_" + getGID(this.url);
    addURLToDownload(video, prefix);
    waitForThreads();
}
}
代码示例来源:origin: RipMeApp/ripme
/**
 * Loads this ripper's page, extracts the values between {@code file:"} and {@code "} from its
 * HTML and queues the first one for download.
 *
 * @throws IOException if the page cannot be loaded or no such value is found
 */
@Override
public void rip() throws IOException {
    LOGGER.info("Retrieving " + this.url);
    Document page = Http.url(url).get();

    String markup = page.html();
    List<String> files = Utils.between(markup, "file:\"", "\"");
    if (files.isEmpty()) {
        throw new IOException("Could not find files at " + url);
    }

    // Only the first entry is downloaded.
    URL video = new URL(files.get(0));
    String prefix = HOST + "_" + getGID(this.url);
    addURLToDownload(video, prefix);
    waitForThreads();
}
}
代码示例来源:origin: RipMeApp/ripme
/**
 * Loads an album page and collects the URLs of its pictures and videos.
 *
 * <p>Pictures are {@code img} elements with class {@code album-image}; videos are
 * {@code video} elements with class {@code album-video}, whose URL is taken from their first
 * {@code source} descendant. A video element without any {@code source} is skipped.
 *
 * @param url the album page URL
 * @return picture URLs followed by video URLs, each group in document order
 * @throws IOException if the page cannot be loaded or a collected value is not a valid URL
 */
public static List<URL> getURLs(URL url) throws IOException {
    Response resp = Http.url(url)
            .ignoreContentType()
            .response();
    Document doc = resp.parse();
    List<URL> urls = new ArrayList<>();

    // Pictures
    for (Element img : doc.getElementsByTag("img")) {
        if (img.hasClass("album-image")) {
            urls.add(new URL(img.attr("src")));
        }
    }

    // Videos
    for (Element vid : doc.getElementsByTag("video")) {
        if (!vid.hasClass("album-video")) {
            continue;
        }
        Element source = vid.getElementsByTag("source").first();
        if (source == null) {
            // first() yields null on an empty selection; skip rather than fail the whole album.
            continue;
        }
        urls.add(new URL(source.attr("src")));
    }
    return urls;
}
}
代码示例来源:origin: loklak/loklak_server
/**
 * Article API: fills {@code genericScraperData} with "query", "data" and "NLP" entries for
 * the given URL and returns it.
 *
 * NOTE(review): the primary extractor call is commented out, so {@code data} is null when the
 * first set of puts runs. The Jsoup fallback sits inside the catch block and is additionally
 * guarded by {@code "".equals(data)}, which is false once {@code data} has been set to null —
 * so the fallback looks unreachable as written. Confirm the intended behaviour.
 *
 * @param url address of the article; parsed with {@code new URL(url)}
 * @param genericScraperData JSON object that receives the entries
 * @return the same {@code genericScraperData} instance that was passed in
 * @throws MalformedURLException if {@code url} cannot be parsed as a URL
 */
public JSONObject articleAPI (String url, JSONObject genericScraperData) throws MalformedURLException{
URL qurl = new URL(url);
String data = "";
try {
data = null;// ArticleExtractor.INSTANCE.getText(qurl);
genericScraperData.put("query", qurl);
// NOTE(review): data is null here; how put() treats a null value depends on the JSONObject
// implementation in use — verify.
genericScraperData.put("data", data);
genericScraperData.put("NLP", "true");
}
catch (Exception e) {
// Fallback path: fetch the page with Jsoup and use its plain text, marking NLP as "false".
if ("".equals(data)) {
try {
Document htmlPage = Jsoup.connect(url).get();
data = htmlPage.text();
genericScraperData.put("query", qurl);
genericScraperData.put("data", data);
genericScraperData.put("NLP", "false");
// NOTE(review): failures of the fallback are silently discarded here.
} catch (Exception ex) {}
}
}
return genericScraperData;
}
代码示例来源:origin: RipMeApp/ripme
/**
 * Builds the album title for the given URL.
 *
 * <p>For profile URLs the title is the part after {@code /u/}. Otherwise the title is built
 * from the host, the GID and the last path segment of the page's {@code og:title} meta
 * content. If the first page cannot be loaded or has no {@code og:title} meta tag, the
 * default naming from the superclass is used.
 *
 * @param url the album or profile URL
 * @return the album title
 * @throws MalformedURLException propagated from {@code getGID} or the superclass
 */
@Override
public String getAlbumTitle(URL url) throws MalformedURLException {
    if (is_profile(url)) {
        return url.toExternalForm().split("/u/")[1];
    }
    try {
        // Attempt to use album title as GID
        Element titleElement = getFirstPage().select("meta[property=og:title]").first();
        if (titleElement != null) {
            String title = titleElement.attr("content");
            title = title.substring(title.lastIndexOf('/') + 1);
            return getHost() + "_" + getGID(url) + "_" + title.trim();
        }
        // first() yields null when the meta tag is missing; treat it like a failed lookup
        // instead of letting an NPE escape.
        LOGGER.info("Unable to find title at " + url);
    } catch (IOException e) {
        // Fall back to default album naming convention
        LOGGER.info("Unable to find title at " + url);
    }
    return super.getAlbumTitle(url);
}
代码示例来源:origin: RipMeApp/ripme
/**
 * Loads an image page and returns the image URL advertised in its {@code og:image} meta tag,
 * with any {@code ?h=<digits>} fragment removed.
 *
 * @param url the image page to load
 * @return the image URL, or an empty string (after logging an error) when no
 *         {@code og:image} meta tag is present
 * @throws IOException if the page cannot be loaded
 */
private String vscoImageToURL(String url) throws IOException {
    Document page = Jsoup.connect(url).userAgent(USER_AGENT).get();

    String imageUrl = "";
    // Scan the meta tags and stop at the first one declaring og:image; only one image is
    // expected per page.
    for (Element meta : page.getElementsByTag("meta")) {
        if (!"og:image".equals(meta.attr("property"))) {
            continue;
        }
        // Drop the "?h=xxx" part (xxx being digits) from the advertised URL.
        String stripped = meta.attr("content").replaceAll("\\?h=[0-9]+", "");
        imageUrl = stripped;
        LOGGER.debug("Found image URL: " + stripped);
        break;
    }

    // An empty result means the site layout changed and this code needs updating.
    if (imageUrl.isEmpty()) {
        LOGGER.error("Could not find image URL at: " + url);
    }
    return imageUrl;
}
内容来源于网络,如有侵权,请联系作者删除!