org.jsoup.nodes.Document类的使用及代码示例

x33g5p2x  于2022-01-18 转载在 其他  
字(11.7k)|赞(0)|评价(0)|浏览(539)

本文整理了Java中org.jsoup.nodes.Document类的一些代码示例,展示了Document类的具体用法。这些代码示例主要来源于Github/Stackoverflow/Maven等平台,是从一些精选项目中提取出来的代码,具有较强的参考意义,能在一定程度上帮助到你。Document类的具体详情如下:
包路径:org.jsoup.nodes.Document
类名称:Document

Document介绍

[英]A HTML Document.
[中]HTML文档。

代码示例

代码示例来源:origin: RipMeApp/ripme

/**
 * Loads the page linked from the "next page" control of {@code page}.
 *
 * @param page the page currently being processed
 * @return the document fetched from the first {@code li.page_next > a} link
 * @throws IOException if {@code page} contains no such link; also propagated from
 *         building the next URL or fetching it
 */
@Override
public Document getNextPage(Document page) throws IOException {
  Elements nextLinks = page.select("li.page_next > a");
  if (nextLinks.isEmpty()) {
    throw new IOException("No more pages");
  }
  // The href is resolved against this ripper's URL, so relative links work too.
  String href = nextLinks.first().attr("href");
  URL target = new URL(this.url, href);
  return Http.url(target).get();
}

代码示例来源:origin: JpressProjects/jpress

/**
 * Rewrites root-relative image sources in an HTML string as absolute paths, so that
 * clients receiving article data through the API can display the images directly.
 *
 * @param html   the HTML to process
 * @param domain the prefix prepended to every {@code src} that starts with {@code "/"}
 * @return {@code html} unchanged when {@code domain} is blank; otherwise the string form
 *         of the parsed body's children after the rewrite
 */
public static String makeImageSrcToAbsolutePath(String html, String domain) {
  if (StrUtils.isBlank(domain)) {
    return html;
  }
  Document parsed = Jsoup.parse(html);
  Elements images = parsed.select("img");
  if (images != null) {
    for (Element image : images) {
      String source = image.attr("src");
      boolean rootRelative = StrUtils.isNotBlank(source) && source.startsWith("/");
      if (!rootRelative) {
        continue;
      }
      image.attr("src", domain + source);
    }
  }
  return parsed.body().children().toString();
}

代码示例来源:origin: org.jsoup/jsoup

/**
 * Collapses all elements with the given tag into the first one and makes sure that
 * element is a child of {@code htmlEl}.
 *
 * @param tag    tag name to normalise
 * @param htmlEl element that the surviving element must be parented by
 */
private void normaliseStructure(String tag, Element htmlEl) {
  Elements matches = this.getElementsByTag(tag);
  // Expected to be present: the caller creates the element beforehand if it was missing.
  Element primary = matches.first();
  if (matches.size() > 1) {
    // Duplicates exist: collect their children first, then re-home them under the primary.
    List<Node> orphans = new ArrayList<>();
    int index = 1;
    while (index < matches.size()) {
      Node duplicate = matches.get(index);
      orphans.addAll(duplicate.ensureChildNodes());
      duplicate.remove();
      index++;
    }
    for (Node orphan : orphans) {
      primary.appendChild(orphan);
    }
  }
  // Ensure the surviving element hangs off <html>.
  boolean parentedByHtml = primary.parent().equals(htmlEl);
  if (!parentedByHtml) {
    htmlEl.appendChild(primary); // appendChild also detaches it from its old parent
  }
}

代码示例来源:origin: square/retrofit

/**
 * Parses the response body as HTML and turns it into a {@code Page} made of the document
 * title and the {@code href} value of every {@code a[href]} element.
 *
 * @param responseBody body whose string content is parsed
 * @return a {@code Page} holding the title and an unmodifiable list of link targets
 * @throws IOException propagated from reading the response body
 */
@Override public Page convert(ResponseBody responseBody) throws IOException {
  String html = responseBody.string();
  Document parsed = Jsoup.parse(html);
  String title = parsed.title();
  List<String> hrefs = new ArrayList<>();
  for (Element anchor : parsed.select("a[href]")) {
    hrefs.add(anchor.attr("href"));
  }
  List<String> readOnlyHrefs = Collections.unmodifiableList(hrefs);
  return new Page(title, readOnlyHrefs);
 }
}

代码示例来源:origin: org.jsoup/jsoup

/**
 Builds a new, clean document from an untrusted one, keeping only elements allowed by the whitelist.
 The input document is left unmodified. Only content of the dirty document's <code>body</code> is copied.
 @param dirtyDocument Untrusted base document to clean; must not be null.
 @return cleaned document, created with the same base URI as the input.
 */
public Document clean(Document dirtyDocument) {
  Validate.notNull(dirtyDocument);
  Document sanitized = Document.createShell(dirtyDocument.baseUri());
  if (dirtyDocument.body() == null) {
    // Frameset documents have no body; the clean shell keeps its empty body.
    return sanitized;
  }
  copySafeNodes(dirtyDocument.body(), sanitized.body());
  return sanitized;
}

代码示例来源:origin: RipMeApp/ripme

/**
 * Collects the {@code src} attribute of every {@code .vung-doc > img} element in the
 * given document.
 *
 * @param doc document to scan
 * @return the {@code src} values in document order; empty if nothing matched
 */
private List<String> getURLsFromChap(Document doc) {
  LOGGER.debug("Getting urls from " + doc.location());
  List<String> imageSources = new ArrayList<>();
  for (Element image : doc.select(".vung-doc > img")) {
    String source = image.attr("src");
    imageSources.add(source);
  }
  return imageSources;
}

代码示例来源:origin: RipMeApp/ripme

/**
 * Requests one page of API results and parses the response text as JSON.
 *
 * @param page   page value handed to {@code apiURLBuilder}
 * @param apiKey API key handed to {@code apiURLBuilder}
 * @return the parsed response, or {@code null} if the API URL is malformed or the
 *         request fails
 */
private JSONObject getJSON(String page, String apiKey) {
  String apiURL = null;
  URL pageURL;
  try {
    apiURL = apiURLBuilder(getPhotosetID(url.toExternalForm()), page, apiKey);
    pageURL = new URL(apiURL);
  } catch (MalformedURLException e) {
    LOGGER.error("Unable to get api link " + apiURL + " is malformed");
    // Without a valid URL there is nothing to request.
    return null;
  }
  try {
    // Fetch once and reuse the text for both logging and parsing, avoiding a duplicate request.
    String body = Http.url(pageURL).ignoreContentType().get().text();
    LOGGER.info(body);
    return new JSONObject(body);
  } catch (IOException e) {
    LOGGER.error("Unable to load api link " + apiURL, e);
    return null;
  }
}

代码示例来源:origin: RipMeApp/ripme

Document doc = Http.url(url).get();
  Elements metaTags = doc.getElementsByTag("meta");
    if (metaTag.attr("property").equals("og:image")) {
      imgsrc = metaTag.attr("content");
      LOGGER.info("Found URL " + imgsrc);
      break;//only one (useful) image possible for an "image page".
    LOGGER.warn("Image not found at " + this.url);
    return;
  addURLToDownload(new URL(imgsrc), prefix);
} catch (IOException e) {
  LOGGER.error("[!] Exception while loading/parsing " + this.url, e);

代码示例来源:origin: RipMeApp/ripme

/**
 * Fetches this ripper's page, reads the video address from the
 * {@code twitter:player:stream} meta tag and hands it to {@code addURLToDownload},
 * then calls {@code waitForThreads()}.
 *
 * @throws IOException if the page has no {@code twitter:player:stream} meta tag; also
 *         propagated from fetching the page or building the video URL
 */
@Override
  public void rip() throws IOException {
    LOGGER.info("    Retrieving " + this.url.toExternalForm());
    Elements streamTags = Http.url(this.url).get().select("meta[name=twitter:player:stream]");
    if (streamTags.isEmpty()) {
      throw new IOException("Could not find twitter:player:stream at " + url);
    }
    String rawContent = streamTags.first().attr("content");
    // Undo HTML escaping of ampersands in the attribute value.
    String vidUrl = rawContent.replaceAll("&amp;", "&");
    URL videoLocation = new URL(vidUrl);
    String prefix = HOST + "_" + getGID(this.url);
    addURLToDownload(videoLocation, prefix);
    waitForThreads();
  }
}

代码示例来源:origin: RipMeApp/ripme

/**
 * Follows the page's {@code iframe} source and extracts the {@code src} of the
 * {@code source} element found there.
 *
 * @param doc page containing the iframe
 * @return a list with the video address (spaces replaced by {@code %20}), or an empty
 *         list if the iframe address is invalid or the embedded page cannot be loaded
 */
@Override
public List<String> getURLsFromPage(Document doc) {
  String duckMoviesUrl = doc.select("iframe").attr("src");
  List<String> found = new ArrayList<>();
  try {
    URL iframeLocation = new URL(duckMoviesUrl);
    Document embeddedPage = Http.url(iframeLocation).get();
    String rawSource = embeddedPage.select("source").attr("src");
    // Encode spaces so the download request is not rejected with a 400 error.
    found.add(rawSource.replaceAll(" ", "%20"));
  } catch (MalformedURLException e) {
    LOGGER.error(duckMoviesUrl + " is not a valid url");
  } catch (IOException e) {
    LOGGER.error("Unable to load page " + duckMoviesUrl);
    e.printStackTrace();
  }
  return found;
}

代码示例来源:origin: RipMeApp/ripme

/**
 * Loads this ripper's page (sending the page URL as referrer), locates the first
 * {@code #photoImageSection img} element and passes its resolved {@code src} to
 * {@code addURLToDownload}.
 *
 * <p>Failures are logged rather than thrown: an {@link IOException} while loading or
 * parsing, or a page without a matching image element.
 */
private void fetchImage() {
    try {
      Document doc = Http.url(this.url)
                .referrer(this.url)
                .get();
      // Find image
      Element image = doc.select("#photoImageSection img").first();
      if (image == null) {
        // first() yields null when the selector matched nothing; stop here instead of
        // failing with a NullPointerException on the attribute lookup.
        LOGGER.error("[!] Could not find image at " + this.url);
        return;
      }
      String imgsrc = image.attr("src");
      LOGGER.info("Found URL " + imgsrc + " via " + image);
      // Provide prefix and let the AbstractRipper "guess" the filename
      String prefix = "";
      if (Utils.getConfigBoolean("download.save_order", true)) {
        prefix = String.format("%03d_", index);
      }
      // Resolve against the page URL so relative image paths are handled.
      URL imgurl = new URL(url, imgsrc);
      addURLToDownload(imgurl, prefix);
    } catch (IOException e) {
      LOGGER.error("[!] Exception while loading/parsing " + this.url, e);
    }
  }
}

代码示例来源:origin: RipMeApp/ripme

/**
 * Hands every downloadable file found on the page to {@code downloadFile}.
 *
 * <p>For a video URL the link inside {@code div.player-container} is used. Otherwise each
 * item link is opened and the image inside {@code div.picture_container} is used; item
 * pages that fail to load are logged and skipped.
 *
 * @param doc the page to scan
 * @return a list that this method never adds to, so it is always empty
 */
@Override
public List<String> getURLsFromPage(Document doc) {
  LOGGER.debug("Checking for urls");
  List<String> result = new ArrayList<>();
  if (isVideoUrl(url)) {
    String imgUrl = doc.select("div.player-container > a").attr("href");
    downloadFile(imgUrl);
    return result;
  }
  for (Element item : doc.select("div.items > div.item-container > a.item")) {
    String detailPageUrl = item.attr("href");
    try {
      Document detailPage = Http.url(new URL(detailPageUrl)).get();
      String image = detailPage.select("div.picture_container > a > img").attr("src");
      downloadFile(image);
    } catch (IOException e) {
      LOGGER.error("Was unable to load page " + detailPageUrl);
    }
  }
  return result;
}

代码示例来源:origin: RipMeApp/ripme

/**
 * Finds the gallery that an image page belongs to.
 *
 * @param url address of the image page
 * @return the gallery URL built from the first link whose {@code href} contains
 *         {@code gallery.php}
 * @throws IOException if no such link exists; also propagated from fetching the page or
 *         building the gallery URL
 */
private URL getGalleryFromImage(URL url) throws IOException {
  Document imagePage = Http.url(url).get();
  for (Element anchor : imagePage.select("a[href~=^gallery\\.php.*$]")) {
    LOGGER.info("LINK: " + anchor.toString());
    boolean pointsToGallery = anchor.hasAttr("href")
        && anchor.attr("href").contains("gallery.php");
    if (!pointsToGallery) {
      continue;
    }
    URL galleryUrl = new URL("http://imagearn.com/" + anchor.attr("href"));
    LOGGER.info("[!] Found gallery from given link: " + galleryUrl);
    return galleryUrl;
  }
  throw new IOException("Failed to find gallery at URL " + url);
}

代码示例来源:origin: RipMeApp/ripme

/**
 * Fetches this ripper's page, searches the data of every {@code script} element for a
 * {@code "source":"..."} entry and passes each hit to {@code addURLToDownload}, then
 * calls {@code waitForThreads()}.
 *
 * @throws IOException if the page contains no {@code script} element; also propagated
 *         from fetching the page or building a video URL
 */
@Override
  public void rip() throws IOException {
    LOGGER.info("Retrieving " + this.url);
    Document page = Http.url(url).get();

    // The page title serves as the user-friendly part of the file name.
    String pageTitle = page.title();

    Elements scriptTags = page.select("script");
    if (scriptTags.isEmpty()) {
      throw new IOException("Could not find script code at " + url);
    }
    // Only the first match per script is taken; the highest quality source is assumed
    // to be listed first.
    Pattern sourcePattern = Pattern.compile("\"source\":\"(.*?)\"");

    for (Element scriptTag : scriptTags) {
      Matcher sourceMatch = sourcePattern.matcher(scriptTag.data());
      if (!sourceMatch.find()) {
        continue;
      }
      URL videoLocation = new URL(sourceMatch.group(1));
      addURLToDownload(videoLocation, HOST + "_" + pageTitle);
    }
    waitForThreads();
  }
}

代码示例来源:origin: RipMeApp/ripme

/**
 * Fetches this ripper's page, reads the {@code src} of the
 * {@code .wp-video > video > source} match and passes it to {@code addURLToDownload},
 * then calls {@code waitForThreads()}.
 *
 * @throws IOException if the selector matches nothing; also propagated from fetching the
 *         page or building the video URL
 */
@Override
  public void rip() throws IOException {
    LOGGER.info("Retrieving " + this.url);
    Elements sources = Http.url(url).get().select(".wp-video > video > source");
    if (sources.isEmpty()) {
      throw new IOException("Could not find Embed code at " + url);
    }
    URL videoLocation = new URL(sources.attr("src"));
    String prefix = HOST + "_" + getGID(this.url);
    addURLToDownload(videoLocation, prefix);
    waitForThreads();
  }
}

代码示例来源:origin: RipMeApp/ripme

/**
 * Fetches this ripper's page, takes the first value {@code Utils.between} returns for
 * the delimiters {@code file:"} and {@code "} in the page HTML, and passes it to
 * {@code addURLToDownload}, then calls {@code waitForThreads()}.
 *
 * @throws IOException if no such value is found; also propagated from fetching the page
 *         or building the video URL
 */
@Override
  public void rip() throws IOException {
    LOGGER.info("Retrieving " + this.url);
    String pageHtml = Http.url(url).get().html();
    List<String> fileEntries = Utils.between(pageHtml, "file:\"", "\"");
    if (fileEntries.isEmpty()) {
      throw new IOException("Could not find files at " + url);
    }
    URL videoLocation = new URL(fileEntries.get(0));
    String prefix = HOST + "_" + getGID(this.url);
    addURLToDownload(videoLocation, prefix);
    waitForThreads();
  }
}

代码示例来源:origin: RipMeApp/ripme

/**
 * Collects the media URLs found on an album page.
 *
 * <p>Images are taken from {@code img} elements with class {@code album-image}; videos
 * from the first {@code source} child of {@code video} elements with class
 * {@code album-video}. A video element without any {@code source} child is skipped.
 *
 * @param url address of the album page
 * @return image URLs followed by video URLs, each group in document order
 * @throws IOException propagated from fetching or parsing the page, or from building a
 *         URL out of a {@code src} value
 */
public static List<URL> getURLs(URL url) throws IOException{

    Response resp = Http.url(url)
              .ignoreContentType()
              .response();

    Document doc = resp.parse();

    List<URL> URLs = new ArrayList<>();
    //Pictures
    for (Element img : doc.getElementsByTag("img")) {
      if (img.hasClass("album-image")) {
        URLs.add(new URL(img.attr("src")));
      }
    }
    //Videos
    for (Element vid : doc.getElementsByTag("video")) {
      if (!vid.hasClass("album-video")) {
        continue;
      }
      Element source = vid.getElementsByTag("source").first();
      if (source == null) {
        // first() yields null for a <video> without a <source>; skip it rather than
        // failing the whole album with a NullPointerException.
        continue;
      }
      URLs.add(new URL(source.attr("src")));
    }

    return URLs;
  }
}

代码示例来源:origin: loklak/loklak_server

/**
 * Article API: stores "query", "data" and "NLP" entries for the given address in
 * {@code genericScraperData}.
 *
 * @param url address of the article; converted with {@code new URL(url)}
 * @param genericScraperData object that receives the entries
 * @return the same {@code genericScraperData} instance that was passed in
 * @throws MalformedURLException if {@code url} cannot be converted to a {@link URL}
 */
public JSONObject articleAPI (String url, JSONObject genericScraperData) throws MalformedURLException{
  URL qurl = new URL(url);
  String data = "";
  try {
    // NOTE(review): the extractor call is commented out, so "data" is always null here
    // while "NLP" is still set to "true" — confirm whether that is intended.
    data = null;// ArticleExtractor.INSTANCE.getText(qurl);
    genericScraperData.put("query", qurl);
    genericScraperData.put("data", data);
    genericScraperData.put("NLP", "true");
  }
  catch (Exception e) {
    // NOTE(review): data was set to null as the first statement of the try block, so
    // this check is false whenever the handler runs and the Jsoup fallback below looks
    // unreachable — confirm and fix together with the disabled extractor.
    if ("".equals(data)) {
      try {
        // Fallback: use the plain text of the fetched page, flagged as "NLP": "false".
        Document htmlPage = Jsoup.connect(url).get();
        data = htmlPage.text();
        genericScraperData.put("query", qurl);
        genericScraperData.put("data", data);
        genericScraperData.put("NLP", "false");
      } catch (Exception ex) {} // NOTE(review): fallback failures are silently ignored
    }
  }
  return genericScraperData;
}

代码示例来源:origin: RipMeApp/ripme

/**
 * Builds the album title for the given URL.
 *
 * <p>For profile URLs the part after {@code /u/} is returned. Otherwise the title is
 * composed from the host, the GID and the last path segment of the page's
 * {@code og:title} meta tag. If the first page cannot be loaded or has no such tag, the
 * superclass naming is used.
 *
 * @param url album or profile URL
 * @return the album title
 * @throws MalformedURLException propagated from the calls made while building the title
 */
@Override
public String getAlbumTitle(URL url) throws MalformedURLException {
  if (!is_profile(url)) {
    try {
      // Attempt to use album title as GID
      Element titleElement = getFirstPage().select("meta[property=og:title]").first();
      if (titleElement != null) {
        String title = titleElement.attr("content");
        title = title.substring(title.lastIndexOf('/') + 1);
        return getHost() + "_" + getGID(url) + "_" + title.trim();
      }
      // first() yields null when the tag is absent; fall back to the default naming
      // instead of failing with a NullPointerException.
      LOGGER.info("Unable to find title at " + url);
    } catch (IOException e) {
      // Fall back to default album naming convention
      LOGGER.info("Unable to find title at " + url);
    }
    return super.getAlbumTitle(url);
  }
  return url.toExternalForm().split("/u/")[1];
}

代码示例来源:origin: RipMeApp/ripme

/**
 * Resolves an image page to the address of the image itself, using the page's
 * {@code og:image} meta tag.
 *
 * @param url address of the image page
 * @return the {@code content} of the first {@code og:image} meta tag with every
 *         {@code ?h=<digits>} sequence removed, or an empty string if no such tag exists
 * @throws IOException propagated from fetching the page
 */
private String vscoImageToURL(String url) throws IOException{
  Document page = Jsoup.connect(url).userAgent(USER_AGENT).get();
  String result = "";
  // Only meta tags are inspected; the first one carrying property "og:image" wins.
  for (Element metaTag : page.getElementsByTag("meta")) {
    if (!metaTag.attr("property").equals("og:image")) {
      continue;
    }
    // Drop the "?h=xxx" part of the address (each x being a digit).
    result = metaTag.attr("content").replaceAll("\\?h=[0-9]+", "");
    LOGGER.debug("Found image URL: " + result);
    break; // there should only be one image to download
  }

  // An empty result means the site layout changed and this code needs fixing.
  if (result.isEmpty()) {
    LOGGER.error("Could not find image URL at: " + url);
  }

  return result;
}

相关文章