org.jsoup.nodes.Document类的使用及代码示例

x33g5p2x 于2022-01-18 转载在其他

字(11.7k)|赞(0)|评价(0)|浏览(539)

本文整理了Java中org.jsoup.nodes.Document类的一些代码示例，展示了Document类的具体用法。这些代码示例主要来源于Github/Stackoverflow/Maven等平台，是从一些精选项目中提取出来的代码，具有较强的参考意义，能在一定程度上帮助到你。Document类的具体详情如下：
包路径：org.jsoup.nodes.Document
类名称：Document

Document介绍

[英]An HTML Document.
[中]HTML文档。

代码示例

代码示例来源：origin: RipMeApp/ripme

/**
 * Resolves the "next page" link of the given page and loads the document it points to.
 *
 * @param page the page currently being processed
 * @return the document obtained through the Http helper for the resolved link
 * @throws IOException with message "No more pages" when {@code li.page_next > a} matches
 *     nothing; may also propagate from building or loading the next URL
 */
@Override
public Document getNextPage(Document page) throws IOException {
  Elements nextAnchors = page.select("li.page_next > a");
  if (!nextAnchors.isEmpty()) {
    // The href is resolved against this ripper's own URL, so relative links work.
    String href = nextAnchors.first().attr("href");
    return Http.url(new URL(this.url, href)).get();
  }
  throw new IOException("No more pages");
}

代码示例来源：origin: JpressProjects/jpress

/**
 * Turns root-relative image sources in an HTML fragment into absolute ones, so that clients
 * receiving article data through the API can display the images directly.
 *
 * @param html the HTML fragment to process
 * @param domain prefix prepended to every {@code src} value that starts with "/"; when blank,
 *     {@code html} is returned untouched
 * @return the serialized child elements of the parsed body, with image sources rewritten
 */
public static String makeImageSrcToAbsolutePath(String html, String domain) {
  if (StrUtils.isBlank(domain)) {
    return html;
  }

  Document parsed = Jsoup.parse(html);
  // select() yields an empty collection when nothing matches, so the loop simply does not run.
  for (Element image : parsed.select("img")) {
    String source = image.attr("src");
    if (StrUtils.isNotBlank(source) && source.startsWith("/")) {
      image.attr("src", domain + source);
    }
  }
  return parsed.body().children().toString();
}

代码示例来源：origin: org.jsoup/jsoup

// Merges all elements with the given tag into the first one and makes sure that first element
// is a child of htmlEl.
//   tag    - tag name passed to getElementsByTag
//   htmlEl - element that must end up as the parent of the merged element
private void normaliseStructure(String tag, Element htmlEl) {
  Elements elements = this.getElementsByTag(tag);
  Element master = elements.first(); // expected to exist: per the original note, it is created beforehand if missing
  if (elements.size() > 1) { // duplicates found: move their contents to master
    // Collect every duplicate's children first, removing each duplicate as we go ...
    List<Node> toMove = new ArrayList<>();
    for (int i = 1; i < elements.size(); i++) {
      Node dupe = elements.get(i);
      toMove.addAll(dupe.ensureChildNodes());
      dupe.remove();
    }
    // ... then append the collected nodes to master in the order they were gathered.
    for (Node dupe : toMove)
      master.appendChild(dupe);
  }
  // ensure parented by <html>
  if (!master.parent().equals(htmlEl)) {
    htmlEl.appendChild(master); // per the original note, appending includes the remove() from the old parent
  }
}

代码示例来源：origin: square/retrofit

/**
 * Parses the response body as HTML and builds a {@code Page} from the document title and the
 * {@code href} value of every anchor that carries one.
 *
 * @param responseBody body whose string content is parsed
 * @return a page holding the title and an unmodifiable list of link targets
 * @throws IOException declared by the signature; presumably raised while reading the body
 */
@Override public Page convert(ResponseBody responseBody) throws IOException {
  String html = responseBody.string();
  Document parsed = Jsoup.parse(html);

  List<String> hrefs = new ArrayList<>();
  for (Element anchor : parsed.select("a[href]")) {
    hrefs.add(anchor.attr("href"));
  }

  String title = parsed.title();
  return new Page(title, Collections.unmodifiableList(hrefs));
}
}

代码示例来源：origin: org.jsoup/jsoup

/**
 Builds a fresh document shell sharing the dirty document's base URI and copies the safe nodes
 of the dirty document's <code>body</code> into the shell's body via <code>copySafeNodes</code>.
 When the dirty document has no body (e.g. a frameset document) the shell is returned with its
 body left as created.
 @param dirtyDocument Untrusted base document to clean; must not be null.
 @return the newly created, cleaned document.
 */
public Document clean(Document dirtyDocument) {
  Validate.notNull(dirtyDocument);

  Document cleaned = Document.createShell(dirtyDocument.baseUri());
  if (dirtyDocument.body() == null) {
    // frameset documents won't have a body; nothing to copy
    return cleaned;
  }
  copySafeNodes(dirtyDocument.body(), cleaned.body());
  return cleaned;
}

代码示例来源：origin: RipMeApp/ripme

/**
 * Collects the {@code src} attribute of every image directly inside a {@code .vung-doc}
 * element of the given chapter document.
 *
 * @param doc chapter page to scan
 * @return the image sources in document order; empty when nothing matches
 */
private List<String> getURLsFromChap(Document doc) {
  LOGGER.debug("Getting urls from " + doc.location());
  List<Element> chapterImages = doc.select(".vung-doc > img");
  List<String> sources = new ArrayList<>();
  for (int i = 0; i < chapterImages.size(); i++) {
    sources.add(chapterImages.get(i).attr("src"));
  }
  return sources;
}

代码示例来源：origin: RipMeApp/ripme

/**
 * Requests one page of API results and parses the response text as JSON.
 *
 * @param page page value handed to {@code apiURLBuilder}
 * @param apiKey API key handed to {@code apiURLBuilder}
 * @return the parsed response, or {@code null} when the API URL is malformed or the request
 *     fails with an {@link IOException}
 */
private JSONObject getJSON(String page, String apiKey) {
  String apiURL = null;
  URL pageURL;
  try {
    apiURL = apiURLBuilder(getPhotosetID(url.toExternalForm()), page, apiKey);
    pageURL = new URL(apiURL);
  } catch (MalformedURLException e) {
    LOGGER.error("Unable to get api link " + apiURL + " is malformed");
    // Without a usable URL there is nothing to request; report failure like the I/O path does.
    return null;
  }
  try {
    // Fetch once and reuse the text for both logging and parsing (previously two requests).
    String body = Http.url(pageURL).ignoreContentType().get().text();
    LOGGER.info(body);
    return new JSONObject(body);
  } catch (IOException e) {
    // This is a transfer failure, not a malformed link; keep the cause in the log.
    LOGGER.error("Unable to load api link " + apiURL, e);
    return null;
  }
}

代码示例来源：origin: RipMeApp/ripme

Document doc = Http.url(url).get();
  Elements metaTags = doc.getElementsByTag("meta");
    if (metaTag.attr("property").equals("og:image")) {
      imgsrc = metaTag.attr("content");
      LOGGER.info("Found URL " + imgsrc);
      break;//only one (useful) image possible for an "image page".
    LOGGER.warn("Image not found at " + this.url);
    return;
  addURLToDownload(new URL(imgsrc), prefix);
} catch (IOException e) {
  LOGGER.error("[!] Exception while loading/parsing " + this.url, e);

代码示例来源：origin: RipMeApp/ripme

/**
 * Loads the page at {@code this.url}, reads the first {@code twitter:player:stream} meta tag
 * and queues the video it references for download.
 *
 * @throws IOException when no such meta tag exists; may also propagate from loading the page
 *     or from building the video URL
 */
@Override
public void rip() throws IOException {
  LOGGER.info("    Retrieving " + this.url.toExternalForm());
  Document page = Http.url(this.url).get();

  Elements streamTags = page.select("meta[name=twitter:player:stream]");
  if (streamTags.isEmpty()) {
    throw new IOException("Could not find twitter:player:stream at " + url);
  }

  // Turn any literal "&amp;" left in the attribute value back into "&".
  String vidUrl = streamTags.first().attr("content").replaceAll("&amp;", "&");
  URL video = new URL(vidUrl);
  String prefix = HOST + "_" + getGID(this.url);
  addURLToDownload(video, prefix);
  waitForThreads();
}
}

代码示例来源：origin: RipMeApp/ripme

/**
 * Follows the first iframe of the page and extracts the video source found inside it.
 *
 * @param doc page containing the iframe
 * @return a list with the video URL (spaces encoded as {@code %20}), or an empty list when the
 *     iframe address is not a valid URL or the embedded page cannot be loaded
 */
@Override
public List<String> getURLsFromPage(Document doc) {
  List<String> results = new ArrayList<>();
  String duckMoviesUrl = doc.select("iframe").attr("src");
  try {
    Document duckDoc = Http.url(new URL(duckMoviesUrl)).get();
    String videoURL = duckDoc.select("source").attr("src");
    // remove any white spaces so we can download the movie without a 400 error
    videoURL = videoURL.replace(" ", "%20");
    results.add(videoURL);
  } catch (MalformedURLException e) {
    LOGGER.error(duckMoviesUrl + " is not a valid url");
  } catch (IOException e) {
    // Route the stack trace through the logger instead of printing it to stderr.
    LOGGER.error("Unable to load page " + duckMoviesUrl, e);
  }
  return results;
}

代码示例来源：origin: RipMeApp/ripme

/**
 * Loads the page at {@code this.url}, takes the first image inside {@code #photoImageSection}
 * and queues it for download. Logs a warning and returns when the page has no such image;
 * I/O failures are logged rather than thrown.
 */
private void fetchImage() {
  try {
    Document doc = Http.url(this.url)
        .referrer(this.url)
        .get();
    // Find image
    Element image = doc.select("#photoImageSection img").first();
    if (image == null) {
      // first() yields null on an empty selection; bail out instead of hitting an NPE.
      LOGGER.warn("Image not found at " + this.url);
      return;
    }
    String imgsrc = image.attr("src");
    LOGGER.info("Found URL " + imgsrc + " via " + image);
    // Provide prefix and let the AbstractRipper "guess" the filename
    String prefix = "";
    if (Utils.getConfigBoolean("download.save_order", true)) {
      prefix = String.format("%03d_", index);
    }
    // Resolve against the page URL so relative image sources work.
    URL imgurl = new URL(url, imgsrc);
    addURLToDownload(imgurl, prefix);
  } catch (IOException e) {
    LOGGER.error("[!] Exception while loading/parsing " + this.url, e);
  }
}
}

代码示例来源：origin: RipMeApp/ripme

/**
 * Hands the media referenced by the page to {@code downloadFile}. For a video URL the player
 * link is used; otherwise every item link is opened and the picture found there is used.
 *
 * @param doc the listing or player page
 * @return the result list, which this method never adds to; the work is done by
 *     {@code downloadFile}
 */
@Override
public List<String> getURLsFromPage(Document doc) {
  LOGGER.debug("Checking for urls");
  List<String> result = new ArrayList<>();

  if (isVideoUrl(url)) {
    String imgUrl = doc.select("div.player-container > a").attr("href");
    downloadFile(imgUrl);
    return result;
  }

  for (Element item : doc.select("div.items > div.item-container > a.item")) {
    String itemPageUrl = item.attr("href");
    try {
      Document itemPage = Http.url(new URL(itemPageUrl)).get();
      String image = itemPage.select("div.picture_container > a > img").attr("src");
      downloadFile(image);
    } catch (IOException e) {
      LOGGER.error("Was unable to load page " + itemPageUrl);
    }
  }
  return result;
}

代码示例来源：origin: RipMeApp/ripme

/**
 * Loads an image page and returns the gallery URL built from the first {@code gallery.php}
 * link found on it.
 *
 * @param url address of the image page
 * @return {@code http://imagearn.com/} followed by the link's {@code href}
 * @throws IOException when the page contains no gallery link; may also propagate from loading
 *     the page or building the gallery URL
 */
private URL getGalleryFromImage(URL url) throws IOException {
  Document imagePage = Http.url(url).get();
  for (Element link : imagePage.select("a[href~=^gallery\\.php.*$]")) {
    LOGGER.info("LINK: " + link.toString());
    if (!link.hasAttr("href") || !link.attr("href").contains("gallery.php")) {
      continue;
    }
    URL galleryUrl = new URL("http://imagearn.com/" + link.attr("href"));
    LOGGER.info("[!] Found gallery from given link: " + galleryUrl);
    return galleryUrl;
  }
  throw new IOException("Failed to find gallery at URL " + url);
}

代码示例来源：origin: RipMeApp/ripme

/**
 * Loads the page, looks in each script element for a {@code "source":"..."} entry and queues
 * the first match of every script for download, named after the page title.
 *
 * @throws IOException when the page has no script elements; may also propagate from loading
 *     the page or building a video URL
 */
@Override
public void rip() throws IOException {
  LOGGER.info("Retrieving " + this.url);
  Document page = Http.url(url).get();

  // The page title doubles as a user friendly file name.
  String title = page.title();

  Elements scripts = page.select("script");
  if (scripts.isEmpty()) {
    throw new IOException("Could not find script code at " + url);
  }

  // Only the first match per script is used: the highest quality source is assumed to be
  // listed first.
  Pattern sourcePattern = Pattern.compile("\"source\":\"(.*?)\"");
  for (Element scriptTag : scripts) {
    Matcher sourceMatcher = sourcePattern.matcher(scriptTag.data());
    if (!sourceMatcher.find()) {
      continue;
    }
    URL video = new URL(sourceMatcher.group(1));
    addURLToDownload(video, HOST + "_" + title);
  }
  waitForThreads();
}
}

代码示例来源：origin: RipMeApp/ripme

/**
 * Loads the page, reads the {@code src} of the {@code .wp-video > video > source} selection
 * and queues that video for download.
 *
 * @throws IOException when the selection is empty; may also propagate from loading the page or
 *     building the video URL
 */
@Override
public void rip() throws IOException {
  LOGGER.info("Retrieving " + this.url);
  Document page = Http.url(url).get();

  Elements sources = page.select(".wp-video > video > source");
  if (sources.isEmpty()) {
    throw new IOException("Could not find Embed code at " + url);
  }

  URL video = new URL(sources.attr("src"));
  String prefix = HOST + "_" + getGID(this.url);
  addURLToDownload(video, prefix);
  waitForThreads();
}
}

代码示例来源：origin: RipMeApp/ripme

/**
 * Loads the page, extracts the text between {@code file:"} and the following quote from its
 * HTML and queues the first such value for download.
 *
 * @throws IOException when no such value is found; may also propagate from loading the page or
 *     building the video URL
 */
@Override
public void rip() throws IOException {
  LOGGER.info("Retrieving " + this.url);
  String html = Http.url(url).get().html();

  List<String> mp4s = Utils.between(html, "file:\"", "\"");
  if (mp4s.isEmpty()) {
    throw new IOException("Could not find files at " + url);
  }

  URL video = new URL(mp4s.get(0));
  String prefix = HOST + "_" + getGID(this.url);
  addURLToDownload(video, prefix);
  waitForThreads();
}
}

代码示例来源：origin: RipMeApp/ripme

/**
 * Collects the media URLs of an album page: the {@code src} of every {@code img} with class
 * {@code album-image}, followed by the {@code src} of the first {@code source} inside every
 * {@code video} with class {@code album-video}.
 *
 * @param url address of the album page
 * @return pictures first, then videos, each group in document order
 * @throws IOException may propagate from the request, from parsing the response, or from a
 *     media address that is not a valid URL
 */
public static List<URL> getURLs(URL url) throws IOException {
  Response resp = Http.url(url)
      .ignoreContentType()
      .response();
  Document doc = resp.parse();

  List<URL> urls = new ArrayList<>();

  // Pictures
  for (Element img : doc.getElementsByTag("img")) {
    if (img.hasClass("album-image")) {
      urls.add(new URL(img.attr("src")));
    }
  }

  // Videos
  for (Element vid : doc.getElementsByTag("video")) {
    if (!vid.hasClass("album-video")) {
      continue;
    }
    Element source = vid.getElementsByTag("source").first();
    if (source == null) {
      // A video without a <source> child has nothing to download; skip it rather than
      // failing the whole album with a NullPointerException.
      continue;
    }
    urls.add(new URL(source.attr("src")));
  }

  return urls;
}
}

代码示例来源：origin: loklak/loklak_server

/**
 * Article API: stores the query URL, the extracted article text and an "NLP" flag in the
 * given JSON object.
 *
 * @param url address of the article; parsed with {@code new URL(url)}
 * @param genericScraperData JSON object that receives the "query", "data" and "NLP" entries
 * @return the same {@code genericScraperData} instance that was passed in
 * @throws MalformedURLException if {@code url} cannot be parsed as a URL
 */
public JSONObject articleAPI (String url, JSONObject genericScraperData) throws MalformedURLException{
  URL qurl = new URL(url);
  String data = "";
  try {
    // NOTE(review): the extractor call is commented out, so data is always null from here on
    // and "data" is stored as null while "NLP" is still set to "true" — confirm this is intended.
    data = null;// ArticleExtractor.INSTANCE.getText(qurl);
    genericScraperData.put("query", qurl);
    genericScraperData.put("data", data);
    genericScraperData.put("NLP", "true");
  }
  catch (Exception e) {
    // NOTE(review): data is set to null before any statement in the try block can throw, so
    // this check is false whenever we get here and the Jsoup fallback below looks unreachable
    // — confirm against the intended behaviour.
    if ("".equals(data)) {
      try {
        // Fallback: use the plain text of the fetched page and mark the result as non-NLP.
        Document htmlPage = Jsoup.connect(url).get();
        data = htmlPage.text();
        genericScraperData.put("query", qurl);
        genericScraperData.put("data", data);
        genericScraperData.put("NLP", "false");
      } catch (Exception ex) {} // NOTE(review): failures of the fallback are silently ignored
    }
  }
  return genericScraperData;
}

代码示例来源：origin: RipMeApp/ripme

/**
 * Builds the album title for the given URL.
 *
 * <p>For profile URLs the part after {@code /u/} is returned. Otherwise the last path segment
 * of the first page's {@code og:title} meta content is appended to host and GID; when that
 * meta tag is missing or the first page cannot be loaded, the default title from the
 * superclass is used.
 *
 * @param url album or profile URL
 * @return the album title
 * @throws MalformedURLException declared by the signature; presumably raised by
 *     {@code getGID} or the superclass implementation
 */
@Override
public String getAlbumTitle(URL url) throws MalformedURLException {
  if (is_profile(url)) {
    return url.toExternalForm().split("/u/")[1];
  }
  try {
    // Attempt to use album title as GID
    Element titleElement = getFirstPage().select("meta[property=og:title]").first();
    if (titleElement != null) {
      String title = titleElement.attr("content");
      title = title.substring(title.lastIndexOf('/') + 1);
      return getHost() + "_" + getGID(url) + "_" + title.trim();
    }
    // first() yields null when the page has no og:title tag; use the default naming
    // instead of failing with a NullPointerException.
    LOGGER.info("Unable to find title at " + url);
  } catch (IOException e) {
    // Fall back to default album naming convention
    LOGGER.info("Unable to find title at " + url);
  }
  return super.getAlbumTitle(url);
}

代码示例来源：origin: RipMeApp/ripme

/**
 * Fetches an image page and returns the address from its {@code og:image} meta tag with any
 * {@code ?h=<digits>} part removed.
 *
 * @param url address of the image page
 * @return the cleaned image URL, or an empty string (after logging an error) when the page
 *     has no {@code og:image} meta tag
 * @throws IOException may propagate from fetching the page
 */
private String vscoImageToURL(String url) throws IOException{
  Document page = Jsoup.connect(url).userAgent(USER_AGENT).get();

  String result = "";
  for (Element metaTag : page.getElementsByTag("meta")) {
    if (!"og:image".equals(metaTag.attr("property"))) {
      continue;
    }
    // Strip the "?h=xxx" part of the URL (each x being a digit).
    result = metaTag.attr("content").replaceAll("\\?h=[0-9]+", "");
    LOGGER.debug("Found image URL: " + result);
    // Only one image is expected per page, so stop at the first hit.
    break;
  }

  // An empty result means the website changed and this code needs fixing.
  if (result.isEmpty()) {
    LOGGER.error("Could not find image URL at: " + url);
  }
  return result;
}

内容来源于网络，如有侵权，请联系作者删除！

相关文章

热门标签

Java query python Node 开发语言 request Util 数据库 Table 后端算法 Logger Message Element Parser

最新文章

高级程序员和新手小白程序员区别你是那个等级看解决bug速度
浏览(479) 发布于 2个月前
还在用双层for循环吗？太慢了
浏览(455) 发布于 2个月前
我用EasyExcel优化了公司的导出（附踩坑记录）
浏览(548) 发布于 2个月前
记录因Sharding Jdbc批量操作引发的一次fullGC
浏览(392) 发布于 2个月前
进大厂必须要会的单元测试
浏览(405) 发布于 2个月前