org.jsoup.nodes.Document.getElementsByTag()方法的使用及代码示例

x33g5p2x  于2022-01-18 转载在 其他  
字(9.8k)|赞(0)|评价(0)|浏览(282)

本文整理了Java中org.jsoup.nodes.Document.getElementsByTag()方法的一些代码示例,展示了Document.getElementsByTag()的具体用法。这些代码示例主要来源于Github/Stackoverflow/Maven等平台,是从一些精选项目中提取出来的代码,具有较强的参考意义,能在一定程度上帮助到你。Document.getElementsByTag()方法的具体详情如下:
包路径:org.jsoup.nodes.Document
类名称:Document
方法名:getElementsByTag

Document.getElementsByTag介绍

查找具有指定标签名的元素(包括当前元素自身及其所有后代元素,递归查找),并以Elements集合的形式返回;标签名的匹配不区分大小写。

代码示例

代码示例来源:origin: RipMeApp/ripme

/**
 * Builds the list of download URLs for every {@code post} element found on the page.
 *
 * @param page parsed document to scan
 * @return one entry per {@code post} element: its absolute {@code file_url}, a {@code #},
 *         and the value of its {@code id} attribute
 */
@Override
public List<String> getURLsFromPage(Document page) {
  List<String> urls = new ArrayList<>(100);
  for (Element post : page.getElementsByTag("post")) {
    String fileUrl = post.absUrl("file_url");
    String postId = post.attr("id");
    // the post id travels along with the URL as its fragment
    urls.add(fileUrl + "#" + postId);
  }
  return urls;
}

代码示例来源:origin: org.jsoup/jsoup

/**
 Returns the text of the document's first {@code title} element with its whitespace normalised.
 @return the normalised and trimmed title text, or an empty string when there is no {@code title} element
 */
public String title() {
  Element titleEl = getElementsByTag("title").first();
  if (titleEl == null) {
    return "";
  }
  // title keeps its whitespace for document output, so it is normalised only for this return value
  String rawText = titleEl.text();
  return StringUtil.normaliseWhitespace(rawText).trim();
}

代码示例来源:origin: loklak/loklak_server

/**
 * Extracts HTML that is embedded as a string value inside the page's {@code script} elements.
 * <p>
 * Every backslash is removed from the input before it is parsed. For each script whose content
 * contains the marker {@code "html":"}, the text after the first occurrence of the marker is taken,
 * minus the last three characters of the script content, and the pieces are concatenated in
 * document order. A script in which fewer than three characters follow the marker contributes
 * nothing instead of raising a {@link StringIndexOutOfBoundsException}.
 *
 * @param raw_html raw page markup
 * @return the concatenated nested html, or an empty string when no script contains the marker
 */
private static String getNestedHtml(String raw_html){
  // strip all backslashes before parsing
  Document doc = Jsoup.parse(raw_html.replace("\\",""));
  //get the script tags
  Elements scripts = doc.getElementsByTag("script");
  //marker that precedes the embedded html value
  Pattern pttrn = Pattern.compile("\"html\":\"");
  StringBuilder nested_html = new StringBuilder();
  for (Element script : scripts) {
    String scriptHtml = script.html();
    Matcher m = pttrn.matcher(scriptHtml);
    if (!m.find()) {
      continue;
    }
    int start = m.end();
    // the last three characters are dropped - presumably the wrapper's closing characters; TODO confirm
    int end = scriptHtml.length() - 3;
    if (end >= start) {
      nested_html.append(scriptHtml, start, end);
    }
  }
  return nested_html.toString();
}

代码示例来源:origin: decaywood/XueQiuSuperSpider

/**
 * Cuts the text between {@code follows=} and {@code ;seajs.use} out of a script on the page
 * and parses it as JSON.
 * <p>
 * The text is read from the {@code data} attribute of the first data node of the script
 * element at index 15.
 *
 * @param content page markup
 * @return the parsed JSON tree
 * @throws IOException declared for the JSON parsing step
 */
private JsonNode parseHtmlToJsonNode(String content) throws IOException {
    final String startMarker = "follows=";
    final String endMarker = ";seajs.use";

    Document doc = Jsoup.parse(content);
    String scriptData = doc.getElementsByTag("script")
        .get(15)
        .dataNodes()
        .get(0)
        .attr("data");

    StringBuilder json = new StringBuilder(scriptData);
    // remove everything up to and including the start marker
    int startPos = json.indexOf(startMarker);
    json.delete(0, startPos + startMarker.length());
    // remove the end marker and everything after it
    int endPos = json.indexOf(endMarker);
    json.delete(endPos, json.length());

    return mapper.readTree(json.toString());
}
}

代码示例来源:origin: RipMeApp/ripme

/**
 * Loads the page that follows {@code doc}.
 * <p>
 * The {@code offset} and {@code count} attributes of the first {@code posts} element decide
 * whether another page exists; pages are addressed in steps of 100.
 *
 * @param doc the page that was just processed
 * @return the next page, or {@code null} when {@code offset + 100} exceeds {@code count}
 * @throws IOException if the next page cannot be fetched
 */
@Override
public Document getNextPage(Document doc) throws IOException {
  String offsetAttr = doc.getElementsByTag("posts").first().attr("offset");
  int offset = Integer.parseInt(offsetAttr);
  String countAttr = doc.getElementsByTag("posts").first().attr("count");
  int total = Integer.parseInt(countAttr);

  boolean hasMore = offset + 100 <= total;
  if (!hasMore) {
    return null;
  }
  int nextPageIndex = offset / 100 + 1;
  return Http.url(getPage(nextPageIndex)).get();
}

代码示例来源:origin: RipMeApp/ripme

/**
 * Fetches the page at {@code url} and returns the image URL from its {@code og:image} meta tag.
 * <p>
 * Any {@code ?h=<digits>} sequence is removed from the URL. Only the first matching
 * meta tag is used.
 *
 * @param url address of the page to inspect
 * @return the cleaned image URL, or an empty string (after logging an error) when none was found
 * @throws IOException if the page cannot be fetched
 */
private String vscoImageToURL(String url) throws IOException{
  Document page = Jsoup.connect(url).userAgent(USER_AGENT).get();

  String imageUrl = "";
  // only meta tags can carry the og:image property
  for (Element meta : page.getElementsByTag("meta")) {
    if (!meta.attr("property").equals("og:image")) {
      continue;
    }
    // strip the "?h=xxx" part (x being digits) from the URL
    String cleaned = meta.attr("content").replaceAll("\\?h=[0-9]+", "");
    imageUrl = cleaned;
    LOGGER.debug("Found image URL: " + cleaned);
    // only a single image is expected, so the first hit ends the search
    break;
  }

  // an empty result means the site layout changed and this code needs fixing
  if (imageUrl.isEmpty()){
    LOGGER.error("Could not find image URL at: " + url);
  }

  return imageUrl;
}

代码示例来源:origin: RipMeApp/ripme

/**
 * Loads a post page and returns the target of its "Download" link.
 * <p>
 * Waits one second before the request. The first {@code a} element whose text is exactly
 * {@code Download} is used; its {@code href} is prefixed with {@code https:}.
 *
 * @param url address of the post page
 * @return the image URL, or {@code null} when the page has no such link or could not be loaded
 */
private String getImageFromPost(String url) {
  sleep(1000);
  try {
    Document d = Http.url(url).cookies(cookies).get();
    for (Element link : d.getElementsByTag("a")) {
      if (link.text().equals("Download")) {
        String href = link.attr("href");
        LOGGER.info("Found image " + href);
        return "https:" + href;
      }
    }
  } catch (IOException e) {
    // still best-effort (null result), but the failure is now logged instead of dropped
    LOGGER.error("Failed to load post page " + url, e);
    return null;
  }
  return null;
}

代码示例来源:origin: iMeiji/Toutiao

// Parse the HTML string and collect every <script> element of the resulting document.
Document doc = Jsoup.parse(HTML);
Elements scripts = doc.getElementsByTag("script");
// Visit each script element (the loop body is not part of this excerpt).
for (Element e : scripts) {

代码示例来源:origin: loklak/loklak_server

/**
 * Matches the {@code instaJsonData} pattern against the script content of an html page
 * and wraps the matched section in a {@code Post}.
 * <p>
 * The section starts where group 1 of the pattern starts and ends one character past the
 * start of group 2.
 *
 * @param br reader supplying the html of the instagram page
 * @param url address of the page (not used by this method)
 * @return instaProfile as a JSONArray object containing one {@code Post} built from the
 *         matched section; the array is empty when the page cannot be read or the
 *         pattern does not match
 */
public JSONArray scrapeInstagram(BufferedReader br, String url) {
  JSONArray instaProfile = new JSONArray();
  Document htmlPage;
  try {
    htmlPage = Jsoup.parse(this.bufferedReaderToString(br));
  } catch (IOException e) {
    DAO.trace(e);
    // nothing was read, so return the empty result instead of dereferencing a null page
    return instaProfile;
  }
  String script = htmlPage.getElementsByTag("script").html();
  Matcher m = instaJsonData.matcher(script);
  if (!m.find()) {
    // without a match the group offsets below are undefined
    return instaProfile;
  }
  int start = m.start(1);
  int end = m.start(2) + 1;
  script = script.substring(start, end);
  //TODO: pre-process the posts captured. At present, complete array of posts are output.
  //Only useful data shall be outputted.
  Post instaObj = new Post(script, this.query);
  instaProfile.put(instaObj);
  return instaProfile;
}

代码示例来源:origin: RipMeApp/ripme

// Fetch the page at url and collect all of its <meta> elements.
Document doc = Http.url(url).get();
Elements metaTags = doc.getElementsByTag("meta");

代码示例来源:origin: org.jsoup/jsoup

/**
 Sets the text of the document's {@code title} element. The first existing {@code title} element is
 updated; when there is none, a new {@code title} is appended to {@code head}.
 @param title text to use as the title, validated as not null
 */
public void title(String title) {
  Validate.notNull(title);
  Element existing = getElementsByTag("title").first();
  if (existing != null) {
    existing.text(title);
    return;
  }
  // no title element yet: add one to head
  head().appendElement("title").text(title);
}

代码示例来源:origin: org.jsoup/jsoup

/**
 Folds every element named {@code tag} into the first such element and makes sure that element is
 parented by {@code htmlEl}.
 @param tag name of the tag to normalise
 @param htmlEl element that must end up as the parent of the surviving element
 */
private void normaliseStructure(String tag, Element htmlEl) {
  Elements matches = this.getElementsByTag(tag);
  // per the original note, the first match is expected to exist because it is created beforehand
  Element master = matches.first();

  // gather the children of every duplicate (index 1 onwards) and detach the duplicates
  List<Node> orphans = new ArrayList<>();
  for (int i = 1; i < matches.size(); i++) {
    Node duplicate = matches.get(i);
    orphans.addAll(duplicate.ensureChildNodes());
    duplicate.remove();
  }
  for (Node orphan : orphans) {
    master.appendChild(orphan);
  }

  // ensure parented by <html>
  boolean underHtml = master.parent().equals(htmlEl);
  if (!underHtml) {
    htmlEl.appendChild(master); // includes remove()
  }
}

代码示例来源:origin: RipMeApp/ripme

/**
 * Collects the picture and video URLs found on the page at {@code url}.
 * <p>
 * Pictures are {@code img} elements with the class {@code album-image}; videos are
 * {@code video} elements with the class {@code album-video}, whose URL is read from their
 * first {@code source} descendant. A video without any {@code source} element is skipped
 * rather than causing a {@link NullPointerException}.
 *
 * @param url address of the page to scan
 * @return the picture URLs followed by the video URLs, in document order
 * @throws IOException if the page cannot be fetched or parsed, or a {@code src} value is not a valid URL
 */
public static List<URL> getURLs(URL url) throws IOException{

    Response resp = Http.url(url)
              .ignoreContentType()
              .response();

    Document doc = resp.parse();

    List<URL> URLs = new ArrayList<>();
    //Pictures
    for (Element img : doc.getElementsByTag("img")) {
      if (img.hasClass("album-image")) {
        URLs.add(new URL(img.attr("src")));
      }
    }
    //Videos
    for (Element vid : doc.getElementsByTag("video")) {
      if (!vid.hasClass("album-video")) {
        continue;
      }
      Element source = vid.getElementsByTag("source").first();
      if (source == null) {
        // no <source> child, so there is no URL to read
        continue;
      }
      URLs.add(new URL(source.attr("src")));
    }

    return URLs;
}
}

代码示例来源:origin: RipMeApp/ripme

// NOTE(review): this excerpt skips lines between the statements shown, so its braces do not balance as printed.
// Accumulator for the URLs that are found.
List<String> URLs = new ArrayList<>();
// Pictures: img elements carrying the "album-image" class.
Elements imgs = doc.getElementsByTag("img");
for (Element img : imgs) {
  if (img.hasClass("album-image")) {
// Videos: video elements carrying the "album-video" class.
Elements vids = doc.getElementsByTag("video");
for (Element vid : vids) {
  if (vid.hasClass("album-video")) {
    e.printStackTrace();
  // The same class check is applied to the video elements of a second document, video_page.
  Elements profile_vids = video_page.getElementsByTag("video");
  for (Element vid : profile_vids) {
    if (vid.hasClass("album-video")) {

代码示例来源:origin: loklak/loklak_server

Elements t;
// Event id: value of the data-event-id attribute on the <body> element(s).
eventID = htmlPage.getElementsByTag("body").attr("data-event-id");
// Event name: text of the elements with class "listing-hero-body".
eventName = htmlPage.getElementsByClass("listing-hero-body").text();
// Event description: text of the elements matched by the CSS selector below.
eventDescription = htmlPage.select("div.js-xd-read-more-toggle-view.read-more__toggle-view").text();
// Image link: value of the content attribute on the <picture> element(s).
imageLink = htmlPage.getElementsByTag("picture").attr("content");

代码示例来源:origin: loklak/loklak_server

// Collect every <article> element of the blog page.
articles = blogHTML.getElementsByTag("article");
// A new Post is created for each article (the rest of the loop body is not part of this excerpt).
for (Element article : articles) {
  blogPost = new Post();

代码示例来源:origin: 4pr0n/ripme

/**
 * Turns each {@code post} element of the page into a download entry.
 *
 * @param page parsed document to scan
 * @return for every {@code post} element, its absolute {@code file_url} joined to its
 *         {@code id} attribute with a {@code #}
 */
@Override
public List<String> getURLsFromPage(Document page) {
  List<String> entries = new ArrayList<String>(100);
  for (Element post : page.getElementsByTag("post")) {
    String absoluteUrl = post.absUrl("file_url");
    String id = post.attr("id");
    entries.add(absoluteUrl + "#" + id);
  }
  return entries;
}

代码示例来源:origin: spring-projects/spring-roo

/**
  * Tells whether the whole document is user managed, i.e. whether its first {@code html}
  * element carries the attribute {@code data-z="user-managed"}.
  *
  * @param document the document to inspect
  * @return {@code true} only when an {@code html} element exists and its {@code data-z}
  *         attribute equals {@code user-managed}
  */
 @Override
 protected boolean isUserManagedDocument(Document document) {

  Elements htmlTags = document.getElementsByTag("html");

  // without any html element there is nothing that could be marked
  if (htmlTags == null || htmlTags.size() == 0) {
   return false;
  }

  Element root = htmlTags.get(0);
  return root != null && root.hasAttr("data-z") && root.attr("data-z").equals("user-managed");
 }
}

代码示例来源:origin: 4pr0n/ripme

/**
 * Returns the page after {@code doc}, based on the paging attributes of its first
 * {@code posts} element.
 *
 * @param doc the current page
 * @return the following page, or {@code null} once {@code offset + 100} is greater than {@code count}
 * @throws IOException if the following page cannot be fetched
 */
@Override
public Document getNextPage(Document doc) throws IOException {
  String offsetValue = doc.getElementsByTag("posts").first().attr("offset");
  int offset = Integer.parseInt(offsetValue);
  String countValue = doc.getElementsByTag("posts").first().attr("count");
  int count = Integer.parseInt(countValue);

  boolean morePages = offset + 100 <= count;
  if (!morePages) {
    return null;
  }
  // pages advance in steps of 100 posts
  int nextIndex = offset / 100 + 1;
  return Http.url(getPage(nextIndex)).get();
}

代码示例来源:origin: jbake-org/jbake

/**
 * Rewrites the {@code src} of every {@code img} element in the file's body.
 * <p>
 * Image paths are given relative to the assets folder. The site host is prefixed to every image
 * source except those that start with http:// or https://. A path that starts with "./", i.e. one
 * relative to the source file, first has that prefix replaced with the output file directory and
 * then gets the site host added. The per-image rewriting is done by {@code transformImageSource}.
 *
 * @param fileContents  Map representing file contents; its body entry is replaced with the rewritten html
 * @param configuration Configuration object supplying the site host and the prepend-host flag
 */
public static void fixImageSourceUrls(Map<String, Object> fileContents, JBakeConfiguration configuration) {
  String body = fileContents.get(Attributes.BODY).toString();
  boolean prependHost = configuration.getImgPathPrependHost();
  String host = configuration.getSiteHost();
  String documentUri = getDocumentUri(fileContents);

  Document parsed = Jsoup.parseBodyFragment(body);
  for (Element image : parsed.getElementsByTag("img")) {
    transformImageSource(image, documentUri, host, prependHost);
  }

  // body().html() is used so the <body></body> wrapper added by fragment parsing is not written back
  fileContents.put(Attributes.BODY, parsed.body().html());
}

相关文章