本文整理了Java中org.jsoup.nodes.Document.getElementsByTag()方法的一些代码示例,展示了Document.getElementsByTag()的具体用法。这些代码示例主要来源于GitHub/Stack Overflow/Maven等平台,是从一些精选项目中提取出来的代码,具有较强的参考意义,能在一定程度上帮助到你。Document.getElementsByTag()方法的具体详情如下:
包路径:org.jsoup.nodes.Document
类名称:Document
方法名:getElementsByTag
暂无
代码示例来源:origin: RipMeApp/ripme
/**
 * Builds one entry per {@code post} element found in the given page.
 *
 * @param page parsed document to scan for {@code post} elements
 * @return a list where each entry is the element's {@code file_url} (resolved via {@code absUrl}),
 *         followed by {@code #} and the element's {@code id} attribute
 */
@Override
public List<String> getURLsFromPage(Document page) {
    List<String> urls = new ArrayList<>(100);
    for (Element post : page.getElementsByTag("post")) {
        String fileUrl = post.absUrl("file_url");
        String postId = post.attr("id");
        urls.add(fileUrl + "#" + postId);
    }
    return urls;
}
代码示例来源:origin: org.jsoup/jsoup
/**
 Returns the text of the document's first {@code title} element, with whitespace normalised.
 @return the normalised and trimmed title text, or an empty string when no {@code title} element exists
 */
public String title() {
    // the title tag preserves whitespace for document output, so it is normalised explicitly here
    Element titleElement = getElementsByTag("title").first();
    if (titleElement == null) {
        return "";
    }
    String rawText = titleElement.text();
    return StringUtil.normaliseWhitespace(rawText).trim();
}
代码示例来源:origin: loklak/loklak_server
/**
 * Extracts HTML that is embedded inside the {@code script} elements of another HTML document.
 * <p>
 * All backslashes are removed from the input before parsing. For every {@code script} element
 * whose content contains the marker {@code "html":"}, the text following the first occurrence
 * of the marker, minus the last three characters of the script content, is appended to the result.
 * Scripts where fewer than three characters follow the marker contribute nothing.
 *
 * @param raw_html the outer HTML, possibly containing backslash-escaped nested HTML
 * @return the concatenated nested HTML fragments; an empty string when no script contains the marker
 */
private static String getNestedHtml(String raw_html){
    // Remove every backslash before parsing (presumably to unescape the embedded markup - confirm).
    Document doc = Jsoup.parse(raw_html.replace("\\", ""));
    // Marker that precedes the embedded HTML inside a script body.
    Pattern marker = Pattern.compile("\"html\":\"");
    StringBuilder nestedHtml = new StringBuilder();
    for (Element script : doc.getElementsByTag("script")) {
        String scriptBody = script.html();
        Matcher m = marker.matcher(scriptBody);
        if (!m.find()) {
            continue;
        }
        int start = m.end();
        // The trailing three characters are dropped (presumably closing punctuation - confirm).
        int end = scriptBody.length() - 3;
        // Guard: a marker too close to the end would otherwise raise StringIndexOutOfBoundsException.
        if (end >= start) {
            nestedHtml.append(scriptBody, start, end);
        }
    }
    return nestedHtml.toString();
}
代码示例来源:origin: decaywood/XueQiuSuperSpider
/**
 * Extracts the JSON payload embedded in one of the page's {@code script} elements and parses it.
 * <p>
 * The payload is the text between the marker {@code follows=} and the next {@code ;seajs.use}
 * inside the first data node of the script element at index 15.
 *
 * @param content raw HTML of the page
 * @return the parsed JSON tree
 * @throws IOException if either marker is missing from the script data, or if the JSON cannot be read
 */
private JsonNode parseHtmlToJsonNode(String content) throws IOException {
    final String startMarker = "follows=";
    final String endMarker = ";seajs.use";

    Document doc = Jsoup.parse(content);
    // NOTE(review): the script position (15) is hard-coded; fewer scripts on the page
    // still raises IndexOutOfBoundsException as before.
    String scriptData = doc.getElementsByTag("script")
            .get(15)
            .dataNodes()
            .get(0)
            .attr("data");

    int start = scriptData.indexOf(startMarker);
    if (start < 0) {
        // Previously a missing marker silently deleted the wrong prefix.
        throw new IOException("Marker '" + startMarker + "' not found in script data");
    }
    start += startMarker.length();

    // Search after the start marker, matching the original "delete prefix, then search" order.
    int end = scriptData.indexOf(endMarker, start);
    if (end < 0) {
        // Previously this surfaced as StringIndexOutOfBoundsException from StringBuilder.delete.
        throw new IOException("Marker '" + endMarker + "' not found in script data");
    }
    return mapper.readTree(scriptData.substring(start, end));
}
}
代码示例来源:origin: RipMeApp/ripme
/**
 * Loads the page that follows the given one, based on the {@code offset} and {@code count}
 * attributes of the first {@code posts} element.
 *
 * @param doc the current page
 * @return the next page, or {@code null} when {@code offset + 100} exceeds {@code count}
 * @throws IOException if fetching the next page fails
 */
@Override
public Document getNextPage(Document doc) throws IOException {
    String offsetAttr = doc.getElementsByTag("posts").first().attr("offset");
    int offset = Integer.parseInt(offsetAttr);
    String countAttr = doc.getElementsByTag("posts").first().attr("count");
    int total = Integer.parseInt(countAttr);

    boolean hasMore = offset + 100 <= total;
    if (hasMore) {
        int nextPageIndex = offset / 100 + 1;
        return Http.url(getPage(nextPageIndex)).get();
    }
    return null;
}
代码示例来源:origin: RipMeApp/ripme
/**
 * Fetches the given page and returns the image URL advertised by its {@code og:image} meta tag.
 *
 * @param url address of the page to fetch
 * @return the {@code content} of the first {@code meta} tag whose {@code property} is
 *         {@code og:image}, with any {@code ?h=<digits>} part removed; an empty string if none is found
 * @throws IOException if the page cannot be fetched
 */
private String vscoImageToURL(String url) throws IOException{
    Document page = Jsoup.connect(url).userAgent(USER_AGENT).get();

    String imageUrl = "";
    // Only the first og:image meta tag is used (a single image is expected per page).
    for (Element meta : page.getElementsByTag("meta")) {
        if (!meta.attr("property").equals("og:image")) {
            continue;
        }
        // Drop the "?h=xxx" part of the URL, where each x is a digit.
        imageUrl = meta.attr("content").replaceAll("\\?h=[0-9]+", "");
        LOGGER.debug("Found image URL: " + imageUrl);
        break;
    }

    // An empty result means the site layout changed and this lookup needs fixing.
    if (imageUrl.isEmpty()) {
        LOGGER.error("Could not find image URL at: " + url);
    }
    return imageUrl;
}
代码示例来源:origin: RipMeApp/ripme
/**
 * Fetches a post page and returns the target of its "Download" link.
 * <p>
 * Sleeps for one second before the request. The link's {@code href} is prefixed with
 * {@code https:} (assumes the href is protocol-relative - confirm).
 *
 * @param url address of the post page
 * @return {@code "https:"} plus the {@code href} of the first anchor whose text is exactly
 *         {@code Download}; {@code null} if there is no such anchor or the page cannot be loaded
 */
private String getImageFromPost(String url) {
    sleep(1000);
    try {
        Document page = Http.url(url).cookies(cookies).get();
        for (Element link : page.getElementsByTag("a")) {
            if (link.text().equals("Download")) {
                String href = link.attr("href");
                LOGGER.info("Found image " + href);
                return "https:" + href;
            }
        }
    } catch (IOException e) {
        // Previously swallowed silently; keep the best-effort null result but record the failure.
        LOGGER.error("Failed to load post page " + url, e);
    }
    return null;
}
代码示例来源:origin: iMeiji/Toutiao
Document doc = Jsoup.parse(HTML);
Elements scripts = doc.getElementsByTag("script");
for (Element e : scripts) {
代码示例来源:origin: loklak/loklak_server
/**
 * Matches the {@code instaJsonData} pattern against the combined content of the page's
 * {@code script} elements and wraps the matched region in a {@code Post}.
 *
 * @param br  reader supplying the HTML of the Instagram page
 * @param url not used by this method
 * @return a JSONArray holding one {@code Post} built from the matched script data; an empty
 *         array when the page cannot be read or the pattern does not match
 */
public JSONArray scrapeInstagram(BufferedReader br, String url) {
    JSONArray instaProfile = new JSONArray();

    Document htmlPage;
    try {
        htmlPage = Jsoup.parse(this.bufferedReaderToString(br));
    } catch (IOException e) {
        DAO.trace(e);
        // Without a parsed page there is nothing to scrape; previously this fell through to an NPE.
        return instaProfile;
    }

    String script = htmlPage.getElementsByTag("script").html();
    Matcher m = instaJsonData.matcher(script);
    if (!m.find()) {
        // Previously the result of find() was ignored and m.start(1) threw IllegalStateException.
        return instaProfile;
    }

    // The extracted region runs from the start of group 1 up to and including the first
    // character of group 2.
    int start = m.start(1);
    int end = m.start(2) + 1;
    script = script.substring(start, end);

    //TODO: pre-process the posts captured. At present, complete array of posts are output.
    //Only useful data shall be outputted.
    Post instaObj = new Post(script, this.query);
    instaProfile.put(instaObj);
    return instaProfile;
}
代码示例来源:origin: RipMeApp/ripme
Document doc = Http.url(url).get();
Elements metaTags = doc.getElementsByTag("meta");
代码示例来源:origin: org.jsoup/jsoup
/**
 Sets the text of the document's {@code title} element. When a {@code title} element already exists its text is
 replaced; otherwise a new {@code title} element is appended to {@code head}.
 @param title text to use as the title; must not be null
 */
public void title(String title) {
    Validate.notNull(title);
    Element existing = getElementsByTag("title").first();
    if (existing != null) {
        existing.text(title);
        return;
    }
    // no title yet: create one under head
    head().appendElement("title").text(title);
}
代码示例来源:origin: org.jsoup/jsoup
/**
 Merges all elements with the given tag into the first one and makes sure that element is a child of the
 supplied {@code html} element.
 @param tag tag name of the elements to merge
 @param htmlEl element that must end up as the parent of the surviving element
 */
private void normaliseStructure(String tag, Element htmlEl) {
    Elements matches = this.getElementsByTag(tag);
    // NOTE(review): assumes at least one matching element exists (the original comment says it is created beforehand)
    Element primary = matches.first();
    if (matches.size() > 1) {
        // duplicates found: collect their children, drop the duplicates, then re-home the children
        List<Node> orphans = new ArrayList<>();
        int idx = 1;
        while (idx < matches.size()) {
            Node duplicate = matches.get(idx);
            orphans.addAll(duplicate.ensureChildNodes());
            duplicate.remove();
            idx++;
        }
        for (Node orphan : orphans) {
            primary.appendChild(orphan);
        }
    }
    // the surviving element must be parented by <html>
    boolean parentedByHtml = primary.parent().equals(htmlEl);
    if (!parentedByHtml) {
        htmlEl.appendChild(primary); // includes remove()
    }
}
代码示例来源:origin: RipMeApp/ripme
/**
 * Fetches the given page and collects the URLs of its album images and album videos.
 * <p>
 * Images are {@code img} elements with class {@code album-image}; their {@code src} is used.
 * Videos are {@code video} elements with class {@code album-video}; the {@code src} of their
 * first {@code source} descendant is used. Videos without a {@code source} element are skipped.
 *
 * @param url address of the page to fetch
 * @return image URLs followed by video URLs, in document order within each group
 * @throws IOException if the page cannot be fetched or a {@code src} value is not a valid URL
 */
public static List<URL> getURLs(URL url) throws IOException{
    Response resp = Http.url(url)
            .ignoreContentType()
            .response();
    Document doc = resp.parse();
    List<URL> urls = new ArrayList<>();

    // Pictures
    for (Element img : doc.getElementsByTag("img")) {
        if (img.hasClass("album-image")) {
            urls.add(new URL(img.attr("src")));
        }
    }

    // Videos
    for (Element vid : doc.getElementsByTag("video")) {
        if (!vid.hasClass("album-video")) {
            continue;
        }
        Element source = vid.getElementsByTag("source").first();
        if (source == null) {
            // No <source> child: previously this raised a NullPointerException.
            continue;
        }
        urls.add(new URL(source.attr("src")));
    }
    return urls;
}
}
代码示例来源:origin: RipMeApp/ripme
List<String> URLs = new ArrayList<>();
Elements imgs = doc.getElementsByTag("img");
for (Element img : imgs) {
if (img.hasClass("album-image")) {
Elements vids = doc.getElementsByTag("video");
for (Element vid : vids) {
if (vid.hasClass("album-video")) {
e.printStackTrace();
Elements profile_vids = video_page.getElementsByTag("video");
for (Element vid : profile_vids) {
if (vid.hasClass("album-video")) {
代码示例来源:origin: loklak/loklak_server
Elements t;
eventID = htmlPage.getElementsByTag("body").attr("data-event-id");
eventName = htmlPage.getElementsByClass("listing-hero-body").text();
eventDescription = htmlPage.select("div.js-xd-read-more-toggle-view.read-more__toggle-view").text();
imageLink = htmlPage.getElementsByTag("picture").attr("content");
代码示例来源:origin: loklak/loklak_server
articles = blogHTML.getElementsByTag("article");
for (Element article : articles) {
blogPost = new Post();
代码示例来源:origin: 4pr0n/ripme
/**
 * Produces one string per {@code post} element of the page.
 *
 * @param page parsed document containing {@code post} elements
 * @return list of strings of the form {@code <file_url>#<id>}, where the URL comes from
 *         {@code absUrl("file_url")} and the id from the {@code id} attribute
 */
@Override
public List<String> getURLsFromPage(Document page) {
    List<Element> posts = page.getElementsByTag("post");
    List<String> collected = new ArrayList<String>(100);
    for (int i = 0; i < posts.size(); i++) {
        Element post = posts.get(i);
        String entry = post.absUrl("file_url") + "#" + post.attr("id");
        collected.add(entry);
    }
    return collected;
}
代码示例来源:origin: spring-projects/spring-roo
/**
 * Tells whether a whole document is user managed, i.e. whether its first {@code html} element
 * carries the attribute {@code data-z="user-managed"}.
 *
 * @param document the document to inspect
 * @return {@code true} only when the first {@code html} element has {@code data-z} equal to
 *         {@code user-managed}; {@code false} otherwise, including when there is no {@code html} element
 */
@Override
protected boolean isUserManagedDocument(Document document) {
    Elements htmlTags = document.getElementsByTag("html");
    if (htmlTags == null || htmlTags.size() == 0) {
        return false;
    }
    Element root = htmlTags.get(0);
    return root != null
            && root.hasAttr("data-z")
            && root.attr("data-z").equals("user-managed");
}
}
代码示例来源:origin: 4pr0n/ripme
/**
 * Fetches the page after the given one, using the {@code offset} and {@code count} attributes
 * of the first {@code posts} element.
 *
 * @param doc the page currently being processed
 * @return the following page, or {@code null} if {@code offset + 100} is greater than {@code count}
 * @throws IOException if the following page cannot be fetched
 */
@Override
public Document getNextPage(Document doc) throws IOException {
    int currentOffset = Integer.parseInt(
            doc.getElementsByTag("posts").first().attr("offset"));
    int totalCount = Integer.parseInt(
            doc.getElementsByTag("posts").first().attr("count"));
    return (currentOffset + 100 > totalCount)
            ? null
            : Http.url(getPage(currentOffset / 100 + 1)).get();
}
代码示例来源:origin: jbake-org/jbake
/**
 * Rewrites the {@code img} elements found in the body of a content file.
 * <p>
 * The body stored under {@code Attributes.BODY} is parsed as an HTML fragment, every {@code img}
 * element is passed to {@code transformImageSource} together with the document URI, the site host
 * and the "prepend host" flag, and the resulting body HTML is stored back under the same key.
 * Per the original description, image paths are relative to the assets folder, sources starting
 * with {@code http://} or {@code https://} are left alone, and a leading {@code ./} is first
 * replaced with the output file directory; those rules are applied by {@code transformImageSource}.
 *
 * @param fileContents  Map representing file contents; its body entry is replaced in place
 * @param configuration Configuration object supplying the site host and the prepend-host flag
 */
public static void fixImageSourceUrls(Map<String, Object> fileContents, JBakeConfiguration configuration) {
    String bodyHtml = fileContents.get(Attributes.BODY).toString();
    boolean prependHost = configuration.getImgPathPrependHost();
    String host = configuration.getSiteHost();
    String documentUri = getDocumentUri(fileContents);

    Document parsed = Jsoup.parseBodyFragment(bodyHtml);
    for (Element image : parsed.getElementsByTag("img")) {
        transformImageSource(image, documentUri, host, prependHost);
    }

    // body().html() yields only the inner markup, so no <body></body> wrapper is added.
    String rewrittenBody = parsed.body().html();
    fileContents.put(Attributes.BODY, rewrittenBody);
}
内容来源于网络,如有侵权,请联系作者删除!