org.jsoup.Jsoup类的使用及代码示例

x33g5p2x  于2022-01-21 转载在 其他  
字(13.3k)|赞(0)|评价(0)|浏览(302)

本文整理了Java中org.jsoup.Jsoup类的一些代码示例,展示了Jsoup类的具体用法。这些代码示例主要来源于Github/Stackoverflow/Maven等平台,是从一些精选项目中提取出来的代码,具有较强的参考意义,能在一定程度上帮助到你。Jsoup类的具体详情如下:
包路径:org.jsoup.Jsoup
类名称:Jsoup

Jsoup介绍

[英]The core public access point to the jsoup functionality.
[中]jsoup功能的核心公共访问入口。

代码示例

代码示例来源:origin: loklak/loklak_server

  1. /**
  2. * This is a helper function that helps user to extract html nested inside of html script
  3. * @param raw_html
  4. * @return nested html String
  5. */
  6. private static String getNestedHtml(String raw_html){
  7. String html = raw_html.replace("\\","");
  8. Document doc = Jsoup.parse(html);
  9. //get the script tag
  10. Elements scripts = doc.getElementsByTag("script");
  11. //pattern for extracting html
  12. Pattern pttrn = Pattern.compile("\"html\":\"");
  13. String nested_html = "";
  14. for (Element script:scripts){
  15. Matcher m = pttrn.matcher(html = script.html());
  16. if(m.find()){
  17. nested_html += html.substring(m.end(), html.length() -3);
  18. }
  19. }
  20. return nested_html;
  21. }

代码示例来源:origin: ChinaSilence/any-video

  1. private String getOpenId(String accessToken) throws IOException{
  2. String url = openIdUri + accessToken;
  3. Document document = Jsoup.connect(url).get();
  4. String resultText = document.text();
  5. Matcher matcher = Pattern.compile("\"openid\":\"(.*?)\"").matcher(resultText);
  6. if (matcher.find()){
  7. return matcher.group(1);
  8. }
  9. return null;
  10. }

代码示例来源:origin: JpressProjects/jpress

  1. public static String getFirstImageSrc(String html) {
  2. if (StrUtils.isBlank(html))
  3. return null;
  4. Elements es = Jsoup.parseBodyFragment(html).select("img");
  5. if (es != null && es.size() > 0) {
  6. String src = es.first().attr("src");
  7. return StrUtils.isBlank(src) ? null : src;
  8. }
  9. return null;
  10. }

代码示例来源:origin: RipMeApp/ripme

  1. if ((url.getHost().endsWith("imgur.com"))
  2. && url.toExternalForm().contains("imgur.com/a/")) {
  3. try {
  4. logger.debug("Fetching imgur album at " + url);
  5. else if (url.getHost().endsWith("imgur.com") && url.toExternalForm().contains(",")) {
  6. Pattern p = Pattern.compile("https?://i.reddituploads.com/([a-zA-Z0-9]+)\\?.*");
  7. Matcher m = p.matcher(url.toExternalForm());
  8. if (m.matches()) {
  9. logger.info("URL: " + url.toExternalForm());
  10. String u = url.toExternalForm().replaceAll("&", "&");
  11. try {
  12. Document doc = Jsoup.connect(url.toExternalForm())
  13. .userAgent(AbstractRipper.USER_AGENT)
  14. .get();
  15. for (Element el : doc.select("meta")) {
  16. if (el.attr("property").equals("og:video")) {
  17. result.add(new URL(el.attr("content")));
  18. return result;
  19. else if (el.attr("name").equals("twitter:image:src")) {
  20. result.add(new URL(el.attr("content")));
  21. return result;

代码示例来源:origin: RipMeApp/ripme

  1. Pattern p;
  2. Matcher m;
  3. p = Pattern.compile(IMAGE_PATTERN);
  4. Pattern qualP = Pattern.compile("_[0-9]+\\.(jpg|png|gif|bmp)$");
  5. Matcher qualM;
  6. m = p.matcher(fileURL.toString());
  7. if (m.matches()) {
  8. downloadURL(fileURL, date);
  9. } else {
  10. fileURL = new URL(post.getString("video_url").replaceAll("http:", "https:"));
  11. downloadURL(fileURL, date);
  12. } catch (Exception e) {
  13. Document d = Jsoup.parse(post.getString("body"));
  14. if (!d.select("img").attr("src").isEmpty()) {
  15. try {
  16. String imgSrc = d.select("img").attr("src");
  17. qualM = qualP.matcher(imgSrc);
  18. imgSrc = qualM.replaceFirst("_1280.$1");
  19. downloadURL(new URL(imgSrc), date);
  20. } catch (MalformedURLException e) {

代码示例来源:origin: loklak/loklak_server

  1. Post githubProfile = new GithubPost(profile, 0);
  2. try {
  3. html = Jsoup.parse(bufferedReaderToString(br));
  4. } catch (IOException e) {
  5. DAO.trace(e);
  6. String avatarUrl = html.getElementsByAttributeValueContaining("class", "avatar").attr("src");
  7. Pattern avatarUrlToUserId = Pattern.compile(".com\\/u\\/([0-9]+)\\?");
  8. Matcher m = avatarUrlToUserId.matcher(avatarUrl);
  9. m.find();
  10. userId = m.group(1);
  11. githubProfile.put("user_id", userId);
  12. githubProfile.put("post_type", "user");
  13. githubProfile.put("avatar_url", "https://avatars0.githubusercontent.com/u/" + userId);
  14. String email = html.getElementsByAttributeValueContaining("itemprop", "email").text();
  15. if (!email.contains("@")) {
  16. email = "";
  17. String specialLink = html.getElementsByAttributeValueContaining("itemprop", "url").text();
  18. githubProfile.put("special_link", specialLink);
  19. Elements joiningDates = html.getElementsByAttributeValueContaining("class", "dropdown-item");
  20. for (Element joiningDate: joiningDates) {
  21. String joinDate = joiningDate.attr("href");
  22. if (joinDate.contains("join")) {
  23. joinDate = joinDate.substring(joinDate.length() - 10);

代码示例来源:origin: TEAMMATES/teammates

  1. @Test
  2. public void testTimezoneDatabasesAreUpToDate() {
  3. // ensure the timezone databases are up-to-date
  4. String currentTzVersion = Jsoup.parse(browser.driver.getPageSource()).getElementById("tzversion-moment").text();
  5. browser.driver.get(IANA_TIMEZONE_DATABASE_URL);
  6. Document tzReleasePage = Jsoup.parse(browser.driver.getPageSource());
  7. String latestTzVersion = tzReleasePage.getElementById("version").text();
  8. if (!currentTzVersion.equals(latestTzVersion)) {
  9. // find the release day
  10. String releaseDateString = tzReleasePage.getElementById("date").text();
  11. Pattern datePattern = Pattern.compile("\\(Released (.+)\\)");
  12. Matcher matcher = datePattern.matcher(releaseDateString);
  13. assertTrue(matcher.find());
  14. LocalDate releaseDate = LocalDate.parse(matcher.group(1), DateTimeFormatter.ofPattern("yyyy-MM-dd"));
  15. LocalDate nowDate = Instant.now().atZone(Const.DEFAULT_TIME_ZONE).toLocalDate();
  16. assertTrue(
  17. "The timezone database version is not up-to-date for more than " + DAYS_TO_UPDATE_TZ + " days,"
  18. + " please update them according to the maintenance guide.",
  19. releaseDate.plusDays(DAYS_TO_UPDATE_TZ).isAfter(nowDate));
  20. }
  21. }

代码示例来源:origin: huxq17/SwipeCardsView

  1. private int getCount(String html) {
  2. Document doc = Jsoup.parse(html);
  3. Elements pages = doc.select("span");
  4. Element page = pages.get(10);
  5. Pattern p = Pattern.compile("[\\d*]");
  6. Matcher m = p.matcher(page.toString());
  7. StringBuffer stringBuffer = new StringBuffer();
  8. while (m.find()) {
  9. stringBuffer.append(m.group());
  10. }
  11. return Integer.parseInt(stringBuffer.toString());
  12. }

代码示例来源:origin: jeremylong/DependencyCheck

  1. if (HTML_DETECTION_PATTERN.matcher(description).find()) {
  2. desc = Jsoup.parse(description).text();
  3. } else {
  4. desc = description;

代码示例来源:origin: iMeiji/Toutiao

  1. private Boolean parseHTML(String HTML) {
  2. boolean flag = false;
  3. Document doc = Jsoup.parse(HTML);
  4. Elements scripts = doc.getElementsByTag("script");
  5. for (Element e : scripts) {
  6. String script = e.toString();
  7. if (script.contains("BASE_DATA.galleryInfo")) {
  8. script = e.childNode(0).toString();
  9. Matcher matcher = Pattern.compile("(JSON.parse\\(\\\".+\\))").matcher(script);
  10. while (matcher.find()) {
  11. int count = matcher.groupCount();
  12. if (count >= 1) {
  13. int start = script.indexOf("(");

代码示例来源:origin: RipMeApp/ripme

  1. public static ImgurAlbum getImgurAlbum(URL url) throws IOException {
  2. String strUrl = url.toExternalForm();
  3. if (!strUrl.contains(",")) {
  4. strUrl += "/all";
  5. String newUrl = url.toExternalForm() + "/noscript";
  6. LOGGER.info(" Retrieving " + newUrl);
  7. doc = Jsoup.connect(newUrl)
  8. .userAgent(USER_AGENT)
  9. .get();
  10. for (Element thumb : doc.select("div.image")) {
  11. String image;
  12. if (!thumb.select("a.zoom").isEmpty()) {
  13. image = "http:" + thumb.select("a").attr("href");
  14. } else if (!thumb.select("img").isEmpty()) {
  15. image = "http:" + thumb.select("img").attr("src");
  16. } else {
  17. image = image.replace(".gif", ".mp4");
  18. ImgurImage imgurImage = new ImgurImage(new URL(image));
  19. imgurAlbum.addImage(imgurImage);

代码示例来源:origin: mygithuball/any-video

  1. private boolean related(String url) {
  2. try {
  3. Document document = Jsoup.connect(url).get();
  4. String reg = String.format(FRIEND_LINK_HTML_REG, appDomain, appName);
  5. String html = document.html();
  6. Matcher matcher = Pattern.compile(reg).matcher(html);
  7. return matcher.find();
  8. } catch (IOException e) {
  9. log.info("Add Friend Link Error, url:" + url);
  10. e.printStackTrace();
  11. }
  12. return false;
  13. }

代码示例来源:origin: loklak/loklak_server

  1. /**
  2. * Method to match the given pattern with extracted elements of html page
  3. * and parse the result for the posts on the given instagram page
  4. * @return instaProfile as a JSONArray object containing all posts and details of viewer
  5. */
  6. public JSONArray scrapeInstagram(BufferedReader br, String url) {
  7. Document htmlPage = null;
  8. Post instaObj = null;
  9. JSONArray instaProfile = new JSONArray();
  10. try {
  11. htmlPage = Jsoup.parse(this.bufferedReaderToString(br));
  12. } catch (IOException e) {
  13. DAO.trace(e);
  14. }
  15. String script = htmlPage.getElementsByTag("script").html();
  16. Matcher m = instaJsonData.matcher(script);
  17. m.find();
  18. int start = m.start(1);
  19. int end = m.start(2) + 1;
  20. script = script.substring(start, end);
  21. //TODO: pre-process the posts captured. At present, complete array of posts are output.
  22. //Only useful data shall be outputted.
  23. instaObj = new Post(script, this.query);
  24. instaProfile.put(instaObj);
  25. return instaProfile;
  26. }

代码示例来源:origin: bonigarcia/webdrivermanager

  1. String driverStr = driverUrl.toString();
  2. String driverUrlContent = driverUrl.getPath();
  3. org.jsoup.nodes.Document doc = Jsoup.parse(in, null, "");
  4. Iterator<org.jsoup.nodes.Element> iterator = doc.select("a")
  5. .iterator();
  6. List<URL> urlList = new ArrayList<>();
  7. String link = iterator.next().attr("href");
  8. if (link.contains("mirror") && link.endsWith(SLASH)) {
  9. urlList.addAll(getDriversFromMirror(new URL(
  10. driverStr + link.replace(driverUrlContent, ""))));
  11. } else if (link.startsWith(driverUrlContent)

代码示例来源:origin: HubSpot/jinjava

  1. @Override
  2. public Object filter(Object object, JinjavaInterpreter interpreter, String... arg) {
  3. if (!(object instanceof String)) {
  4. return object;
  5. }
  6. String val = interpreter.renderFlat((String) object);
  7. String strippedVal = Jsoup.parseBodyFragment(val).text();
  8. String normalizedVal = WHITESPACE.matcher(strippedVal).replaceAll(" ");
  9. return normalizedVal;
  10. }

代码示例来源:origin: magefree/mage

  1. if (proxyType == ProxyType.NONE) {
  2. urlDocument = pageUrl;
  3. doc = Jsoup.connect(urlDocument).get();
  4. } else {
  5. String proxyServer = prefs.get("proxyAddress", "");
  6. int proxyPort = Integer.parseInt(prefs.get("proxyPort", "0"));
  7. URL url = new URL(pageUrl);
  8. Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(proxyServer, proxyPort));
  9. HttpURLConnection uc = (HttpURLConnection) url.openConnection(proxy);
  10. tmp.append(line);
  11. doc = Jsoup.parse(String.valueOf(tmp));
  12. Elements cardsImages = doc.select("img[src^=cards/]"); // starts with cards/
  13. if (!aliasesStart.isEmpty()) {
  14. for (String text : aliasesStart) {
  15. cardsImages.addAll(doc.select("img[src^=" + text + ']'));
  16. String cardLink = cardsImage.attr("src");
  17. String cardName = null;
  18. if (cardLink.startsWith("cards/") && cardLink.endsWith(".jpg")) {

代码示例来源:origin: magefree/mage

  1. public static Document downloadHtmlDocument(String urlString) throws NumberFormatException, IOException {
  2. Preferences prefs = MageFrame.getPreferences();
  3. Connection.ProxyType proxyType = Connection.ProxyType.valueByText(prefs.get("proxyType", "None"));
  4. Document doc;
  5. if (proxyType == ProxyType.NONE) {
  6. doc = Jsoup.connect(urlString).timeout(60 * 1000).get();
  7. } else {
  8. String proxyServer = prefs.get("proxyAddress", "");
  9. int proxyPort = Integer.parseInt(prefs.get("proxyPort", "0"));
  10. URL url = new URL(urlString);
  11. Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(proxyServer, proxyPort));
  12. HttpURLConnection uc = (HttpURLConnection) url.openConnection(proxy);
  13. uc.setConnectTimeout(10000);
  14. uc.setReadTimeout(60000);
  15. uc.connect();
  16. String line;
  17. StringBuffer tmp = new StringBuffer();
  18. BufferedReader in = new BufferedReader(new InputStreamReader(uc.getInputStream()));
  19. while ((line = in.readLine()) != null) {
  20. tmp.append(line);
  21. }
  22. doc = Jsoup.parse(String.valueOf(tmp));
  23. }
  24. return doc;
  25. }
  26. }

代码示例来源:origin: asciidoctor/asciidoctorj

  1. @Test
  2. public void test() throws Exception {
  3. HttpURLConnection conn = (HttpURLConnection) new URL(url, "asciidoctor").openConnection();
  4. conn.setDoOutput(true);
  5. conn.setRequestMethod("POST");
  6. conn.getOutputStream().write("Hello World".getBytes());
  7. byte[] buf = new byte[65535];
  8. try (InputStream in = conn.getInputStream()) {
  9. final Document doc = Jsoup.parse(readFull(in));
  10. final Element first = doc.body().children().first();
  11. assertEquals("div", first.tagName());
  12. assertEquals("paragraph", first.className());
  13. final Element paragraph = first.children().first();
  14. assertEquals("p", paragraph.tagName());
  15. assertEquals("Hello World", paragraph.ownText());
  16. }
  17. }

代码示例来源:origin: kriegaex/Galileo-Openbook-Cleaner

  1. private static SortedSet<String> getWebSiteURLs() throws Exception {
  2. Document webPage;
  3. Elements downloadLinks;
  4. SortedSet<String> webSiteURLs = new TreeSet<>();
  5. webPage = Jsoup.parse(new URL("https://www.rheinwerk-verlag.de/openbook/"), 10000);
  6. downloadLinks = webPage.select("a.btn-primary");
  7. for (Element link : downloadLinks) {
  8. webSiteURLs.add(link.attr("href").replaceAll(".*/", ""));
  9. }
  10. return webSiteURLs;
  11. }

代码示例来源:origin: decaywood/XueQiuSuperSpider

  1. private void initMap() throws Exception {
  2. industryMap = new HashMap<>();
  3. String target = URLMapper.COMPREHENSIVE_PAGE.toString();
  4. String content = request(new URL(target));
  5. Document doc = Jsoup.parse(content);
  6. Elements element = doc.getElementsByClass("second-nav")
  7. .get(1).children()
  8. .get(3).children()
  9. .get(3).children()
  10. .select("a");
  11. StringBuilder builder = new StringBuilder();
  12. for (Element ele : element) {
  13. if (!ele.hasAttr("title") || !ele.hasAttr("href")) continue;
  14. builder.append(ele.attr("href"));
  15. industryMap.put(ele.attr("title"), new Industry(ele.attr("title"), builder.toString()));
  16. builder.delete(0, builder.length());
  17. }
  18. }

相关文章