jsoup 是一款 Java 的 HTML 解析器,可直接解析某個 URL 地址、HTML 文本內容。它提供了一套非常省力的 API,可透過 DOM、CSS 以及類似於 jQuery 的操作方法來取出和操作數據。
jsoup的主要功能如下:
1. 從一個URL、文件或字符串中解析HTML;
2. 使用DOM或CSS選擇器來查找、取出數據;
3. 可操作HTML元素、屬性、文本;
jsoup是基於MIT協議發佈的,可放心使用於商業項目。
主程序爲GrapNews類,實現了從汽車網摘取相關內容的功能。GrapNews有main函數,執行即可。
package net.sinolbs.ycd.news; import java.net.URLEncoder; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; /** * TODO * 2017年5月21日上午12:25:30 */ public class GrapNews { public static boolean isContainChinese(String str) { Pattern p = Pattern.compile("[\u4e00-\u9fa5]"); Matcher m = p.matcher(str); if (m.find()) { return true; } return false; } /** * 從笑話集抓取笑話 * @param size * @param baseUrl * @param domainName * @param newsListClassOrId * @param classOrId * @param newsULIndex * @param newsContentClassOrId * @param titleTagOrClass * @param dateTag * @return */ public static ArrayList<News> getNewsFromJokeji(int size,String baseUrl,String domainName, String newsListClassOrId,int newsULIndex, String newsContentClassOrId,String titleTagOrClass,String dateTag){ ArrayList<News> newsList = new ArrayList<News>(); Document doc; Element element =null; Element title =null; News news = null; try { doc = Jsoup.connect(baseUrl).timeout(10000).get(); element = (Element) doc.getElementsByClass(newsListClassOrId).first(); Elements elements = element.getElementsByTag("li"); if(elements!=null&&elements.size()>0){ for(Element ele:elements){ news = new News(); title = ele.select("a").first(); if(title==null){ continue; } news.setTitle(title.getElementsByTag(titleTagOrClass).text()); if(news.getTitle()==null||news.getTitle().equals("")){ continue; } news.setHref(domainName+title.attr("href")); if(dateTag!=null){ String date=ele.select("i").text(); news.setDate(date); } String newsUrl =news.getHref(); if (isContainChinese(news.getHref())) { newsUrl = URLEncoder.encode(news.getHref(), "utf-8") .toLowerCase().replace("%3a", ":").replace("%2f", "/"); } Document newsDoc = Jsoup.connect(newsUrl).timeout(10000).get(); String text=newsDoc.getElementById(newsContentClassOrId).html(); text=deleteImg(text); 
text=deleteA(text); StringBuffer textBuffer=new StringBuffer(5); textBuffer.append("<!DOCTYPE html><html><head><meta name=\"content-type\" content=\"text/html; charset=UTF-8\">"); textBuffer.append("</head><body>"); textBuffer.append(deleteSource(text)); textBuffer.append("</body></html>"); news.setContent(textBuffer.toString()); news.setContent(textBuffer.toString()); System.out.println("標題====="+news.getTitle()); System.out.println("href====="+news.getHref()); System.out.println("content====="+news.getContent()); newsList.add(news); if(newsList.size()==size){ break; } } } } catch (Exception e) { e.printStackTrace(); } return newsList; } /** * 從汽車之家抓新聞 * @param size * @param baseUrl * @param domainName * @param newsListId * @param newsContentClass * @param titleTagOrClass * @param limitHref * @param dateTag * @param needDeleteAlt * @return */ public static ArrayList<News> getNewsFromCarHome(int size,String baseUrl,String domainName,String newsListId, String newsContentClass,String titleTag,String dateTag,String needDeleteAlt){ ArrayList<News> newsList = new ArrayList<News>(); Document doc; Elements elements =null; Element title =null; News news = null; try { doc = Jsoup.connect(baseUrl).timeout(10000).get(); elements = (Elements) doc.getElementById(newsListId).children(); if(elements!=null&&elements.size()>0){ for(Element ele:elements){ news = new News(); title = ele.select("a").first(); if(title==null){ continue; } news.setTitle(title.getElementsByTag(titleTag).text()); if(news.getTitle()==null||news.getTitle().equals("")){ continue; } news.setHref(domainName+title.attr("href")); if(dateTag!=null){ String date=ele.select("i").text(); news.setDate(date); } String newsUrl =news.getHref(); if (isContainChinese(news.getHref())) { newsUrl = URLEncoder.encode(news.getHref(), "utf-8") .toLowerCase().replace("%3a", ":").replace("%2f", "/"); } Document newsDoc = Jsoup.connect(newsUrl).timeout(10000).get(); String text=newsDoc.getElementsByClass(newsContentClass).html(); 
if(text.indexOf("餘下全文")>0||text.indexOf("未經許可")>0 ||text.indexOf("禁止轉載")>0||text.indexOf("公衆號")>0||text.indexOf("公衆帳號")>0){ continue; } text=replaceImgSrcFromDataSrc(text,true,needDeleteAlt); int index=text.lastIndexOf("("); if(index>0){ text=text.substring(0,index); } StringBuffer textBuffer=new StringBuffer(5); textBuffer.append("<!DOCTYPE html><html><head><meta name=\"content-type\" content=\"text/html; charset=UTF-8\">"); textBuffer.append("</head><body>"); textBuffer.append(deleteSource(text)); textBuffer.append("</body></html>"); news.setContent(textBuffer.toString()); news.setContent(textBuffer.toString()); System.out.println("標題====="+news.getTitle()); System.out.println("href====="+news.getHref()); System.out.println("content====="+news.getContent()); newsList.add(news); if(newsList.size()==size){ break; } } } } catch (Exception e) { e.printStackTrace(); } return newsList; } public static String getVideoFromMiaoPai(String baseUrl) throws Exception{ Document doc= Jsoup.connect(baseUrl).timeout(10000).get(); String html=doc.html().trim(); return getUrlFromMiaoPaiHtml(html); } public static String getUrlFromMiaoPaiHtml(String html){ int startIndex=html.indexOf("videoSrc"); int endIndex=html.indexOf("poster"); String videoUrl=html.substring(startIndex+11,endIndex+5); int index=videoUrl.indexOf('"'); if(index>0){ return videoUrl.substring(0, index); } return videoUrl; } public static String getVideoPhotoFromMiaoPaiHtml(String html){ System.out.println(html); int startIndex=html.indexOf("poster"); int index=html.substring(startIndex).indexOf("jpg"); return html.substring(startIndex+9,startIndex+index+3); } public static void main(String[] args) throws Exception{ getNewsFromCarHome(2,"http://m.autohome.com.cn/channel","http://m.autohome.com.cn","list","details","h4","time","汽車之家"); getNewsFromJokeji(3,"http://www.jokeji.cn/list.htm","http://www.jokeji.cn","list_title",1,"text110","a","i"); getNewsFromSouHu(20,"http://m.sohu.com/c/1592/","a",null,null); } /** * 
從秒拍抓視頻 * @param size * @param baseUrl * @param domainName * @param newsListId * @param newsContentClass * @param titleTagOrClass * @param limitHref * @param dateTag * @param needDeleteAlt * @return */ public static ArrayList<News> getVideoFromMiaopai(int size,String baseUrl){ ArrayList<News> newsList = new ArrayList<News>(); try { News news = null; Element videoEmement=null; Document doc = null; String videoUrl=null; doc = Jsoup.connect(baseUrl).timeout(10000).get(); Elements elements = doc.getElementsByClass("videoCont"); String videoDetailUrl=""; if(elements!=null&&elements.size()>0){ for(Element ele:elements){ videoEmement=ele.getElementsByClass("MIAOPAI_player").first(); String videoId=videoEmement.attr("data-scid").toString(); String videoPhotoUrl=videoEmement.attr("data-img").toString(); String videoTitle=ele.getElementsByClass("viedoAbout").first().getElementsByTag("p").text(); System.out.println("視頻id"+videoId); System.out.println("視頻封面url"+videoPhotoUrl); System.out.println("視頻標題"+videoTitle); news = new News(); if(videoId!=null){ news.setTitle(videoTitle); videoDetailUrl="http://www.miaopai.com/show/"+videoId+".html"; doc = Jsoup.connect("http://www.miaopai.com/show/"+videoId+".html").timeout(10000).get(); System.out.println("視頻詳情url========"+videoDetailUrl); news.setHref("http://m.miaopai.com/show/"+videoId); news.setPhotoUrl(videoPhotoUrl); } if(doc!=null){ videoUrl=getUrlFromMiaoPaiHtml(doc.html()); } if(videoUrl!=null){ news.setContent(createVideoHtml(videoUrl, videoPhotoUrl)); System.out.println("視頻url====="+videoUrl); System.out.println("視頻html======"+news.getContent()); newsList.add(news); } } } } catch (Exception e) { e.printStackTrace(); } return newsList; } public static String createVideoHtml(String videoUrl,String videoPhotoUrl) { Document doc; StringBuffer textBuffer = new StringBuffer(5); textBuffer.append("<!DOCTYPE html><html><head><meta name=\"content-type\" content=\"text/html; charset=UTF-8\">"); textBuffer.append("</head><body>"); 
textBuffer.append("<div align=\"center\">"); textBuffer.append(" <video></video> </div>"); textBuffer.append("</body></html>"); doc = Jsoup.parse(textBuffer.toString()); doc.getElementsByTag("body").attr("style", "height:400px;"); doc.getElementsByTag("video").attr("style", "width:100%;max-height:400px;") .attr("poster", videoPhotoUrl).attr("autoplay", "autoplay") .attr("controls", "controls").attr("src", videoUrl); return doc.toString(); } /** * 從搜狐抓新聞 * @param size * @param baseUrl * @param domainName * @param newsListId * @param newsContentClass * @param titleTagOrClass * @param limitHref * @param dateTag * @param needDeleteAlt * @return */ public static ArrayList<News> getNewsFromSouHu(int size,String baseUrl, String titleTag,String dateTag,String needDeleteAlt){ ArrayList<News> newsList = new ArrayList<News>(); Document doc; Element element =null; Element title =null; News news = null; try { doc = Jsoup.connect(baseUrl).timeout(10000).get(); element =doc.getElementsByTag("section").get(2); element = element.getElementsByClass("headlines").get(0); Elements elements=element.children(); if(elements!=null&&elements.size()>0){ for(Element ele:elements){ news = new News(); title = ele.select("a").first(); if(title==null){ continue; } news.setTitle(title.getElementsByTag(titleTag).text()); if(news.getTitle()==null||news.getTitle().equals("") ||news.getTitle().indexOf("廣告")>0||news.getTitle().indexOf("視頻")>0){ continue; } news.setHref("https://m.sohu.com"+title.attr("href")); if(dateTag!=null){ String dateStr=ele.select(dateTag).first().text(); news.setDate(dateStr); } String newsUrl =news.getHref(); if (isContainChinese(news.getHref())) { newsUrl = URLEncoder.encode(news.getHref(), "utf-8") .toLowerCase().replace("%3a", ":").replace("%2f", "/"); } Document newsDoc = Jsoup.connect(newsUrl).timeout(10000).get(); String text=newsDoc.getElementsByTag("article").html(); if(text.indexOf("未經許可")>0||text.indexOf("禁止轉載")>0 ||text.indexOf("公衆號")>0||text.indexOf("公衆帳號")>0){ 
continue; } int index=text.indexOf("<p class=\"para\">"); int lastIndex=text.indexOf("<div class=\"expend-wp\"> "); if(lastIndex>0){ text=text.substring(index,lastIndex); }else if(index>0){ text=text.substring(index,text.length()); } text=replaceImgSrcFromDataSrc(text,true,null); if(text==null||text.length()==0){ continue; } StringBuffer textBuffer=new StringBuffer(5); textBuffer.append("<!DOCTYPE html><html><head>" + "<meta name=\"content-type\" content=\"text/html; charset=UTF-8\">"); textBuffer.append("</head><body>"); textBuffer.append(deleteSource(text)); textBuffer.append("</body></html>"); news.setContent(textBuffer.toString()); news.setContent(textBuffer.toString()); System.out.println("標題====="+news.getTitle()); System.out.println("href====="+news.getHref()); System.out.println("content====="+news.getContent()); newsList.add(news); if(newsList.size()==size){ break; } } } } catch (Exception e) { e.printStackTrace(); } return newsList; } private static String deleteImg(String text) { return text.replaceAll("<img [^>]*>", ""); } private static String deleteA(String text) { return text.replaceAll("<a[^>]*>(.*?)</a>", ""); } private static String deleteSource(String text) { return text.replaceAll("\\(.*?\\)|\\[.*?]", ""); } /** * 刪除a標籤中的href * @param content * @return */ public static String removeHref(String content){ Document document = Jsoup.parse(content); Elements elements = document.select("a[href]"); for(Element el:elements){ el.removeAttr("href"); } return document.html(); } /** * 將htmlBody中全部img標籤中的src內容替換爲原data-src的內容, <br/> * 若是不報含data-src,則src的內容不會被替換 <br/> * @param htmlBody html內容 * @param needDeleteAlt 須要剔除的圖片的alt信息 * @param imgUrlNeedAddProtocolPrefix 圖片的url是否須要添加http協議前綴 * @return 返回替換後的內容 */ public static String replaceImgSrcFromDataSrc(String htmlBody, boolean imgUrlNeedAddProtocolPrefix,String needDeleteAlt) { Document document = Jsoup.parseBodyFragment(htmlBody); List<Element> nodes = document.select("img"); int nodeLenth = nodes.size(); 
if(nodeLenth==0){ return htmlBody; } for (int i = 0; i < nodeLenth; i++) { Element e = nodes.get(i); String dataSrc = e.attr("data-src"); if (isNotBlank(dataSrc)) { e.attr("src", dataSrc); e.removeAttr("data-src"); } String originalSrc = e.attr("original"); if (isNotBlank(originalSrc)) { e.attr("src", "http:"+originalSrc); e.removeAttr("originalSrc"); } String originalHiddenSrc = e.attr("original-hidden"); if (isNotBlank(originalHiddenSrc)) { e.attr("src", "http:"+originalHiddenSrc); e.removeAttr("original-hidden"); } } if (htmlBody.contains("<html>")) { if(needDeleteAlt==null&&!imgUrlNeedAddProtocolPrefix){ return document.toString(); }else if(needDeleteAlt==null&&imgUrlNeedAddProtocolPrefix){ return document.toString().replace("src=\"//", "src=\"http://"); }else if(needDeleteAlt!=null&&imgUrlNeedAddProtocolPrefix){ return document.toString().replace("src=\"//", "src=\"http://") .replace("alt="+needDeleteAlt, ""); } return document.toString().replace("alt="+needDeleteAlt, ""); } else { if(needDeleteAlt==null&&!imgUrlNeedAddProtocolPrefix){ return document.select("body>*").toString(); }else if(needDeleteAlt==null&&imgUrlNeedAddProtocolPrefix){ return document.select("body>*").toString().replace("src=\"//", "src=\"http://"); }else if(needDeleteAlt!=null&&imgUrlNeedAddProtocolPrefix){ return document.select("body>*"). toString().replace("src=\"//", "src=\"http://").replace("alt="+needDeleteAlt, ""); } return document.select("body>*").toString().replace("alt="+needDeleteAlt, ""); } } private static boolean isNotBlank(String str){ if(str == null) return false; else if(str.trim().length() == 0) return false; else return true; } }
還有一個載體類,用於把爬下來的網頁內容封裝到一個類裏面。
package net.sinolbs.ycd.news; /** * 新聞數據載體 */ public class News { private int id; private String title; private String href; private String content; private String date; private String photoUrl; public News() { } public News(String title, String href, String content, int id) { this.title = title; this.content = content; this.href = href; this.id = id; } public int getId() { return id; } public void setId(int id) { this.id = id; } public String getTitle() { return title; } public void setTitle(String title) { this.title = title; } public String getHref() { return href; } public void setHref(String href) { this.href = href; } public String getContent() { return content; } public void setContent(String content) { this.content = content; } public String getDate() { return date; } public void setDate(String date) { this.date = date; } public String getPhotoUrl() { return photoUrl; } public void setPhotoUrl(String photoUrl) { this.photoUrl = photoUrl; } }
運行GrapNews類(有main方法)。