A project needed to crawl search result data from Sina Weibo search, so I put together a tool that automatically crawls the search results for a configurable set of keywords. Sharing it here.

First, take a look at the page source of a Sina Weibo search results page:

As you can see, the response is not plain HTML: the content is all injected through JS calls, and the Chinese text in it is encoded. Every text element arrives inside a STK.pageletM.view(...) script block, so to get the search results you have to parse the text of those blocks. The code uses the jsoup and fastjson jar packages, which you need to download yourself.
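Since the screenshot of the page source is not reproduced here, the snippet below sketches roughly what one of those script blocks looks like and how the regex used later in WeiboFetcher pulls the JSON payload out of it. The sample string is hand-written for illustration only, not a captured response.

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class BlockFormatDemo {
    public static void main(String[] args) {
        // Hand-written stand-in for one of the <script> blocks on the search page;
        // a real block carries the full rendered result markup in its "html" field.
        String source = "<script>STK && STK.pageletM && STK.pageletM.view("
                + "{\"pid\":\"pl_weibo_feed\",\"js\":\"\",\"css\":\"\","
                + "\"html\":\"<div class=\\\"search_feed\\\">...</div>\"})</script>";

        Pattern pattern = Pattern.compile(
                "<script>STK\\s&&\\sSTK\\.pageletM\\s&&\\sSTK\\.pageletM\\.view\\(.*\\)");
        Matcher m = pattern.matcher(source);
        while (m.find()) {
            String jsonStr = m.group();
            // Cut the match down to the JSON object passed to view(...)
            jsonStr = jsonStr.substring(jsonStr.indexOf("{"), jsonStr.lastIndexOf(")"));
            System.out.println(jsonStr);
        }
    }
}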
jsoup: http://jsoup.org/download
fastjson: http://sourceforge.net/projects/fastjson

The core class that crawls the search results:
import java.io.IOException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.solr.common.SolrInputDocument;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.safety.Whitelist;
import org.jsoup.select.Elements;

import com.alibaba.fastjson.JSON;
public class WeiboFetcher extends AbstractFetcher {

    // Regex matching the script blocks that carry the result markup
    private final String blockRegex = "<script>STK\\s&&\\sSTK\\.pageletM\\s&&\\sSTK\\.pageletM\\.view\\(.*\\)";
    private Pattern pattern = Pattern.compile(blockRegex);

    private static Whitelist whitelist = new Whitelist();

    static {
        // Keep only the text of <em> tags when cleaning the content
        whitelist.addTags("em");
    }

    @Override
    public List<SolrInputDocument> fetch() {
        List<SolrInputDocument> newsResults = new ArrayList<SolrInputDocument>();
        newsResults = WeiboResult();
        System.out.println("WeiboFetcher Over: " + newsResults.size());
        return newsResults;
    }
    /**
     * Fetch the search results for every configured keyword.
     * @return
     */
    private List<SolrInputDocument> WeiboResult() {
        String keyWord = null;
        List<SolrInputDocument> newsResultList = new ArrayList<SolrInputDocument>();
        // Load the configured keywords
        List<String> keyWordList = KeywordReader.getInstance().getKeywords();
        for (String keyWordLine : keyWordList) {
            // Convert the line into the format accepted by Sina Weibo search
            keyWord = policy.getKeyWord(keyWordLine, null);
            newsResultList.addAll(getWeiboContent(keyWord));
        }
        return newsResultList;
    }
    /**
     * Fetch the search results for a single keyword.
     * @param keyWord
     * @return
     */
    private List<SolrInputDocument> getWeiboContent(String keyWord) {
        System.out.println("fetch keyword: " + keyWord);
        List<SolrInputDocument> resultList = new ArrayList<SolrInputDocument>();
        for (int i = 0; i < depth; i++) {
            String page = "";
            if (i > 0) {
                page = "&page=" + (i + 1);
            }
            // Each fetch returns 50 items
            try {
                System.out.println("fetch url page depth " + (i + 1));
                // Note the &nodup=1 parameter
                Document doc = Jsoup.connect(
                        "http://s.weibo.com/weibo/" + keyWord + "&nodup=1" + page).get();
                String source = doc.html();
                // Match the text blocks
                Matcher m = pattern.matcher(source);
                while (m.find()) {
                    String jsonStr = m.group();
                    jsonStr = jsonStr.substring(jsonStr.indexOf("{"), jsonStr.lastIndexOf(")"));
                    // Parse the JSON and map it onto the entity class
                    WeiboBlock block = JSON.parseObject(jsonStr, WeiboBlock.class);
                    if (block.getHtml().trim().startsWith("<div class=\"search_feed\">")) {
                        doc = Jsoup.parse(block.getHtml());
                    }
                }
                List<Element> elements = getAllElement(doc);
                if (elements == null || elements.size() == 0) {
                    System.out.println("No more urls to fetch with current keyword.");
                    return resultList;
                }
                for (Element elem : elements) {
                    String url = elem.select(".date").last().attr("href");
                    String dateS = elem.select(".date").last().attr("date");
                    String content = null;
                    Date date = null;
                    String title = null;
                    if (!isCrawledUrl(url)) {
                        if (url != null) {
                            if (dateS != null && !"".equals(dateS)) {
                                try {
                                    date = sdf.parse(changeString2Date(dateS));
                                } catch (ParseException e) {
                                    e.printStackTrace();
                                }
                            }
                            if (date != null) {
                                // Remove the info bar elements, then strip the markup from the content
                                elem.getElementsByClass("info W_linkb W_textb").remove();
                                content = Jsoup.clean(
                                        Jsoup.clean(elem.select(".content").html(), whitelist),
                                        Whitelist.none());
                                title = this.parseTitle(content);
                                url = elem.select(".date").last().attr("href");
                                SolrInputDocument sid = buildSolrInputDocumentList(url, content, title, date);
                                if (sid != null && sid.size() > 0) {
                                    resultList.add(sid);
                                }
                            }
                        } else {
                            System.out.println("current Url: ---------null------------");
                        }
                    }
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return resultList;
    }
    /**
     * Collect all result body nodes from the parsed document.
     * @param doc
     * @return
     */
    private List<Element> getAllElement(Document doc) {
        List<Element> resultList = new ArrayList<Element>();
        Elements elems = doc.select(".search_feed .feed_list");
        for (Element element : elems) {
            resultList.add(element);
        }
        return resultList;
    }

    @Override
    protected boolean isCrawledUrl(String url) {
        return isAvaliableUrl(url);
    }

    /**
     * Build a title from the content: take everything up to the first punctuation mark.
     * @param htmlContent
     * @return
     */
    private String parseTitle(String htmlContent) {
        if (htmlContent == null || htmlContent.trim().equals(""))
            return null;
        String title = htmlContent;
        title = title.trim();
        for (int i = 0; i < title.length(); i++) {
            if (String.valueOf((title.charAt(i))).matches("[,.\\?\\!\\.,]")) {
                title = title.substring(0, i);
                break;
            }
        }
        return title;
    }
}
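WeiboFetcher extends an AbstractFetcher that is not listed in this post; it supplies policy, depth, sdf and the helper methods used above. Purely as orientation, here is a minimal sketch of what that base class might look like. Every name, default value and method body in it is an assumption inferred from the calls in WeiboFetcher, not the project's actual code.

import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.solr.common.SolrInputDocument;

// Hypothetical sketch of the base class, inferred from usage; not the original source.
public abstract class AbstractFetcher {

    protected KeyWordsPolicy policy = new SinaKeyWordsPolicy();  // keyword formatting strategy
    protected int depth = 5;                                     // number of result pages to crawl (assumed)
    protected SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm"); // assumed date pattern

    // URLs seen in this run; the real project probably checks its index or crawl history instead
    private final Set<String> seenUrls = new HashSet<String>();

    /** Crawl documents for all configured keywords. */
    public abstract List<SolrInputDocument> fetch();

    /** Whether this URL should be treated as already crawled. */
    protected abstract boolean isCrawledUrl(String url);

    /** Returns true when the URL is null or has been seen before, so the caller skips it. */
    protected boolean isAvaliableUrl(String url) {
        return url == null || !seenUrls.add(url);
    }

    /** Normalize the raw date string from the page into the sdf pattern. */
    protected String changeString2Date(String dateS) {
        return dateS;
    }

    /** Assemble a Solr document from the parsed fields. */
    protected SolrInputDocument buildSolrInputDocumentList(String url, String content,
            String title, Date date) {
        SolrInputDocument doc = new SolrInputDocument();
        doc.addField("url", url);
        doc.addField("title", title);
        doc.addField("content", content);
        doc.addField("date", date);
        return doc;
    }
}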
The result entity class:
public class WeiboBlock {

    private String pid;
    private String js;
    private String css;
    private String html;

    public WeiboBlock() {
    }

    public String getPid() {
        return pid;
    }

    public void setPid(String pid) {
        this.pid = pid;
    }

    public String getJs() {
        return js;
    }

    public void setJs(String js) {
        this.js = js;
    }

    public String getCss() {
        return css;
    }

    public void setCss(String css) {
        this.css = css;
    }

    public String getHtml() {
        return html;
    }

    public void setHtml(String html) {
        this.html = html;
    }
}
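As a rough illustration of how one extracted block maps onto this entity, here is a fastjson call against a simplified, made-up JSON string (a real block's html field holds the full rendered search_feed markup):

import com.alibaba.fastjson.JSON;

public class WeiboBlockDemo {
    public static void main(String[] args) {
        // Simplified illustrative block, not a captured response
        String jsonStr = "{\"pid\":\"pl_weibo_feed\",\"js\":\"\",\"css\":\"\","
                + "\"html\":\"<div class=\\\"search_feed\\\">...</div>\"}";
        WeiboBlock block = JSON.parseObject(jsonStr, WeiboBlock.class);
        System.out.println(block.getHtml()); // <div class="search_feed">...</div>
    }
}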
The keyword generation policy class:
public class SinaKeyWordsPolicy implements KeyWordsPolicy {

    @Override
    public String getKeyWord(String keyWordLine, String siteLine) {
        String keyWord = "";
        keyWordLine = keyWordLine.replaceAll("\"", "");
        keyWordLine = keyWordLine.replaceAll("AND", " ");
        keyWordLine = keyWordLine.replaceAll("OR", "|");
        if (keyWordLine.contains("|")) {
            // Keep at most three OR alternatives ("|" must be escaped when used as a split regex)
            String[] tempStrings = keyWordLine.split("\\|");
            if (tempStrings.length > 3) {
                for (int i = 0; i < 3; i++) {
                    keyWord += tempStrings[i];
                    keyWord += "|";
                }
            } else {
                keyWord = keyWordLine;
            }
        } else {
            keyWord = keyWordLine;
        }
        // Double URL-encode, which is the form the s.weibo.com search URL expects
        return java.net.URLEncoder.encode(java.net.URLEncoder.encode(keyWord));
    }
}
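For example (an illustrative call, not taken from the original project), a configured line has its quotes stripped, AND becomes a space, OR becomes a pipe, and the result is URL-encoded twice:

public class KeyWordDemo {
    public static void main(String[] args) {
        KeyWordsPolicy policy = new SinaKeyWordsPolicy();
        // "key1"AND"key2" -> key1 key2 -> key1+key2 -> key1%2Bkey2
        String keyWord = policy.getKeyWord("\"key1\"AND\"key2\"", null);
        System.out.println(keyWord); // prints key1%2Bkey2
    }
}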
The keyword configuration file is just a plain text file with one keyword group per line, in a format like the examples below (a sketch of a possible reader for it follows the examples):

"key1"
"key1"AND"key2"
"key1"AND("key2"OR"key3")
Appendix: the project source code has been cleaned up and uploaded to GitHub at https://github.com/Siriuser/WeiboCrawler. Download it from there if you need the full source.