小偷採集,有經驗的猿猿應該都會做,我藉此做一個回憶。
2013年我就職於盛大文學,當時因文學版權、流量等問題,做了一套監控系統,用來監控當時的創世、龍空、縱橫等比較知名的中文網站。對於監控,我還可以自吹一下經驗滿滿。
當下社會,正是監控系統興風作浪的大好時機,而且有利可圖。
舉例說明一下:
一、公共wifi,收集用戶的部分信息,地理座標,連接時長,搜索內容等等,通過數據分析給用戶貼上標籤[吃貨]、[美嬌娘]、[剁手黨]等等,然後將信息打包販賣,基本能賣到好幾塊一條吶。
二、通過採集獲得大量的優秀文章,將其精修後可作為一些書籍的底稿來用,而且這些底稿可以換錢滴。
三、搜索大網站的用戶信息,比如就搜索cnblogs的推薦博客博主,把他們的信息down下來,整理分析,貼上[淫才]標籤,打包販賣,至少得30塊一條吧。
四、如果能有幸通過一些高級酒店的網站,搜索到一些零碎的用戶信息,並將其整理、拼接、合成比較完整的用戶信息,這些皇冠級用戶信息拿到4S店、售樓處販賣,怎麼着一條信息也得小一百吧。
...
不用再寫下去了,監控能創造出鉅額利益。
開始教做最簡單的小偷。當然這篇小偷偷的就是cnblogs,學會了可不要亂搞,搞壞了誰負責?
java編寫,需要引入綠色框的包 mysql-connector-java-5.1.13.jar,用來連接mysql數據庫,如果採集信息不入庫,則可以不用下載此包。
jdk1.8.0_112
黃色框內是方法:
downhtml 下載HTML內容
downImages 下載圖片
GetDocument 下載博客日誌
GetList 下載精華區列表
InsertMysql 博客入庫
main 程序入口
藍色框內是正則匹配式和一些配置信息:
contectPattern 正則匹配出內容
imgPattern 正則匹配出圖片
listPattern 正則匹配出精華區列表
localPath 下載的圖片本地存放路徑
pickFormat 精華區url Formatter
webSite 你的網站
主要結構
藍色框總剖析
// 精選列表Formatter private static String pickFormat = "https://www.cnblogs.com/pick/%s/"; //獲取圖片url的正則表達式 private static Pattern imgPattern = Pattern.compile("\"(?<head>http(s|))://(?<url>[^\"]+).(?<tp>PNG|png|JPG|jpg|GIF|gif)\""); //獲取精華列表的正則表達式 private static Pattern listPattern = Pattern.compile("<div class=\"post_item_body\">\\s*<h3><a class=\"titlelnk\" href=\"(?<url>[^\"]+)\" target=\"_blank\">(?<title>[^<]+)</a></h3>\\s*<p class=\"post_item_summary\">\\s*(<a[\\s\\S]+?alt=\"\"/></a>){0,1}(?<connect>[^<]+)</p>\\s*<div class=\"post_item_foot\">\\s*<a[\\s\\S]+?<span class=\"article_comment\"><a[\\s\\S]+?class=\"gray\">(?<ping>[^<]+)</a></span><span class=\"article_view\"><a[\\s\\S]+?class=\"gray\">(?<yue>[^<]+)</a></span></div>\\s*</div>"); //獲取博客內容的正則表達式 private static Pattern contectPattern = Pattern.compile("<div id=\"cnblogs_post_body\">(?<conect>[\\s\\S]+?)</div><div id=\"MySignature\"></div>"); //網站放置下載圖片的路徑 private static String webSite = "http://www.Website.com/loadimages/"; //本地下載圖片的路徑 private static String localPath = "D:\\WWW\\loadimages\\";
DownHtml剖析
/**
 * Downloads the document at the given URL and returns its text.
 *
 * The response is decoded as UTF-8 and read line by line; lines are re-joined
 * with "\r\n". Empty lines at the very start of the document are dropped
 * because no separator is written while the result is still empty.
 *
 * @param url
 *            URL of the page to download
 * @return the text of the page
 * @throws UnsupportedEncodingException
 *             if the "utf-8" charset is not available
 * @throws IOException
 *             if the URL is malformed, the connection fails or a timeout expires
 */
static String downhtml(String url) throws UnsupportedEncodingException, IOException {
    URLConnection conn = new URL(url).openConnection();
    // Without timeouts a stalled server would block the crawler forever.
    conn.setConnectTimeout(10000);
    conn.setReadTimeout(30000);
    // try-with-resources closes the reader (and the socket) even when reading fails.
    try (BufferedReader br = new BufferedReader(new InputStreamReader(conn.getInputStream(), "utf-8"))) {
        StringBuilder html = new StringBuilder();
        String line;
        while ((line = br.readLine()) != null) {
            if (html.length() > 0) {
                html.append("\r\n");
            }
            html.append(line);
        }
        return html.toString();
    }
}
DownImages剖析
/**
 * Downloads an image from the network to a local file.
 *
 * Failures are reported on stderr and otherwise ignored (best effort), so the
 * caller cannot tell whether the download succeeded.
 *
 * @param imgUrl
 *            URL of the image
 * @param imgName
 *            path of the local file to write
 */
static void downImages(String imgUrl, String imgName) {
    System.out.println("downfile --> " + imgName + "\t" + imgUrl);
    try {
        URLConnection conn = new URL(imgUrl).openConnection();
        conn.setConnectTimeout(10000);
        // A read timeout keeps a stalled transfer from hanging the crawler.
        conn.setReadTimeout(30000);
        // Both streams are closed on every path, including mid-transfer failures.
        try (InputStream in = conn.getInputStream();
                FileOutputStream out = new FileOutputStream(imgName)) {
            byte[] buffer = new byte[1024];
            int n;
            while ((n = in.read(buffer)) != -1) {
                out.write(buffer, 0, n);
            }
        }
    } catch (IOException e) {
        // Print the real stack trace; println(e.getStackTrace()) only printed an array reference.
        System.err.println("downImages failed: " + imgUrl);
        e.printStackTrace();
    }
}
GetList剖析
/** * 下載精華區列表 */ static void GetList() { for (int i = 1; i < 80; i++) { try { Thread.sleep(7777); String url = String.format(pickFormat, i); //獲得第一條是https://www.cnblogs.com/pick/1/ 精華區列表第一頁 String html = downhtml(url);//下載精華區 Matcher listMatcher = listPattern.matcher(html);//匹配精華區列表 while (listMatcher.find()) { String title = listMatcher.group("title");//博客標題 url = listMatcher.group("url");//博客URL System.out.println(title + "\t" + url); GetDocument(url, title);//下載博客內容 Thread.sleep(7777); } } catch (Exception e) { System.err.println(e.getStackTrace()); } } }
GetDocument剖析
/** * 下載博客日誌 */ static void GetDocument(String url, String title) { try { String html = downhtml(url);//下載博客日誌 System.out.println("html.length --> " + html.length()); Matcher contectMatcher = contectPattern.matcher(html);//匹配博客內容 if (contectMatcher.find()) { String content = contectMatcher.group("conect");//得到博客內容 //圖片的url去重複 HashMap<String, String> map = new HashMap<String, String>(); Matcher imgMatcher = imgPattern.matcher(content);//匹配博客圖片url while (imgMatcher.find()) { String matVal = imgMatcher.group(); String webUrl = matVal.substring(1, matVal.length() - 1); if (map.containsKey(webUrl)) continue; String fileName = UUID.randomUUID().toString().replace("-", "") + "." + imgMatcher.group("tp");//保存本地隨機生成圖片名 String fullName = localPath + fileName;//保存圖片的全路徑 String webFileName = webSite + fileName;//博客內容須要轉換的新圖片url downImages(webUrl, fullName);//下載圖片 map.put(webUrl, webFileName); try { Thread.sleep(333); } catch (InterruptedException e) { System.err.println(e.getStackTrace()); } } //將cnblogs的圖片url替換成預設網站的url for (Entry<String, String> entry : map.entrySet()) { content = content.replace(entry.getKey(), entry.getValue()); } System.out.println("Match content --> " + content.substring(0, 50)); InsertMysql(title, content);//入庫 } } catch (IOException e) { System.err.println(e.getStackTrace()); } }
InsertMysql剖析
//博客入庫 static void InsertMysql(String title, String content) { String url = "jdbc:mysql://localhost:3306/wordpress"; String user = "root"; String password = "root"; Connection connection = null; PreparedStatement preparedStatement = null; try { Class.forName("com.mysql.jdbc.Driver"); connection = DriverManager.getConnection(url, user, password); preparedStatement = connection .prepareStatement("INSERT INTO wordpress.wp_posts " + "(post_author, " + "post_date, " + "post_date_gmt, " + "post_content, " + "post_title, " + "post_excerpt, " + "post_status, " + "comment_status, " + "ping_status, " + "post_password, " + "post_name, " + "to_ping, " + "pinged, " + "post_modified, " + "post_modified_gmt, " + "post_content_filtered, " + "post_parent, " + "guid, " + "menu_order, " + "post_type, " + "post_mime_type, " + "comment_count" + ")" + "VALUES" + "(1, " + "now(), " + "now(), " + "?, " + "?, " + "'', " + "'publish', " + "'open', " + "'open', " + "'', " + "'', " + "'', " + "'', " + "now(), " + "now(), " + "'', " + "0, " + "'', " + "0, " + "'post', " + "'', " + "0" + ");"); preparedStatement.setString(1, content); preparedStatement.setString(2, title); System.out.println(preparedStatement.executeUpdate() + " " + title); } catch (Exception e) { System.err.println(e.getStackTrace()); } finally { if (preparedStatement != null) try { preparedStatement.close(); } catch (SQLException e) { System.err.println(e.getStackTrace()); } if (connection != null) try { connection.close(); } catch (SQLException e) { System.err.println(e.getStackTrace()); } } }
完整代碼(本人喜歡隨性亂塗,代碼基本無註釋,抱歉)
import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLConnection;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.HashMap;
import java.util.Map.Entry;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Crawls the cnblogs featured-posts ("pick") list, mirrors the images of each
 * post to a local directory and stores the posts in a WordPress database.
 */
public class appCnblogsCollect {

    // Format string for the "pick" (featured posts) list pages; %s is replaced by the page number.
    private static final String pickFormat = "https://www.cnblogs.com/pick/%s/";

    // Matches a double-quoted http/https image URL ending in .png/.jpg/.gif (upper- or lower-case).
    // Named groups: head = scheme, url = host and path without the extension, tp = extension.
    // The dot before the extension is escaped so that only a literal '.' is accepted there.
    private static final Pattern imgPattern = Pattern.compile(
            "\"(?<head>https?)://(?<url>[^\"]+)\\.(?<tp>PNG|png|JPG|jpg|GIF|gif)\"");

    // Matches one entry of the featured-posts list page.
    // Named groups: url = post URL, title = post title, connect = summary text,
    // ping = link text inside the "article_comment" span, yue = link text inside the "article_view" span.
    private static final Pattern listPattern = Pattern.compile(
            "<div class=\"post_item_body\">\\s*<h3><a class=\"titlelnk\" href=\"(?<url>[^\"]+)\" target=\"_blank\">(?<title>[^<]+)</a></h3>"
            + "\\s*<p class=\"post_item_summary\">\\s*(<a[\\s\\S]+?alt=\"\"/></a>){0,1}(?<connect>[^<]+)</p>"
            + "\\s*<div class=\"post_item_foot\">\\s*<a[\\s\\S]+?<span class=\"article_comment\"><a[\\s\\S]+?class=\"gray\">(?<ping>[^<]+)</a></span>"
            + "<span class=\"article_view\"><a[\\s\\S]+?class=\"gray\">(?<yue>[^<]+)</a></span></div>\\s*</div>");

    // Matches the post body: everything between the "cnblogs_post_body" div and the "MySignature" div
    // (named group "conect").
    private static final Pattern contectPattern = Pattern.compile(
            "<div id=\"cnblogs_post_body\">(?<conect>[\\s\\S]+?)</div><div id=\"MySignature\"></div>");

    // URL prefix under which the downloaded images are published on your own site.
    private static final String webSite = "http://www.Website.com/loadimages/";

    // Local directory prefix the downloaded images are written to.
    // NOTE(review): the directory is never created by this code, so it presumably has to exist already.
    private static final String localPath = "D:\\WWW\\loadimages\\";

    /** Program entry point: runs the whole crawl. */
    public static void main(String[] args) throws Exception {
        GetList();
    }

    /**
     * Crawls the featured-posts list pages 1..79 and downloads every post found.
     *
     * The method sleeps 7777 ms before each list page and after each post.
     * A failure on one page or one post is reported and the crawl continues;
     * an interrupt stops the crawl and leaves the thread's interrupt flag set.
     */
    static void GetList() {
        for (int page = 1; page < 80; page++) {
            try {
                Thread.sleep(7777);
                // Page 1 yields https://www.cnblogs.com/pick/1/, the first featured-posts page.
                String listUrl = String.format(pickFormat, page);
                String html = downhtml(listUrl);
                Matcher listMatcher = listPattern.matcher(html);
                while (listMatcher.find()) {
                    String title = listMatcher.group("title");
                    String postUrl = listMatcher.group("url");
                    System.out.println(title + "\t" + postUrl);
                    try {
                        GetDocument(postUrl, title);
                    } catch (RuntimeException e) {
                        // One broken post must not abort the remaining posts of this page.
                        System.err.println("GetDocument failed: " + postUrl);
                        e.printStackTrace();
                    }
                    Thread.sleep(7777);
                }
            } catch (InterruptedException e) {
                // Restore the flag and stop instead of silently swallowing the interrupt.
                Thread.currentThread().interrupt();
                return;
            } catch (Exception e) {
                System.err.println("list page " + page + " failed");
                e.printStackTrace();
            }
        }
    }

    /**
     * Downloads one blog post, mirrors its images and stores the post in the database.
     *
     * The post body is extracted with contectPattern. Every distinct image URL
     * found by imgPattern is downloaded once to localPath under a random name, and
     * its occurrences in the body are replaced with the matching webSite URL.
     * Nothing is stored when the page does not contain a post body.
     *
     * @param url
     *            URL of the blog post
     * @param title
     *            title of the blog post
     */
    static void GetDocument(String url, String title) {
        try {
            String html = downhtml(url);
            System.out.println("html.length --> " + html.length());
            Matcher contectMatcher = contectPattern.matcher(html);
            if (!contectMatcher.find()) {
                return;
            }
            String content = contectMatcher.group("conect");

            // original image URL -> URL on our own site; also de-duplicates repeated images
            HashMap<String, String> rewritten = new HashMap<String, String>();
            Matcher imgMatcher = imgPattern.matcher(content);
            while (imgMatcher.find()) {
                String quoted = imgMatcher.group();
                // Strip the surrounding double quotes that are part of the match.
                String webUrl = quoted.substring(1, quoted.length() - 1);
                if (rewritten.containsKey(webUrl)) {
                    continue;
                }
                // Random local file name, keeping the original extension.
                String fileName = UUID.randomUUID().toString().replace("-", "") + "." + imgMatcher.group("tp");
                String fullName = localPath + fileName;
                String webFileName = webSite + fileName;
                downImages(webUrl, fullName);
                rewritten.put(webUrl, webFileName);
                try {
                    Thread.sleep(333);
                } catch (InterruptedException e) {
                    // Keep the interrupt visible to the caller and stop fetching further images.
                    Thread.currentThread().interrupt();
                    break;
                }
            }

            // Point the image URLs in the body at the mirrored copies.
            for (Entry<String, String> entry : rewritten.entrySet()) {
                content = content.replace(entry.getKey(), entry.getValue());
            }
            // Preview at most 50 characters; bodies shorter than 50 used to throw here.
            System.out.println("Match content --> " + content.substring(0, Math.min(50, content.length())));
            InsertMysql(title, content);
        } catch (IOException e) {
            System.err.println("GetDocument failed: " + url);
            e.printStackTrace();
        }
    }

    /**
     * Downloads the document at the given URL and returns its text.
     *
     * The response is decoded as UTF-8 and read line by line; lines are re-joined
     * with "\r\n". Empty lines at the very start of the document are dropped
     * because no separator is written while the result is still empty.
     *
     * @param url
     *            URL of the page to download
     * @return the text of the page
     * @throws UnsupportedEncodingException
     *             if the "utf-8" charset is not available
     * @throws IOException
     *             if the URL is malformed, the connection fails or a timeout expires
     */
    static String downhtml(String url) throws UnsupportedEncodingException, IOException {
        URLConnection conn = new URL(url).openConnection();
        // Without timeouts a stalled server would block the crawler forever.
        conn.setConnectTimeout(10000);
        conn.setReadTimeout(30000);
        // try-with-resources closes the reader (and the socket) even when reading fails.
        try (BufferedReader br = new BufferedReader(new InputStreamReader(conn.getInputStream(), "utf-8"))) {
            StringBuilder html = new StringBuilder();
            String line;
            while ((line = br.readLine()) != null) {
                if (html.length() > 0) {
                    html.append("\r\n");
                }
                html.append(line);
            }
            return html.toString();
        }
    }

    /**
     * Downloads an image from the network to a local file.
     *
     * Failures are reported on stderr and otherwise ignored (best effort), so the
     * caller cannot tell whether the download succeeded.
     *
     * @param imgUrl
     *            URL of the image
     * @param imgName
     *            path of the local file to write
     */
    static void downImages(String imgUrl, String imgName) {
        System.out.println("downfile --> " + imgName + "\t" + imgUrl);
        try {
            URLConnection conn = new URL(imgUrl).openConnection();
            conn.setConnectTimeout(10000);
            // A read timeout keeps a stalled transfer from hanging the crawler.
            conn.setReadTimeout(30000);
            // Both streams are closed on every path, including mid-transfer failures.
            try (InputStream in = conn.getInputStream();
                    FileOutputStream out = new FileOutputStream(imgName)) {
                byte[] buffer = new byte[1024];
                int n;
                while ((n = in.read(buffer)) != -1) {
                    out.write(buffer, 0, n);
                }
            }
        } catch (IOException e) {
            // Print the real stack trace; println(e.getStackTrace()) only printed an array reference.
            System.err.println("downImages failed: " + imgUrl);
            e.printStackTrace();
        }
    }

    /**
     * Stores one blog post as a published row in the WordPress wp_posts table.
     *
     * Connection settings default to the local "wordpress" database with user and
     * password "root"; they can be overridden with the system properties
     * cnblogs.db.url, cnblogs.db.user and cnblogs.db.password. Failures are
     * reported on stderr and otherwise ignored.
     *
     * @param title
     *            post title
     * @param content
     *            post body (HTML)
     */
    static void InsertMysql(String title, String content) {
        String url = System.getProperty("cnblogs.db.url", "jdbc:mysql://localhost:3306/wordpress");
        String user = System.getProperty("cnblogs.db.user", "root");
        String password = System.getProperty("cnblogs.db.password", "root");
        // The *_gmt columns use utc_timestamp(); now() would store local time in them.
        String sql = "INSERT INTO wordpress.wp_posts "
                + "(post_author, post_date, post_date_gmt, post_content, post_title, post_excerpt, "
                + "post_status, comment_status, ping_status, post_password, post_name, to_ping, pinged, "
                + "post_modified, post_modified_gmt, post_content_filtered, post_parent, guid, "
                + "menu_order, post_type, post_mime_type, comment_count)"
                + "VALUES"
                + "(1, now(), utc_timestamp(), ?, ?, '', "
                + "'publish', 'open', 'open', '', '', '', '', "
                + "now(), utc_timestamp(), '', 0, '', "
                + "0, 'post', '', 0);";
        try {
            Class.forName("com.mysql.jdbc.Driver");
            // try-with-resources closes the statement and then the connection on every path.
            try (Connection connection = DriverManager.getConnection(url, user, password);
                    PreparedStatement preparedStatement = connection.prepareStatement(sql)) {
                preparedStatement.setString(1, content);
                preparedStatement.setString(2, title);
                System.out.println(preparedStatement.executeUpdate() + " " + title);
            }
        } catch (ClassNotFoundException | SQLException e) {
            System.err.println("InsertMysql failed: " + title);
            e.printStackTrace();
        }
    }
}
以上是一個不需要用戶身份驗證的例子,在某些特殊的情況下,需要身份驗證怎麼辦?不要着急,我下面還有一個例子,下面的例子是年底統計全年工作日報,而且這個工作日報還影響績效和考核,可是,我一年沒填了,從頭寫是不可能了,只能做個小工具,於是就有了它,隨便亂彈的,看着醜就醜吧。
package ebooks; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Calendar; import java.util.List; import org.apache.http.HttpResponse; import org.apache.http.NameValuePair; import org.apache.http.client.entity.UrlEncodedFormEntity; import org.apache.http.client.methods.HttpPost; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.impl.conn.PoolingClientConnectionManager; import org.apache.http.message.BasicNameValuePair; public class appTimeSheet { public static void main(String[] args) throws Exception { String username = "username"; String password = "yourpassword"; SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd hh:mm:ss"); String urlLogin = "http://10.54.11.37:8989/login"; String urlSave = "http://10.54.11.37:8989/agenda/submission/saveTimereg.json"; DefaultHttpClient client = new DefaultHttpClient(new PoolingClientConnectionManager()); HttpPost loginHttpPost = new HttpPost(urlLogin); List<NameValuePair> loginPairs = new ArrayList<NameValuePair>(); loginPairs.add(new BasicNameValuePair("username", username)); loginPairs.add(new BasicNameValuePair("password", password)); loginHttpPost.setEntity(new UrlEncodedFormEntity(loginPairs, "utf-8")); HttpResponse response = client.execute(loginHttpPost); System.out.println(response.getStatusLine()); Calendar calendar = Calendar.getInstance(); calendar.clear(); calendar.set(2017, 11, 24); long end = calendar.getTimeInMillis(); calendar.clear(); calendar.set(2017, 11, 4); for (; calendar.getTimeInMillis() < end; calendar.add(Calendar.DATE, 1)) { if (calendar.get(Calendar.DAY_OF_WEEK) == Calendar.SUNDAY || calendar.get(Calendar.DAY_OF_WEEK) == Calendar.SATURDAY) { continue; } HttpPost savePost = new HttpPost(urlSave); List<NameValuePair> savePairs = new ArrayList<NameValuePair>(); savePairs.add(new BasicNameValuePair("timereg", "監控室")); savePairs.add(new BasicNameValuePair("timereg", "研發項目")); savePairs.add(new BasicNameValuePair("timereg", 
"雲平臺智能服務技術的研究和應用")); savePairs.add(new BasicNameValuePair("timereg", "所有(共享類)")); switch (calendar.get(Calendar.DAY_OF_WEEK)) { case Calendar.MONDAY: savePairs.add(new BasicNameValuePair("timereg", "8,0,0,0,0,0,0")); System.out.println("星期一"); break; case Calendar.TUESDAY: savePairs.add(new BasicNameValuePair("timereg", "0,8,0,0,0,0,0")); System.out.println("星期二"); break; case Calendar.WEDNESDAY: savePairs.add(new BasicNameValuePair("timereg", "0,0,8,0,0,0,0")); System.out.println("星期三"); break; case Calendar.THURSDAY: savePairs.add(new BasicNameValuePair("timereg", "0,0,0,8,0,0,0")); System.out.println("星期四"); break; case Calendar.FRIDAY: savePairs.add(new BasicNameValuePair("timereg", "0,0,0,0,8,0,0")); System.out.println("星期五"); break; } savePairs.add(new BasicNameValuePair("timereg", "監控系統開發與應用")); savePairs.add(new BasicNameValuePair("timereg", "8")); savePairs .add(new BasicNameValuePair("theweek", Integer.toString(calendar.get(Calendar.WEEK_OF_YEAR) + 1))); System.out.println("第" + (calendar.get(Calendar.WEEK_OF_YEAR) + 1) + "周"); savePairs.add(new BasicNameValuePair("starttime", simpleDateFormat.format(calendar.getTime()))); System.out.println("starttime=" + simpleDateFormat.format(calendar.getTime())); savePost.setEntity(new UrlEncodedFormEntity(savePairs, "utf-8")); HttpResponse saveResponse = client.execute(savePost); System.out.println(saveResponse.getStatusLine()); System.out.println(); Thread.sleep(7777); } } }