手把手教作小偷採集

  小偷採集,有經驗的猿猿應該都會做,我藉此做一個回顧。

  2013年我就職於盛大文學,當時因文學版權、流量等問題,做了一套監控系統,用來監控當時的創世、龍空、縱橫等比較知名的中文網站。對於監控,我還可以自吹一下經驗滿滿。

  當下社會,正是監控系統興風作浪的大好時機,並且有利可圖。

  舉例說明一下:

  一、公共wifi,收集用戶的部分信息,地理座標,鏈接時長,搜索內容等等,經過數據分析給用戶貼上標籤[吃貨]、[美嬌娘]、[剁手黨] 等等,而後將信息打包販賣,基本能買到好幾塊一條吶。正則表達式

  二、經過採集得到大量的優秀文章,將其精修後可做爲一些書籍的底稿來用,並且這些底稿能夠換錢滴。sql

  三、搜索大網站的用戶信息,好比就搜索cnblogs的推薦博客博主,把他們的信息down下來,整理分析,貼上[淫才]標籤,打包販賣,至少得30塊一條吧。數據庫

  四、若是能有幸經過一些高級酒店的網站,搜索到一些零碎的用戶信息,並將其整理、拼接、合成比較完整的用戶信息,這些皇冠級用戶信息拿到4S、售樓處販賣,怎麼着一條信息也得小一百吧。apache

...

  不用再寫下去了,監控能創造出鉅額利益。

  開始教做最簡單的小偷。當然這篇小偷偷的就是cnblogs,學會了可不要亂搞,搞壞了誰負責?

Java編寫,需要引入綠色框的包 mysql-connector-java-5.1.13.jar,用來連接MySQL數據庫;如果採集信息不入庫,則可以不用下載此包。
jdk1.8.0_112

黃色框內是方法:

downhtml 下載HTML內容

downImages 下載圖片

GetDocument 下載博客日誌

GetList 下載精華區列表

InsertMysql 博客入庫

main 程序入口

 

藍色框內是正則匹配式和一些配置信息:

contectPattern 正則匹配出內容

imgPattern 正則匹配出圖片

listPattern 正則匹配出精華區列表

localPath 下載的圖片本地存放路徑

pickFormat 精華區url Formatter

webSite 你的網站

 

主要結構

藍色框總剖析

    // Format string for the "pick" (featured posts) list pages; %s is replaced by the page number.
    private static String pickFormat = "https://www.cnblogs.com/pick/%s/";
    
    // Matches a double-quoted http/https URL that ends in a png/jpg/gif extension.
    // Named groups: head = scheme, url = host and path without the extension, tp = extension.
    // The dot before the extension is escaped so that only a literal '.' is accepted there
    // (an unescaped '.' would also match URLs such as ".../apng").
    private static Pattern imgPattern = Pattern.compile("\"(?<head>http(s|))://(?<url>[^\"]+)\\.(?<tp>PNG|png|JPG|jpg|GIF|gif)\"");
    
    // Matches one entry of the featured-posts list page.
    // Named groups: url = post link, title = post title, connect = summary text,
    // ping = text of the article_comment link, yue = text of the article_view link.
    private static Pattern listPattern = Pattern.compile("<div class=\"post_item_body\">\\s*<h3><a class=\"titlelnk\" href=\"(?<url>[^\"]+)\" target=\"_blank\">(?<title>[^<]+)</a></h3>\\s*<p class=\"post_item_summary\">\\s*(<a[\\s\\S]+?alt=\"\"/></a>){0,1}(?<connect>[^<]+)</p>\\s*<div class=\"post_item_foot\">\\s*<a[\\s\\S]+?<span class=\"article_comment\"><a[\\s\\S]+?class=\"gray\">(?<ping>[^<]+)</a></span><span class=\"article_view\"><a[\\s\\S]+?class=\"gray\">(?<yue>[^<]+)</a></span></div>\\s*</div>");

    // Matches the post body: everything inside the cnblogs_post_body div up to the
    // (empty) MySignature div. Named group: conect = the body HTML.
    private static Pattern contectPattern = Pattern.compile("<div id=\"cnblogs_post_body\">(?<conect>[\\s\\S]+?)</div><div id=\"MySignature\"></div>");

    // URL prefix that replaces the original image URLs inside the stored post content.
    private static String webSite = "http://www.Website.com/loadimages/";
    
    // Local directory (with trailing separator) where downloaded images are written.
    private static String localPath = "D:\\WWW\\loadimages\\";

 

DownHtml剖析

    /**
     * Downloads the document at the given URL and returns it as text.
     *
     * The response is decoded as UTF-8 and its lines are re-joined with "\r\n";
     * no trailing line separator is appended. The stream is always closed, even
     * when reading fails part-way through.
     *
     * @param  url
     *         URL of the page to download
     * @return the page content
     * @throws IOException if the URL cannot be opened or read
     */
    static String downhtml(String url) throws UnsupportedEncodingException, IOException {
        StringBuilder html = new StringBuilder();
        try (InputStream in = new URL(url).openStream();
                BufferedReader reader = new BufferedReader(new InputStreamReader(in, "utf-8"))) {
            // An explicit flag (rather than html.length() == 0) keeps the separators
            // of leading blank lines instead of silently dropping them.
            boolean firstLine = true;
            String line;
            while ((line = reader.readLine()) != null) {
                if (!firstLine) {
                    html.append("\r\n");
                }
                html.append(line);
                firstLine = false;
            }
        }
        return html.toString();
    }

 

DownImages剖析

    /**
     * Downloads a remote image and stores it in a local file.
     *
     * Failures are reported on stderr and otherwise ignored (best effort), so a
     * single broken image does not stop the caller.
     *
     * @param  imgUrl
     *         URL of the image
     * @param  imgName
     *         full path of the local file to write
     */
    static void downImages(String imgUrl, String imgName) {
        System.out.println("downfile --> " + imgName + "\t" + imgUrl);
        try {
            URLConnection conn = new URL(imgUrl).openConnection();
            conn.setConnectTimeout(10000);
            // try-with-resources closes both streams on every path; previously the
            // input stream was never closed and the file stream leaked on errors.
            try (InputStream in = conn.getInputStream();
                    FileOutputStream out = new FileOutputStream(imgName)) {
                byte[] buffer = new byte[8192];
                int count;
                while ((count = in.read(buffer)) != -1) {
                    out.write(buffer, 0, count);
                }
            }
        } catch (IOException e) {
            // printStackTrace shows the actual error; printing getStackTrace()
            // only prints the identity of the stack-trace array.
            e.printStackTrace();
        }
    }

 

GetList剖析

    /**
     * Walks the featured-posts list pages 1 to 79 and downloads every post found.
     *
     * A pause of 7777 ms is made before each list page and after each post. A
     * failure on one page is logged and the crawl continues with the next page;
     * an interrupt stops the crawl.
     */
    static void GetList() {
        for (int page = 1; page < 80; page++) {
            try {
                Thread.sleep(7777);
                // Page 1 yields https://www.cnblogs.com/pick/1/
                String listUrl = String.format(pickFormat, page);
                String html = downhtml(listUrl);
                Matcher listMatcher = listPattern.matcher(html);
                while (listMatcher.find()) {
                    String title = listMatcher.group("title");
                    String postUrl = listMatcher.group("url");
                    System.out.println(title + "\t" + postUrl);
                    GetDocument(postUrl, title);
                    Thread.sleep(7777);
                }
            } catch (InterruptedException e) {
                // Restore the flag and stop instead of swallowing the interrupt.
                Thread.currentThread().interrupt();
                return;
            } catch (IOException | RuntimeException e) {
                // Best effort: report the failure and move on to the next page.
                e.printStackTrace();
            }
        }
    }


GetDocument剖析

    /**
     * Downloads one blog post, mirrors its images and stores the post.
     *
     * The post body is extracted with contectPattern; every distinct image URL in
     * it is downloaded to localPath under a random name and replaced in the body
     * by the matching webSite URL; the result is passed to InsertMysql. Nothing is
     * stored when the body cannot be found or the thread is interrupted.
     *
     * @param  url
     *         URL of the blog post
     * @param  title
     *         title of the blog post
     */
    static void GetDocument(String url, String title) {
        try {
            String html = downhtml(url);
            System.out.println("html.length --> " + html.length());
            Matcher contectMatcher = contectPattern.matcher(html);
            if (!contectMatcher.find()) {
                return;
            }
            String content = contectMatcher.group("conect");

            // original image URL -> replacement URL; also used to skip duplicates
            HashMap<String, String> map = new HashMap<String, String>();
            Matcher imgMatcher = imgPattern.matcher(content);
            while (imgMatcher.find()) {
                String matVal = imgMatcher.group();
                // strip the surrounding double quotes
                String webUrl = matVal.substring(1, matVal.length() - 1);
                if (map.containsKey(webUrl)) {
                    continue;
                }

                // random local file name that keeps the original extension
                String fileName = UUID.randomUUID().toString().replace("-", "") + "." + imgMatcher.group("tp");
                String fullName = localPath + fileName;
                String webFileName = webSite + fileName;
                downImages(webUrl, fullName);
                map.put(webUrl, webFileName);
                try {
                    Thread.sleep(333);
                } catch (InterruptedException e) {
                    // Restore the flag and abandon this post rather than storing
                    // content whose images were only partially processed.
                    Thread.currentThread().interrupt();
                    return;
                }
            }

            // point the image URLs in the body at the mirrored copies
            for (Entry<String, String> entry : map.entrySet()) {
                content = content.replace(entry.getKey(), entry.getValue());
            }

            // Clamp the preview length: substring(0, 50) throws for short bodies,
            // which used to skip the insert and abort the caller's list page.
            System.out.println("Match content --> " + content.substring(0, Math.min(50, content.length())));

            InsertMysql(title, content);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

 

InsertMysql剖析

        //博客入庫
    /**
     * Inserts a post into the wordpress.wp_posts table as a published post.
     *
     * Only the title and content are bound as parameters; every other column is
     * filled with a fixed value. Errors are reported on stderr and otherwise
     * ignored (best effort).
     *
     * @param  title
     *         post title
     * @param  content
     *         post content (HTML)
     */
    static void InsertMysql(String title, String content) {
        String url = "jdbc:mysql://localhost:3306/wordpress";
        String user = "root";
        String password = "root";

        String sql = "INSERT INTO wordpress.wp_posts "
                + "(post_author, post_date, post_date_gmt, post_content, post_title, post_excerpt, "
                + "post_status, comment_status, ping_status, post_password, post_name, to_ping, pinged, "
                + "post_modified, post_modified_gmt, post_content_filtered, post_parent, guid, "
                + "menu_order, post_type, post_mime_type, comment_count) "
                + "VALUES "
                + "(1, now(), now(), ?, ?, '', "
                + "'publish', 'open', 'open', '', '', '', '', "
                + "now(), now(), '', 0, '', "
                + "0, 'post', '', 0);";

        try {
            Class.forName("com.mysql.jdbc.Driver");
            // try-with-resources closes the statement and then the connection on every path
            try (Connection connection = DriverManager.getConnection(url, user, password);
                    PreparedStatement statement = connection.prepareStatement(sql)) {
                statement.setString(1, content);
                statement.setString(2, title);
                System.out.println(statement.executeUpdate() + " " + title);
            }
        } catch (Exception e) {
            // printStackTrace shows the actual error; printing getStackTrace()
            // only prints the identity of the stack-trace array.
            e.printStackTrace();
        }
    }

 

完整代碼(本人喜歡隨性亂塗,代碼基本無註釋,抱歉)

import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLConnection;
import java.util.HashMap;
import java.util.Map.Entry;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;

/**
 * Minimal crawler for the cnblogs featured-posts ("pick") listing.
 *
 * It walks list pages 1 to 79, downloads every linked post, mirrors the post's
 * images into {@code localPath}, rewrites the image URLs to point at
 * {@code webSite}, and inserts the result into the wordpress.wp_posts table.
 */
public class appCnblogsCollect {

    // Format string for the "pick" list pages; %s is replaced by the page number.
    private static final String pickFormat = "https://www.cnblogs.com/pick/%s/";

    // Matches a double-quoted http/https URL that ends in a png/jpg/gif extension.
    // Named groups: head = scheme, url = host and path without the extension, tp = extension.
    // The dot before the extension is escaped so that only a literal '.' is accepted there.
    private static final Pattern imgPattern = Pattern.compile("\"(?<head>http(s|))://(?<url>[^\"]+)\\.(?<tp>PNG|png|JPG|jpg|GIF|gif)\"");

    // Matches one entry of the featured-posts list page.
    // Named groups: url = post link, title = post title, connect = summary text,
    // ping = text of the article_comment link, yue = text of the article_view link.
    private static final Pattern listPattern = Pattern.compile("<div class=\"post_item_body\">\\s*<h3><a class=\"titlelnk\" href=\"(?<url>[^\"]+)\" target=\"_blank\">(?<title>[^<]+)</a></h3>\\s*<p class=\"post_item_summary\">\\s*(<a[\\s\\S]+?alt=\"\"/></a>){0,1}(?<connect>[^<]+)</p>\\s*<div class=\"post_item_foot\">\\s*<a[\\s\\S]+?<span class=\"article_comment\"><a[\\s\\S]+?class=\"gray\">(?<ping>[^<]+)</a></span><span class=\"article_view\"><a[\\s\\S]+?class=\"gray\">(?<yue>[^<]+)</a></span></div>\\s*</div>");

    // Matches the post body: everything inside the cnblogs_post_body div up to the
    // (empty) MySignature div. Named group: conect = the body HTML.
    private static final Pattern contectPattern = Pattern.compile("<div id=\"cnblogs_post_body\">(?<conect>[\\s\\S]+?)</div><div id=\"MySignature\"></div>");

    // URL prefix that replaces the original image URLs inside the stored post content.
    private static final String webSite = "http://www.Website.com/loadimages/";

    // Local directory (with trailing separator) where downloaded images are written.
    private static final String localPath = "D:\\WWW\\loadimages\\";

    // Insert statement for wp_posts; parameter 1 is the content, parameter 2 the title.
    private static final String INSERT_POST_SQL = "INSERT INTO wordpress.wp_posts "
            + "(post_author, post_date, post_date_gmt, post_content, post_title, post_excerpt, "
            + "post_status, comment_status, ping_status, post_password, post_name, to_ping, pinged, "
            + "post_modified, post_modified_gmt, post_content_filtered, post_parent, guid, "
            + "menu_order, post_type, post_mime_type, comment_count) "
            + "VALUES "
            + "(1, now(), now(), ?, ?, '', "
            + "'publish', 'open', 'open', '', '', '', '', "
            + "now(), now(), '', 0, '', "
            + "0, 'post', '', 0);";

    /** Program entry point: runs the whole crawl. */
    public static void main(String[] args) throws Exception {
        GetList();
    }

    /**
     * Walks the featured-posts list pages 1 to 79 and downloads every post found.
     *
     * A pause of 7777 ms is made before each list page and after each post. A
     * failure on one page is logged and the crawl continues with the next page;
     * an interrupt stops the crawl.
     */
    static void GetList() {
        for (int page = 1; page < 80; page++) {
            try {
                Thread.sleep(7777);
                // Page 1 yields https://www.cnblogs.com/pick/1/
                String listUrl = String.format(pickFormat, page);
                String html = downhtml(listUrl);
                Matcher listMatcher = listPattern.matcher(html);
                while (listMatcher.find()) {
                    String title = listMatcher.group("title");
                    String postUrl = listMatcher.group("url");
                    System.out.println(title + "\t" + postUrl);
                    GetDocument(postUrl, title);
                    Thread.sleep(7777);
                }
            } catch (InterruptedException e) {
                // Restore the flag and stop instead of swallowing the interrupt.
                Thread.currentThread().interrupt();
                return;
            } catch (IOException | RuntimeException e) {
                // Best effort: report the failure and move on to the next page.
                e.printStackTrace();
            }
        }
    }

    /**
     * Downloads one blog post, mirrors its images and stores the post.
     *
     * The post body is extracted with contectPattern; every distinct image URL in
     * it is downloaded to localPath under a random name and replaced in the body
     * by the matching webSite URL; the result is passed to InsertMysql. Nothing is
     * stored when the body cannot be found or the thread is interrupted.
     *
     * @param  url
     *         URL of the blog post
     * @param  title
     *         title of the blog post
     */
    static void GetDocument(String url, String title) {
        try {
            String html = downhtml(url);
            System.out.println("html.length --> " + html.length());
            Matcher contectMatcher = contectPattern.matcher(html);
            if (!contectMatcher.find()) {
                return;
            }
            String content = contectMatcher.group("conect");

            // original image URL -> replacement URL; also used to skip duplicates
            HashMap<String, String> map = new HashMap<String, String>();
            Matcher imgMatcher = imgPattern.matcher(content);
            while (imgMatcher.find()) {
                String matVal = imgMatcher.group();
                // strip the surrounding double quotes
                String webUrl = matVal.substring(1, matVal.length() - 1);
                if (map.containsKey(webUrl)) {
                    continue;
                }

                // random local file name that keeps the original extension
                String fileName = UUID.randomUUID().toString().replace("-", "") + "." + imgMatcher.group("tp");
                String fullName = localPath + fileName;
                String webFileName = webSite + fileName;
                downImages(webUrl, fullName);
                map.put(webUrl, webFileName);
                try {
                    Thread.sleep(333);
                } catch (InterruptedException e) {
                    // Restore the flag and abandon this post rather than storing
                    // content whose images were only partially processed.
                    Thread.currentThread().interrupt();
                    return;
                }
            }

            // point the image URLs in the body at the mirrored copies
            for (Entry<String, String> entry : map.entrySet()) {
                content = content.replace(entry.getKey(), entry.getValue());
            }

            // Clamp the preview length: substring(0, 50) throws for short bodies,
            // which used to skip the insert and abort the caller's list page.
            System.out.println("Match content --> " + content.substring(0, Math.min(50, content.length())));

            InsertMysql(title, content);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads the document at the given URL and returns it as text.
     *
     * The response is decoded as UTF-8 and its lines are re-joined with "\r\n";
     * no trailing line separator is appended. The stream is always closed, even
     * when reading fails part-way through.
     *
     * @param  url
     *         URL of the page to download
     * @return the page content
     * @throws IOException if the URL cannot be opened or read
     */
    static String downhtml(String url) throws UnsupportedEncodingException, IOException {
        StringBuilder html = new StringBuilder();
        try (InputStream in = new URL(url).openStream();
                BufferedReader reader = new BufferedReader(new InputStreamReader(in, "utf-8"))) {
            // An explicit flag (rather than html.length() == 0) keeps the separators
            // of leading blank lines instead of silently dropping them.
            boolean firstLine = true;
            String line;
            while ((line = reader.readLine()) != null) {
                if (!firstLine) {
                    html.append("\r\n");
                }
                html.append(line);
                firstLine = false;
            }
        }
        return html.toString();
    }

    /**
     * Downloads a remote image and stores it in a local file.
     *
     * Failures are reported on stderr and otherwise ignored (best effort), so a
     * single broken image does not stop the caller.
     *
     * @param  imgUrl
     *         URL of the image
     * @param  imgName
     *         full path of the local file to write
     */
    static void downImages(String imgUrl, String imgName) {
        System.out.println("downfile --> " + imgName + "\t" + imgUrl);
        try {
            URLConnection conn = new URL(imgUrl).openConnection();
            conn.setConnectTimeout(10000);
            // try-with-resources closes both streams on every path; previously the
            // input stream was never closed and the file stream leaked on errors.
            try (InputStream in = conn.getInputStream();
                    FileOutputStream out = new FileOutputStream(imgName)) {
                byte[] buffer = new byte[8192];
                int count;
                while ((count = in.read(buffer)) != -1) {
                    out.write(buffer, 0, count);
                }
            }
        } catch (IOException e) {
            // printStackTrace shows the actual error; printing getStackTrace()
            // only prints the identity of the stack-trace array.
            e.printStackTrace();
        }
    }

    /**
     * Inserts a post into the wordpress.wp_posts table as a published post.
     *
     * Only the title and content are bound as parameters; every other column is
     * filled with a fixed value. Errors are reported on stderr and otherwise
     * ignored (best effort).
     *
     * @param  title
     *         post title
     * @param  content
     *         post content (HTML)
     */
    static void InsertMysql(String title, String content) {
        String url = "jdbc:mysql://localhost:3306/wordpress";
        String user = "root";
        String password = "root";

        try {
            Class.forName("com.mysql.jdbc.Driver");
            // try-with-resources closes the statement and then the connection on every path
            try (Connection connection = DriverManager.getConnection(url, user, password);
                    PreparedStatement statement = connection.prepareStatement(INSERT_POST_SQL)) {
                statement.setString(1, content);
                statement.setString(2, title);
                System.out.println(statement.executeUpdate() + " " + title);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
完整代碼


以上是一個不需要用戶身份驗證的例子,在某些特殊的情況下,需要身份驗證怎麼辦?不要着急,我下面還有一個例子,下面的例子是年底統計整年工作日報,而且這個工作日報還影響績效和考覈,可是,我一年沒填了,從頭寫是不可能了,只能做個小工具,於是就有了它,隨便亂彈的,看着醜就醜吧。

package ebooks;

import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.List;

import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.PoolingClientConnectionManager;
import org.apache.http.message.BasicNameValuePair;

public class appTimeSheet {
    public static void main(String[] args) throws Exception {
        String username = "username";
        String password = "yourpassword";
        SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd hh:mm:ss");
        String urlLogin = "http://10.54.11.37:8989/login";
        String urlSave = "http://10.54.11.37:8989/agenda/submission/saveTimereg.json";
        DefaultHttpClient client = new DefaultHttpClient(new PoolingClientConnectionManager());
        HttpPost loginHttpPost = new HttpPost(urlLogin);
        List<NameValuePair> loginPairs = new ArrayList<NameValuePair>();
        loginPairs.add(new BasicNameValuePair("username", username));
        loginPairs.add(new BasicNameValuePair("password", password));
        loginHttpPost.setEntity(new UrlEncodedFormEntity(loginPairs, "utf-8"));
        HttpResponse response = client.execute(loginHttpPost);
        System.out.println(response.getStatusLine());

        Calendar calendar = Calendar.getInstance();
        calendar.clear();
        calendar.set(2017, 11, 24);
        long end = calendar.getTimeInMillis();
        calendar.clear();
        calendar.set(2017, 11, 4);
        
        for (; calendar.getTimeInMillis() < end; calendar.add(Calendar.DATE, 1)) {
            if (calendar.get(Calendar.DAY_OF_WEEK) == Calendar.SUNDAY
                    || calendar.get(Calendar.DAY_OF_WEEK) == Calendar.SATURDAY) {
                continue;
            }

            HttpPost savePost = new HttpPost(urlSave);
            List<NameValuePair> savePairs = new ArrayList<NameValuePair>();
            savePairs.add(new BasicNameValuePair("timereg", "監控室"));
            savePairs.add(new BasicNameValuePair("timereg", "研發項目"));
            savePairs.add(new BasicNameValuePair("timereg", "雲平臺智能服務技術的研究和應用"));
            savePairs.add(new BasicNameValuePair("timereg", "所有(共享類)"));

            switch (calendar.get(Calendar.DAY_OF_WEEK)) {
            case Calendar.MONDAY:
                savePairs.add(new BasicNameValuePair("timereg", "8,0,0,0,0,0,0"));
                System.out.println("星期一");
                break;
            case Calendar.TUESDAY:
                savePairs.add(new BasicNameValuePair("timereg", "0,8,0,0,0,0,0"));
                System.out.println("星期二");
                break;
            case Calendar.WEDNESDAY:
                savePairs.add(new BasicNameValuePair("timereg", "0,0,8,0,0,0,0"));
                System.out.println("星期三");
                break;
            case Calendar.THURSDAY:
                savePairs.add(new BasicNameValuePair("timereg", "0,0,0,8,0,0,0"));
                System.out.println("星期四");
                break;
            case Calendar.FRIDAY:
                savePairs.add(new BasicNameValuePair("timereg", "0,0,0,0,8,0,0"));
                System.out.println("星期五");
                break;
            }
            savePairs.add(new BasicNameValuePair("timereg", "監控系統開發與應用"));
            savePairs.add(new BasicNameValuePair("timereg", "8"));
            savePairs
                    .add(new BasicNameValuePair("theweek", Integer.toString(calendar.get(Calendar.WEEK_OF_YEAR) + 1)));
            System.out.println("第" + (calendar.get(Calendar.WEEK_OF_YEAR) + 1) + "周");

            savePairs.add(new BasicNameValuePair("starttime", simpleDateFormat.format(calendar.getTime())));
            System.out.println("starttime=" + simpleDateFormat.format(calendar.getTime()));

            savePost.setEntity(new UrlEncodedFormEntity(savePairs, "utf-8"));
            HttpResponse saveResponse = client.execute(savePost);
            System.out.println(saveResponse.getStatusLine());
            System.out.println();
            Thread.sleep(7777);
        }
    }
}
送一個須要賬號密碼登陸的例子
相關文章
相關標籤/搜索