httpclient爬蟲爬取電影信息和下載地址實例

本次更新主要解決了以下問題:老舊頁面的下載鏈接多是迅雷和 ftp 格式;去重(每一頁都有一個推薦列表,裏面也包含相應的詳情頁鏈接);以及兼容另一種頁面格式。爲此更新了兩個方法:

/**
 * Crawls one list page: fetches its detail links with {@code getPage(int)},
 * drops a fixed set of links, removes duplicates, and calls
 * {@code getMovieInfo(String)} for every remaining link.
 * <p>
 * A link whose processing throws is only reported through {@code output};
 * the crawl continues with the next link.
 *
 * @param pa list page number passed to {@code getPage(int)}
 */
public static void spider(int pa) {
        List<String> links = getPage(pa);
        // Links that are never crawled (presumably the per-page recommendation list the article mentions - confirm).
        List<String> excluded = Arrays.asList("http://www.***.net/ys/20170620/37704.htm, http://www.***.net/ys/20170727/38028.htm, http://www.***.net/ys/20170810/38113.htm, http://www.***.net/ys/20170703/37769.htm, http://www.***.net/ys/20170615/37680.htm, http://www.***.net/ys/20170615/37678.htm, http://www.***.net/ys/20170727/38027.htm, http://www.***.net/ys/20170802/38060.htm, http://www.***.net/ys/20170515/37385.htm, http://www.***.net/ys/20170725/38001.htm, http://www.***.net/ys/20170608/37614.htm, http://www.***.net/ys/20170802/38059.htm, http://www.***.net/ys/20170629/37742.htm, http://www.***.net/ys/20170512/37323.htm, http://www.***.net/ys/20170426/37219.htm, http://www.***.net/ys/20170727/38026.htm, http://www.***.net/ys/20170730/38046.htm, http://www.***.net/ys/20170804/38082.htm, http://www.***.net/ys/20170714/37848.htm, http://www.***.net/ys/20180819/40982.htm, http://www.***.net/ys/20180819/40981.htm, http://www.***.net/ys/20180818/40980.htm, http://www.***.net/ys/20180818/40979.htm, http://www.***.net/ys/20180818/40978.htm, http://www.***.net/ys/20180818/40977.htm, http://www.***.net/ys/20180817/40975.htm, http://www.***.net/ys/20180817/40974.htm".split(", "));
        links.removeAll(excluded);
        output(links.size());
        // A set collapses links that appear more than once on the page.
        Set<String> unique = new HashSet<>();
        unique.addAll(links);
        for (String link : unique) {
            try {
                getMovieInfo(link);
                // Pause between detail-page requests.
                sleep(getRandomInt(3) + 3);
            } catch (Exception e) {
                // Best effort: report the failing link and keep going.
                output(link);
            }
        }
    }
 
    /**
     * Crawls every detail link found in already-downloaded page text.
     * <p>
     * Links are extracted with {@code getPage(String)}, de-duplicated, and
     * handed to {@code getMovieInfo(String)} one by one. A link whose
     * processing throws is only reported through {@code output}.
     *
     * @param text page content to scan for detail links
     */
    public static void spider(String text) {
        List<String> links = getPage(text);
        // A set collapses links that appear more than once in the text.
        Set<String> unique = new HashSet<>();
        unique.addAll(links);
        for (String link : unique) {
            try {
                getMovieInfo(link);
                // Pause between detail-page requests.
                sleep(getRandomInt(3));
            } catch (Exception e) {
                // Best effort: report the failing link and keep going.
                output(link);
            }
        }
    }
 
    /**
     * Downloads one list page and extracts the detail-page links it contains.
     *
     * @param page list page number; page 1 uses the section root URL, any
     *             other number uses {@code index_<page>.htm}
     * @return whatever {@code regexAll} finds for the detail-link pattern
     *         (duplicates are not removed here)
     */
    public static List<String> getPage(int page) {
        String url = page == 1
                ? "http://www.***.net/ys/"
                : "http://www.***.net/ys/index_" + page + ".htm";
        output(url);
        JSONObject response = getHttpResponse(getHttpGet(url));
        String content = response.getString("content");
        // Round-trip through UTF-8 bytes, as the original code did.
        String all = new String(content.getBytes(UTF_8), UTF_8);
        return regexAll(all, "http://www.***.net/ys/\\d+/\\d+.htm");
    }
 
    /**
     * Extracts detail-page links from page text that has already been loaded.
     *
     * @param page page content to scan
     * @return whatever {@code regexAll} finds for the detail-link pattern
     *         (duplicates are not removed here)
     */
    public static List<String> getPage(String page) {
        // Round-trip through UTF-8 bytes, as the original code did.
        String all = new String(page.getBytes(UTF_8), UTF_8);
        return regexAll(all, "http://www.***.net/ys/\\d+/\\d+.htm");
    }
 
    /**
     * Builds the detail-page URL from its two numeric path segments and
     * processes it with {@link #getMovieInfo(String)}.
     * <p>
     * Example: {@code getMovieInfo(20180819, 40981)} addresses
     * {@code http://www.***.net/ys/20180819/40981.htm}.
     *
     * @param day   first path segment of the detail URL
     * @param index second path segment of the detail URL
     * @return the result of {@link #getMovieInfo(String)}: {@code false} when
     *         the site reports that the page does not exist, {@code true} otherwise
     */
    public static boolean getMovieInfo(int day, int index) {
        String url = "http://www.***.net/ys/" + day + "/" + index + ".htm";
        // Propagate the real outcome instead of always reporting success.
        return getMovieInfo(url);
    }
 
    /**
     * Downloads one movie detail page, extracts its metadata and download
     * links, and stores them with {@code MySqlTest.sendWork}.
     * <p>
     * Two page layouts are handled: pages that contain a "◎" info block, and
     * pages without one, where the name comes from {@code <title>} and only
     * length, release date, director and language are read.
     * <p>
     * The row is only sent to the database when at least one ed2k link (or,
     * failing that, an ftp or thunder link) was found. The SQL statement and
     * the lengths of the three link lists are always written to {@code output}.
     *
     * @param url detail-page URL
     * @return {@code false} when the page says the requested content does not
     *         exist; {@code true} otherwise, whether or not a row was stored
     */
    public static boolean getMovieInfo(String url) {
        HttpGet httpGet = getHttpGet(url);
        JSONObject response = getHttpResponse(httpGet);
        String s = response.getString("content");
        if (s.contains("您查詢的內容不存在,請返回首頁從新搜索")) {
            return false;
        }
        // Round-trip through UTF-8 bytes, kept from the original; for well-formed text it yields the same string.
        String all = new String(s.getBytes(UTF_8), UTF_8);
        String name = EMPTY, tname = EMPTY, year = EMPTY, language = EMPTY, date = EMPTY, score = EMPTY, length = EMPTY, author = EMPTY;
        int infoStart = all.indexOf("◎");
        if (infoStart >= 0) {
            // Search for the closing <hr only after the first "◎": an earlier <hr or a
            // missing one used to make substring() throw. Without one, read to the end.
            int infoEnd = all.indexOf("<hr", infoStart);
            if (infoEnd < 0) {
                infoEnd = all.length();
            }
            String info = all.substring(infoStart, infoEnd);
            name = getInfo(info, "片  名 ");
            tname = getInfo(info, "譯  名 ");
            year = getInfo(info, "年  代 ");
            language = getInfo(info, "語  言 ");
            date = getInfo(info, "上映日期 ");
            score = getInfo(info, "豆瓣評分 ");
            length = getInfo(info, "片  長 ");
            author = getInfo(info, "導  演 ");
        } else {
            name = getInfo(all, "<title>");
            // Keep only the part of the title before the first underscore.
            if (name.contains("_")) {
                name = name.substring(0, name.indexOf("_"));
            }
            length = getInfo(all, "片長: ");
            date = getInfo(all, "上映日期: ");
            author = getInfo(all, "導演: ");
            language = getInfo(all, "語言: ");
        }
        List<String> magnets = regexAll(all, "magnet:.+?>");
        List<String> ed2ks = regexAll(all, "ed2k:.+?>");
        // Older pages: fall back to ftp links, then thunder links.
        if (ed2ks.isEmpty()) {
            ed2ks = regexAll(all, "ftp://.+?>");
        }
        if (ed2ks.isEmpty()) {
            ed2ks = regexAll(all, "thunder://.+?>");
        }
        List<String> pans = regexAll(all, "http(s)*://pan.baidu.com/.+?</td>");
        // NOTE(review): the statement is still assembled as text because sendWork only
        // accepts a String here; a parameterized statement would be the safer choice.
        String sql = "INSERT INTO movie (name,tname,year,language,date,score,length,author,magnet,ed2k,pan) VALUES(\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\");";
        sql = String.format(sql,
                escapeSqlValue(name), escapeSqlValue(tname), escapeSqlValue(year), escapeSqlValue(language),
                escapeSqlValue(date), escapeSqlValue(score), escapeSqlValue(length), escapeSqlValue(author),
                // Link lists keep the original treatment (double quotes removed) before escaping.
                escapeSqlValue(magnets.toString().replace("\"", EMPTY)),
                escapeSqlValue(ed2ks.toString().replace("\"", EMPTY)),
                escapeSqlValue(pans.toString().replace("\"", EMPTY)));
        if (!ed2ks.isEmpty()) {
            MySqlTest.sendWork(sql);
        }
        output(magnets.toString().length(), ed2ks.toString().length(), pans.toString().length());
        output(sql);
        return true;
    }

    /**
     * Escapes a scraped value so it can sit inside a double-quoted SQL string
     * literal: backslashes are doubled and double quotes are backslash-escaped.
     */
    private static String escapeSqlValue(String value) {
        return value.replace("\\", "\\\\").replace("\"", "\\\"");
    }
 
    /**
     * Reads the value that follows a label, up to the next {@code <}.
     * <p>
     * Only the first match is used. The label and every {@code <} are removed
     * from it before it is returned.
     *
     * @param text  text to search
     * @param start label that precedes the value; it is concatenated into the
     *              regular expression as-is (NOTE(review): not quoted, so a
     *              label containing regex metacharacters would change the pattern)
     * @return the extracted value, or {@code EMPTY} when nothing matches
     */
    public static String getInfo(String text, String start) {
        List<String> matches = regexAll(text, start + ".+?<");
        if (matches.isEmpty()) {
            return EMPTY;
        }
        String first = matches.get(0);
        return first.replace(start, EMPTY).replace("<", EMPTY);
    }

----------------------------------分割線-------------------------------------------------

本人在使用 httpclient 編寫爬蟲的過程中,想爬取自己關注的一個電影網站的下載地址。經過多次嘗試,終於成功爬取了幾百部熱門電影的信息和下載地址(電驢和磁力鏈接)。中間遇到了編碼格式、正則匹配不一致以及重複鏈接過濾等問題,也都一一解決了。附上代碼,供大家參考。

關鍵信息已隱藏,思路供大家參考:先訪問列表頁,拿到詳情頁的鏈接,去重之後再訪問詳情頁,拿到相關信息和下載地址,最後存儲到數據庫中。

/**
 * Example crawler: reads a movie list page, follows the detail links and
 * stores each movie's metadata and download links through
 * {@code MySqlTest.sendWork}.
 * <p>
 * HTTP access, regex matching, output and the constants used below
 * ({@code EMPTY}, {@code UTF_8}, {@code GB2312}, {@code DEFAULT_CHARSET}) are
 * not declared in this class; presumably they come from {@code ApiLibrary}.
 */
public class MyTest extends ApiLibrary {
    public static void main(String[] args) {
        DEFAULT_CHARSET = GB2312;
        // NOTE(review): every iteration crawls list page 1 again; confirm whether
        // the loop index was meant to select the page.
        for (int i = 0; i < 10; i++) {
            spider(1);
        }

        testOver();
    }

    /**
     * Crawls one list page: fetches its detail links, drops a fixed set of
     * links, removes duplicates, and processes every remaining link.
     * A link whose processing throws is only reported through {@code output}.
     *
     * @param pa list page number passed to {@link #getPage(int)}
     */
    public static void spider(int pa) {
        // The original snippet used "page" without declaring it; load the links first.
        List<String> page = getPage(pa);
        String[] abc = "http://www.***.net/ys/20170620/37704.htm, http://www.***.net/ys/20170727/38028.htm, http://www.***.net/ys/20170810/38113.htm, http://www.***.net/ys/20170703/37769.htm, http://www.***.net/ys/20170615/37680.htm, http://www.***.net/ys/20170615/37678.htm, http://www.***.net/ys/20170727/38027.htm, http://www.***.net/ys/20170802/38060.htm, http://www.***.net/ys/20170515/37385.htm, http://www.***.net/ys/20170725/38001.htm, http://www.***.net/ys/20170608/37614.htm, http://www.***.net/ys/20170802/38059.htm, http://www.***.net/ys/20170629/37742.htm, http://www.***.net/ys/20170512/37323.htm, http://www.***.net/ys/20170426/37219.htm, http://www.***.net/ys/20170727/38026.htm, http://www.***.net/ys/20170730/38046.htm, http://www.***.net/ys/20170804/38082.htm, http://www.***.net/ys/20170714/37848.htm, http://www.***.net/ys/20180819/40982.htm, http://www.***.net/ys/20180819/40981.htm, http://www.***.net/ys/20180818/40980.htm, http://www.***.net/ys/20180818/40979.htm, http://www.***.net/ys/20180818/40978.htm, http://www.***.net/ys/20180818/40977.htm, http://www.***.net/ys/20180817/40975.htm, http://www.***.net/ys/20180817/40974.htm".split(", ");
        // Links that are never crawled.
        List<String> list = Arrays.asList(abc);
        page.removeAll(list);
        // A set collapses links that appear more than once on the page.
        Set<String> truelist = new HashSet<>();
        truelist.addAll(page);
        for (String p : truelist) {
            try {
                getMovieInfo(p);
                // Pause between detail-page requests.
                sleep(getRandomInt(3));
            } catch (Exception e) {
                // Best effort: report the failing link and keep going.
                output(p);
            }
        }
    }

    /**
     * Downloads one list page, prints it, and extracts the detail-page links.
     *
     * @param page list page number; page 1 uses the section root URL, any
     *             other number uses {@code index_<page>.htm}
     * @return whatever {@code regexAll} finds for the detail-link pattern
     */
    public static List<String> getPage(int page) {
        String url = "http://www.***.net/ys/index_" + page + ".htm";
        if (page == 1) {
            url = "http://www.***.net/ys/";
        }
        output(url);
        HttpGet httpGet = getHttpGet(url);
        JSONObject response = getHttpResponse(httpGet);
        String content = response.getString("content");
        output(content);
        // Round-trip through UTF-8 bytes, kept from the original.
        String all = new String(content.getBytes(UTF_8), UTF_8);
        return regexAll(all, "http://www.***.net/ys/\\d+/\\d+.htm");
    }

    /**
     * Builds the detail-page URL from its two numeric path segments and
     * processes it with {@link #getMovieInfo(String)}.
     * <p>
     * Example: {@code getMovieInfo(20180819, 40981)} addresses
     * {@code http://www.***.net/ys/20180819/40981.htm}.
     * <p>
     * This overload used to be a line-for-line copy of the String overload
     * without its two {@code output} calls; it now delegates to it.
     *
     * @param day   first path segment of the detail URL
     * @param index second path segment of the detail URL
     * @return the result of {@link #getMovieInfo(String)}
     */
    public static boolean getMovieInfo(int day, int index) {
        String url = "http://www.***.net/ys/" + day + "/" + index + ".htm";
        return getMovieInfo(url);
    }

    /**
     * Downloads one movie detail page, extracts the metadata between the
     * first "◎" and the first {@code <hr}, collects magnet, ed2k and
     * pan.baidu.com links, and stores everything with
     * {@code MySqlTest.sendWork}.
     * <p>
     * If either marker is missing, {@code substring} throws; {@code spider}
     * catches that and reports the URL.
     *
     * @param url detail-page URL
     * @return {@code false} when the page says the requested content does not
     *         exist; {@code true} after the row has been sent
     */
    public static boolean getMovieInfo(String url) {
        HttpGet httpGet = getHttpGet(url);
        JSONObject response = getHttpResponse(httpGet);
        String s = response.getString("content");
        if (s.contains("您查詢的內容不存在,請返回首頁從新搜索")) {
            return false;
        }
        // Round-trip through UTF-8 bytes, kept from the original.
        String all = new String(s.getBytes(UTF_8), UTF_8);
        int i = all.indexOf("◎");
        int i1 = all.indexOf("<hr");
        String info = s.substring(i, i1);
        String name = getInfo(info, "片  名 ");
        String tname = getInfo(info, "譯  名 ");
        String year = getInfo(info, "年  代 ");
        String language = getInfo(info, "語  言 ");
        String date = getInfo(info, "上映日期 ");
        String score = getInfo(info, "豆瓣評分 ");
        String length = getInfo(info, "片  長 ");
        String author = getInfo(info, "導  演 ");
        List<String> magnets = regexAll(all, "magnet:.+?>");
        List<String> ed2ks = regexAll(all, "ed2k:.+?>");
        List<String> pans = regexAll(all, "http(s)*://pan.baidu.com/.+?</td>");
        // NOTE(review): the statement is still assembled as text because sendWork only
        // accepts a String here; a parameterized statement would be the safer choice.
        String sql = "INSERT INTO movie (name,tname,year,language,date,score,length,author,magnet,ed2k,pan) VALUES(\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\");";
        sql = String.format(sql,
                escapeSqlValue(name), escapeSqlValue(tname), escapeSqlValue(year), escapeSqlValue(language),
                escapeSqlValue(date), escapeSqlValue(score), escapeSqlValue(length), escapeSqlValue(author),
                // Link lists keep the original treatment (double quotes removed) before escaping.
                escapeSqlValue(magnets.toString().replace("\"", EMPTY)),
                escapeSqlValue(ed2ks.toString().replace("\"", EMPTY)),
                escapeSqlValue(pans.toString().replace("\"", EMPTY)));
        MySqlTest.sendWork(sql);
        output(magnets.toString().length(), ed2ks.toString().length(), pans.toString().length());
        output(sql);
        return true;
    }

    /**
     * Reads the value that follows a label, up to the next {@code <}.
     * Only the first match is used; the label and every {@code <} are removed.
     *
     * @param text  text to search
     * @param start label that precedes the value; it is concatenated into the
     *              regular expression as-is
     * @return the extracted value, or {@code EMPTY} when nothing matches
     */
    public static String getInfo(String text, String start) {
        String value = EMPTY;
        List<String> nameinfo = regexAll(text, start + ".+?<");
        if (nameinfo.size() > 0) {
            value = nameinfo.get(0).replace(start, EMPTY).replace("<", EMPTY);
        }
        return value;
    }

    /**
     * Escapes a scraped value so it can sit inside a double-quoted SQL string
     * literal: backslashes are doubled and double quotes are backslash-escaped.
     */
    private static String escapeSqlValue(String value) {
        return value.replace("\\", "\\\\").replace("\"", "\\\"");
    }

}

下面是數據庫存儲的截圖:

技術類文章精選

  1. java一行代碼打印心形
  2. Linux性能監控軟件netdata中文漢化版
  3. 接口測試代碼覆蓋率(jacoco)方案分享
  4. 性能測試框架
  5. 如何在Linux命令行界面愉快進行性能測試
  6. 圖解HTTP腦圖
  7. 如何測試機率型業務接口
  8. httpclient處理多用戶同時在線
  9. 將swagger文檔自動變成測試代碼
  10. 五行代碼構建靜態博客
  11. httpclient如何處理302重定向
  12. 基於java的直線型接口測試框架初探
  13. Tcloud 雲測平臺--集大成者

非技術文章精選

  1. 爲何選擇軟件測試作爲職業道路?
  2. 成爲傑出Java開發人員的10個步驟
  3. 寫給所有人的編程思維
  4. 自動化測試的障礙
  5. 自動化測試的問題所在
  6. 測試之《代碼不朽》腦圖
  7. 成爲優秀自動化測試工程師的7個步驟
  8. 優秀軟件開發人員的態度
  9. 如何正確執行功能API測試

點擊查看公衆號地圖

相關文章
相關標籤/搜索