《基於Java的數據採集(一)》:http://www.cnblogs.com/lichenwei/p/3904715.html
《基於Java的數據採集(二)》:http://www.cnblogs.com/lichenwei/p/3905370.html
《基於Java的數據採集(終結篇)》:http://www.cnblogs.com/lichenwei/p/3910492.html
基於以前2篇Java數據採集入庫的文章,做了下功能整合,實現本地的存儲與讀取,上個效果圖:
直接上代碼吧,本程序只是作爲"如何用JAVA抓取頁面簡單採集入庫"的入門,在實際做採集工具的時候,還需考慮許多東西,比如當採集一個頁面發生卡頓、發生延遲時怎麼辦?等一系列的問題,希望這篇文字可以拋磚引玉。
先看下項目結構:
一共有五個類:
MySql.java --數據庫操作類
RegEX.java --正則匹配類
GetAllData.java --採集類
Action.java --功能實現類
FootBallMain.java --主程序類
其餘的,直接結合前面2篇文章外加看代碼註釋吧
MySql.java
package com.lcw.curl;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

/**
 * Small JDBC helper with one method for update statements and one for queries.
 *
 * <p>Each call opens its own connection using the public settings below. The
 * most recently opened statement and connection are kept in the public
 * {@link #stmt} and {@link #conn} fields, so a single instance must not be
 * shared between threads.
 *
 * @author Balla_兔子
 */
public class MySql {

    // Driver class, database URL, user name and password. These stay public
    // and mutable because they are part of the class's existing interface.
    public String driver = "com.mysql.jdbc.Driver";
    public String url = "jdbc:mysql://127.0.0.1:3306/football";
    public String user = "root";
    public String password = "";
    // Statement and connection opened by the most recent call.
    public Statement stmt = null;
    public Connection conn = null;

    /**
     * Executes an update statement (insert, update or delete) and releases the
     * connection afterwards, whether or not the statement succeeded.
     *
     * <p>A statement that fails is reported on standard error and otherwise
     * ignored, so a single bad row does not stop a batch of calls.
     *
     * @param insertSQl the complete SQL statement to execute
     * @throws IllegalStateException if no connection to the database can be opened
     */
    public void datatoMySql(String insertSQl) {
        open();
        try {
            stmt.executeUpdate(insertSQl);
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            close();
        }
    }

    /**
     * Executes a query and returns its result set.
     *
     * <p>The connection has to stay open while the result set is being read,
     * so the caller must call {@link #close()} once it has finished with it.
     *
     * @param selectSQl the complete SQL query to execute
     * @return the open result set, or {@code null} if the query itself failed
     *         (the failure is reported on standard error and the connection is
     *         released)
     * @throws IllegalStateException if no connection to the database can be opened
     */
    public ResultSet searchMySql(String selectSQl) {
        open();
        try {
            return stmt.executeQuery(selectSQl);
        } catch (SQLException e) {
            e.printStackTrace();
            // No result set is handed out, so nothing needs the connection.
            close();
            return null;
        }
    }

    /**
     * Closes the statement and connection opened by the most recent call.
     * Calling it when nothing is open does nothing.
     */
    public void close() {
        if (stmt != null) {
            try {
                stmt.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
            stmt = null;
        }
        // Closed in its own step so a failing statement close cannot leak the connection.
        if (conn != null) {
            try {
                conn.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
            conn = null;
        }
    }

    /** Loads the driver and opens a fresh connection and statement into the public fields. */
    private void open() {
        try {
            Class.forName(driver);
        } catch (ClassNotFoundException e) {
            // Not fatal by itself: DriverManager may still know a suitable driver.
            System.out.println("Unable to find the local driver");
            e.printStackTrace();
        }

        Connection connection = null;
        try {
            connection = DriverManager.getConnection(url, user, password);
            stmt = connection.createStatement();
            conn = connection;
        } catch (SQLException e) {
            if (connection != null) {
                try {
                    connection.close();
                } catch (SQLException closeFailure) {
                    closeFailure.printStackTrace();
                }
            }
            // Fail with the real cause instead of a NullPointerException later on.
            throw new IllegalStateException("Unable to open a connection to " + url, e);
        }
    }

}
RegEX.java
1 package com.lcw.curl; 2 3 import java.util.regex.Matcher; 4 import java.util.regex.Pattern; 5 6 public class RegEX { 7 8 /** 9 * 10 * @param regex 11 * 正則表達式 12 * @param content 13 * 所要匹配的內容 14 * @return 15 */ 16 public String getData(String regex, String content) { 17 Pattern pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE);// 設定正則表達式,不區分大小寫 18 Matcher matcher = pattern.matcher(content); 19 if (matcher.find()) { 20 return matcher.group();//返回正則匹配結果 21 } else { 22 return ""; 23 } 24 } 25 26 }
GetAllData.java
1 package com.lcw.curl; 2 3 import java.io.BufferedReader; 4 import java.io.InputStreamReader; 5 import java.net.URL; 6 7 public class GetAllData { 8 9 /**採集類 10 * @param Balla_兔子 11 */ 12 public void getAllData() { 13 14 try { 15 String address = "http://www.footballresults.org/league.php?league=EngDiv1"; 16 URL url = new URL(address); 17 InputStreamReader inputStreamReader = new InputStreamReader(url 18 .openStream(), "utf-8");// 打開地址,以UTF-8編碼的形式返回字節並轉爲字符 19 BufferedReader bufferedReader = new BufferedReader( 20 inputStreamReader);// 從字符輸入流中讀取文本,緩衝各個字符,從而提供字符、數組和行的高效讀取。 21 22 RegEX data = new RegEX(); 23 MySql mySql = new MySql(); 24 String content = "";// 用來接受每次讀取的行字符 25 int flag = 0;// 標誌,隊伍信息恰好在日期信息後面,則正則相同,用於分離數據 26 String dateRegex = "\\d{1,2}\\.\\d{1,2}\\.\\d{4}";// 日期匹配正則表達式 27 String teamRegex = ">[^<>]*</a>";// 隊伍匹配正則表達式 28 String scoreRegex = ">(\\d{1,2}-\\d{1,2})</TD>";// 比分正則表達式 29 String tempDate = "";// 存儲臨時比賽時間 30 String teama = "";// 存儲臨時主隊 31 String teamb = "";// 存儲臨時客隊 32 String score = "";// 存儲臨時比分 33 int i = 0;// 記錄信息條數 34 String sql = "";// 數據庫語句 35 36 while ((content = bufferedReader.readLine()) != null) {// 每次讀取一行數據 37 // 獲取比賽日期信息 38 String dateInfo = data.getData(dateRegex, content); 39 if (!dateInfo.equals("")) { 40 // System.out.println("日期:" + dateInfo); 41 tempDate = dateInfo; 42 flag++; 43 } 44 // 獲取隊伍信息,需先讀到日期信息讓標誌符自增 45 String teamInfo = data.getData(teamRegex, content); 46 if (!teamInfo.equals("") && flag == 1) { 47 teama = teamInfo.substring(1, teamInfo.indexOf("</a>")); 48 // System.out.println("主隊:" + teama); 49 flag++; 50 } else if (!teamInfo.equals("") && flag == 2) { 51 teamb = teamInfo.substring(1, teamInfo.indexOf("</a>")); 52 // System.out.println("客隊:" + teamb); 53 flag = 0; 54 } 55 // 獲取比分信息 56 String scoreInfo = data.getData(scoreRegex, content); 57 if (!scoreInfo.equals("")) { 58 score = scoreInfo.substring(1, scoreInfo.indexOf("</TD>")); 59 // System.out.println("比分:" + score); 60 // System.out.println(); 61 i++; 62 
sql = "insert into football(`date`,`teama`,`teamb`,`score`) values('" 63 + tempDate 64 + "','" 65 + teama 66 + "','" 67 + teamb 68 + "','" 69 + score + "')"; 70 mySql.datatoMySql(sql); 71 System.out.println("存儲數據成功:" + i + "條"); 72 } 73 74 } 75 bufferedReader.close(); 76 // System.out.println("一共收集到了" + i + "條信息"); 77 } catch (Exception e) { 78 e.printStackTrace(); 79 } 80 81 } 82 83 }
Action.java
1 package com.lcw.curl; 2 3 import java.sql.ResultSet; 4 import java.sql.SQLException; 5 import java.util.ArrayList; 6 import java.util.List; 7 import java.util.Vector; 8 9 public class Action { 10 11 /** 12 * 操做一:初始化數據庫數據 13 */ 14 public void initData() { 15 String sql = "delete from football"; 16 MySql doMySql = new MySql(); 17 try { 18 doMySql.datatoMySql(sql); 19 System.out.println("數據初始化完畢!"); 20 } catch (Exception e) { 21 System.out.println("數據初始化失敗!"); 22 } 23 24 } 25 26 /** 27 * 獲取全部隊伍信息 28 * 29 * @return 30 */ 31 public Vector<String> getAllTeam() { 32 ResultSet rs = null; 33 Vector<String> vector = new Vector<String>(); 34 String sql = "select teama,teamb from football"; 35 MySql doMySql = new MySql(); 36 rs = doMySql.searchMySql(sql); 37 38 try { 39 while (rs.next()) { 40 try { 41 if (!vector.contains(rs.getString("teama"))) { 42 vector.add(rs.getString("teama")); 43 } 44 if (!vector.contains(rs.getString("teamb"))) { 45 vector.add(rs.getString("teamb")); 46 } 47 } catch (SQLException e) { 48 e.printStackTrace(); 49 } 50 } 51 } catch (SQLException e) { 52 e.printStackTrace(); 53 } 54 55 return vector; 56 57 } 58 59 /** 60 * 獲取具體某隊的比賽信息 61 * 62 * @param team 63 * @return 64 */ 65 public List<String> findTeam(String team) { 66 List<String> list = new ArrayList<String>(); 67 String sql = "select * from football where teama ='" + team 68 + "' or teamb ='" + team + "'"; 69 MySql mysql = new MySql(); 70 ResultSet rs = null; 71 rs = mysql.searchMySql(sql); 72 try { 73 while (rs.next()) { 74 list.add(rs.getString("date")); 75 list.add(rs.getString("teama")); 76 list.add(rs.getString("teamb")); 77 list.add(rs.getString("score")); 78 } 79 } catch (SQLException e) { 80 e.printStackTrace(); 81 } 82 return list; 83 84 } 85 86 public List<String> findGame(String date) { 87 List<String> list = new ArrayList<String>(); 88 ResultSet rs = null; 89 String sql = "select * from football where date ='" + date + "'"; 90 MySql mysql = new MySql(); 91 rs = 
mysql.searchMySql(sql); 92 try { 93 while (rs.next()) { 94 list.add(rs.getString("date")); 95 list.add(rs.getString("teama")); 96 list.add(rs.getString("teamb")); 97 list.add(rs.getString("score")); 98 } 99 } catch (SQLException e) { 100 // TODO Auto-generated catch block 101 e.printStackTrace(); 102 } 103 return list; 104 } 105 106 }
FootBallMain.java
1 package com.lcw.curl; 2 3 import java.util.List; 4 import java.util.Scanner; 5 import java.util.Vector; 6 7 public class FootBallMain { 8 9 /**主程序類 10 * @param Balla_兔子 11 */ 12 public static void main(String[] args) { 13 GetAllData allData = new GetAllData(); 14 Action action = new Action(); 15 16 while (true) { 17 System.out.println("①初始化數據庫-請按 (1)"); 18 System.out.println("②自動化採集數據-請按(2)"); 19 System.out.println("③查詢參賽隊伍-請按(3)"); 20 System.out.println("④查詢具體球隊比賽結果-請按(4)"); 21 System.out.println("⑤查詢具體某天的比賽詳情-請按(5)"); 22 Scanner scanner = new Scanner(System.in); 23 String input = scanner.next(); 24 if (input.equals("1")) { 25 System.out.println(); 26 action.initData(); 27 System.out 28 .println("-----------------------------------------------------"); 29 } else if (input.equals("2")) { 30 System.out.println("正在採集數據...請稍後"); 31 allData.getAllData(); 32 System.out 33 .println("-----------------------------------------------------"); 34 } else if (input.equals("3")) { 35 Vector<String> allTeam = action.getAllTeam(); 36 System.out.println("正在獲取數據...請稍後"); 37 if (allTeam.size() != 0) { 38 System.out.println("參賽隊伍以下:"); 39 for (int i = 0; i < allTeam.size(); i++) { 40 System.out.println(allTeam.get(i)); 41 } 42 } 43 System.out 44 .println("-----------------------------------------------------"); 45 } else if (input.equals("4")) { 46 System.out.println("請輸入您要查詢的隊伍名:"); 47 String team = scanner.next(); 48 List<String> list = action.findTeam(team); 49 System.out.println("比賽日期\t\t\t主隊\t\t客隊\t\t\t比賽結果"); 50 if (list.size() != 0) { 51 for (int i = 0; i < list.size(); i++) { 52 System.out.print(list.get(i) + "\t\t"); 53 } 54 } else { 55 System.out.println("暫時沒有您所提供隊伍的比賽信息,敬請關注..."); 56 } 57 System.out.println(); 58 System.out 59 .println("-----------------------------------------------------"); 60 } else if (input.equals("5")) { 61 System.out.println("請輸入您要查詢日期(格式以下:xx.xx.xxxx):"); 62 String date = scanner.next(); 63 List<String> info = action.findGame(date); 64 
System.out.println("比賽日期\t\t\t主隊\t\t客隊\t\t\t比賽結果"); 65 if (info.size() != 0) { 66 for (int i = 0; i < info.size(); i++) { 67 if (i % 4 == 0 && i != 0) { 68 System.out.println(); 69 } 70 System.out.print(info.get(i) + "\t\t"); 71 } 72 } else { 73 System.out.println("暫時沒有您所提供的比賽信息,敬請關注..."); 74 } 75 System.out.println(); 76 System.out 77 .println("------------------------------------------------------------------------"); 78 } else { 79 System.out.println("請輸入正確的對應編號.."); 80 System.out 81 .println("------------------------------------------------------------------------"); 82 } 83 } 84 } 85 86 }