獲取人人網的高校數據。
人人網的院系接口,參數爲高校id: http://www.renren.com/GetDep.do?id=
人人網 (國家/省/高校 js文件): http://s.xnimg.cn/a13819/allunivlist.js
僅供學習參考
僅供學習參考
僅供學習參考
1. 經過fastJson解析獲取到的高校json
2. 經過正則表達式解析獲取到的院系html
package com.test;

import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.ResponseHandler;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.BasicResponseHandler;
import org.apache.http.impl.client.DefaultHttpClient;

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;

/**
 * Downloads the Renren university list and writes three SQL scripts from it:
 * one with province rows, one with school rows and one with department rows.
 *
 * <p>The country/province/school data comes from a JavaScript file whose JSON
 * array is parsed with fastjson; the departments of each school come from an
 * HTML fragment that is scanned with regular expressions.
 *
 * @desc Fetch Renren university data
 * @author wjw
 * @date 2016-12-08 13:21:01
 */
public class GetSchoolSQL {

    // SQL script receiving the province INSERT statements
    public static File provinceFile = new File("D:/province.sql");
    // SQL script receiving the school INSERT statements
    public static File schoolFile = new File("D:/school.sql");
    // SQL script receiving the department INSERT statements
    public static File depFile = new File("D:/dep.sql");

    // Department endpoint; the school id is appended as the query value
    private static final String DEP_URL = "http://www.renren.com/GetDep.do?id=";
    // JavaScript file carrying the country / province / school JSON
    private static final String ALL_UNIVS_URL = "http://s.xnimg.cn/a13819/allunivlist.js";
    // Explicit charset so the generated scripts do not depend on the platform default
    private static final String OUTPUT_CHARSET = "UTF-8";

    // Matches an <option> whose value starts with a numeric entity (skips empty values)
    private static final Pattern DEP_OPTION =
            Pattern.compile("<option value='&#(.*?)</option>", Pattern.MULTILINE);
    // Extracts the value attribute out of one matched <option>
    private static final Pattern DEP_VALUE =
            Pattern.compile("value='(.*?)'", Pattern.MULTILINE);
    // One decimal numeric character reference: ampersand, hash, digits, semicolon
    private static final Pattern DEC_ENTITY = Pattern.compile("&#(\\d+);");
    // One literal backslash-u escape followed by exactly four hex digits
    private static final Pattern HEX_ESCAPE = Pattern.compile("\\\\u([0-9a-fA-F]{4})");

    /**
     * Fetches the university list, fetches the departments of every school and
     * writes the three SQL scripts to {@link #provinceFile}, {@link #schoolFile}
     * and {@link #depFile}.
     *
     * <p>Nothing is written unless every download succeeded, because the
     * scripts are only produced after the whole crawl has finished.
     *
     * @throws ClientProtocolException on an HTTP protocol error
     * @throws IOException if a download fails, the university list does not
     *         contain the expected JSON, or a script cannot be written
     */
    public static void getDate() throws ClientProtocolException, IOException {
        StringBuilder provinceSql = new StringBuilder(); // province statements
        StringBuilder schoolSql = new StringBuilder();   // school statements
        StringBuilder depSql = new StringBuilder();      // department statements

        HttpClient client = new DefaultHttpClient();
        try {
            ResponseHandler<String> responseHandler = new BasicResponseHandler();
            String str = client.execute(new HttpGet(ALL_UNIVS_URL), responseHandler);
            System.out.println("獲取高校js文件完成,下一步解析js文件的JSON數據:");

            JSONArray china = findChinaProvinces(str);
            System.out.println("中國高校JSON數據:\r\n" + china.toJSONString());
            System.out.println("=============================開始解析JSON==================================");

            for (int i = 0; i < china.size(); i++) { // every province
                JSONObject pObj = china.getJSONObject(i);
                int provinceId = pObj.getIntValue("id");
                provinceSql.append("insert into province(pro_id,pro_name)values('")
                        .append(provinceId).append("','")
                        .append(escapeSql(convertFromHex(pObj.getString("name"))))
                        .append("');\n");

                JSONArray univs = pObj.getJSONArray("univs"); // schools of this province
                for (int j = 0; univs != null && j < univs.size(); j++) {
                    JSONObject sObj = univs.getJSONObject(j);
                    int schoolId = sObj.getIntValue("id");
                    schoolSql.append("insert into school(pro_id,sch_id,sch_name)values('")
                            .append(provinceId).append("','")
                            .append(schoolId).append("','")
                            .append(escapeSql(convertFromHex(sObj.getString("name"))))
                            .append("');\n");

                    String depHtml = client.execute(new HttpGet(DEP_URL + schoolId), responseHandler);
                    appendDepartments(depHtml, schoolId, depSql);
                }
            }
            System.out.println("=============================JSON解析完成==================================");
        } finally {
            // Release the pooled connections even when a download or parse step failed
            client.getConnectionManager().shutdown();
        }

        writeSqlFile(provinceFile, provinceSql);
        writeSqlFile(schoolFile, schoolSql);
        writeSqlFile(depFile, depSql);

        System.out.println("sql文件已經生成!");
        System.out.println("省sql文件" + provinceFile.getAbsolutePath());
        System.out.println("學校sql文件" + schoolFile.getAbsolutePath());
        System.out.println("院系sql文件" + depFile.getAbsolutePath());
    }

    /**
     * Program entry point: runs {@link #getDate()} and prints the stack trace
     * of any I/O failure.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            getDate();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Extracts the province array of the country whose id evaluates to 0.
     *
     * @param js content of the university list JavaScript file
     * @return the {@code provs} array of the matching country, never null
     * @throws IOException if the text holds no JSON array or no matching country
     */
    private static JSONArray findChinaProvinces(String js) throws IOException {
        // The JSON array is embedded in JavaScript; it starts at the first "[{"
        int jsonStart = (js == null) ? -1 : js.indexOf("[{");
        if (jsonStart < 0) {
            throw new IOException("No JSON array found in " + ALL_UNIVS_URL);
        }
        JSONArray objList = JSONObject.parseArray(js.substring(jsonStart));

        JSONArray china = null;
        for (int i = 0; objList != null && i < objList.size(); i++) {
            JSONObject obj = objList.getJSONObject(i);
            // Original author's note: the feed stores the country id as a string
            // ("00"), so it is compared through its int value.
            if (obj.getIntValue("id") == 0) {
                china = obj.getJSONArray("provs");
            }
        }
        if (china == null) {
            throw new IOException("No country entry with id 00 and a provs array in " + ALL_UNIVS_URL);
        }
        return china;
    }

    /**
     * Appends one INSERT statement per department option found in the HTML
     * fragment. Each row gets a random 32-character upper-case hex id.
     *
     * @param depHtml HTML returned by the department endpoint
     * @param schoolId id of the school the departments belong to
     * @param depSql buffer receiving the statements
     */
    private static void appendDepartments(String depHtml, int schoolId, StringBuilder depSql) {
        Matcher options = DEP_OPTION.matcher(depHtml);
        while (options.find()) {
            String value = "";
            // One matched element, e.g. <option value='...'>...</option>
            Matcher attr = DEP_VALUE.matcher(options.group());
            if (attr.find()) {
                value = attr.group(1);
            }
            String id = UUID.randomUUID().toString().replaceAll("-", "").toUpperCase();
            depSql.append("insert into dep(id,sch_id,dep_name)values('")
                    .append(id).append("','")
                    .append(schoolId).append("','")
                    .append(escapeSql(convertFromDec(value)))
                    .append("');\n");
        }
    }

    /**
     * Writes a script to disk as UTF-8 and reports write failures, which
     * {@link PrintStream} would otherwise swallow.
     *
     * @param target file to create or overwrite
     * @param sql script content
     * @throws IOException if the file cannot be opened or written
     */
    private static void writeSqlFile(File target, CharSequence sql) throws IOException {
        PrintStream out = new PrintStream(target, OUTPUT_CHARSET);
        try {
            out.print(sql.toString());
            if (out.checkError()) {
                throw new IOException("Failed to write " + target.getAbsolutePath());
            }
        } finally {
            out.close();
        }
    }

    /**
     * Makes a value safe inside a single-quoted SQL string literal by doubling
     * every single quote (standard SQL escaping).
     *
     * @param value raw text, not null
     * @return the text with each {@code '} replaced by {@code ''}
     */
    private static String escapeSql(String value) {
        return value.replace("'", "''");
    }

    /**
     * Decodes decimal numeric character references (ampersand, hash, digits,
     * semicolon) into the characters they denote, in a single pass.
     *
     * <p>References that are incomplete, out of range or not valid code points
     * are left untouched instead of raising an exception.
     *
     * @param code text possibly containing decimal references; may be null
     * @return the decoded text, or an empty string for null input
     */
    private static String convertFromDec(String code) {
        if (code == null) {
            return "";
        }
        Matcher m = DEC_ENTITY.matcher(code);
        StringBuffer sb = new StringBuffer(code.length());
        while (m.find()) {
            String decoded = m.group(); // fallback: keep the reference verbatim
            try {
                int codePoint = Integer.parseInt(m.group(1), 10);
                if (Character.isValidCodePoint(codePoint)) {
                    // toChars yields a surrogate pair for code points above the BMP
                    decoded = new String(Character.toChars(codePoint));
                }
            } catch (NumberFormatException tooLarge) {
                // More digits than fit in an int: not a real code point, keep as is
            }
            m.appendReplacement(sb, Matcher.quoteReplacement(decoded));
        }
        m.appendTail(sb);
        return sb.toString();
    }

    /**
     * Decodes literal backslash-u escapes (four hex digits each) into the
     * UTF-16 code units they denote, in a single pass.
     *
     * <p>Sequences that are truncated or not hexadecimal are left untouched
     * instead of raising an exception.
     *
     * @param code text possibly containing such escapes; may be null
     * @return the decoded text, or an empty string for null input
     */
    private static String convertFromHex(String code) {
        if (code == null) {
            return "";
        }
        Matcher m = HEX_ESCAPE.matcher(code);
        StringBuffer sb = new StringBuffer(code.length());
        while (m.find()) {
            char unit = (char) Integer.parseInt(m.group(1), 16);
            m.appendReplacement(sb, Matcher.quoteReplacement(String.valueOf(unit)));
        }
        m.appendTail(sb);
        return sb.toString();
    }
}