While working with httpclient, it occurred to me that I could use it to crawl some data, for example the names of every middle school in the country. The idea did not come out of nowhere: I had written a crawler for this before, but it was a UI script built on selenium, which was very slow and quite unstable. This time I went through the site's API instead, and the efficiency improved by a wide margin. In total there were 60,000+ records, and the whole run took about 16 minutes, including writing the data to the database. The code is shared below for reference; key information has been removed, so just focus on the approach.
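The crawler below relies on a few helpers inherited from ApiLibrary (getHttpGet, getHttpResponseEntityByJson, changeStringToInt, testOver) and on the PART separator constant, none of which are shown in the post. As a rough sketch only, assuming Apache HttpClient 4.x and json-lib, those helpers might look something like the class below; the real base class may differ.

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import net.sf.json.JSONObject;

// Hypothetical stand-in for source.ApiLibrary; the author's actual base class is not shown.
public class ApiLibrarySketch {

    // Separator used when joining province/city/county/school into one line (assumed to be a tab).
    protected static final String PART = "\t";

    // One shared client for all requests.
    private static final CloseableHttpClient CLIENT = HttpClients.createDefault();

    // Build a GET request; cookies / User-Agent headers can be added by the caller.
    protected HttpGet getHttpGet(String url) {
        return new HttpGet(url);
    }

    // Execute the request and parse the response body as JSON (json-lib).
    protected JSONObject getHttpResponseEntityByJson(HttpGet httpGet) {
        try (CloseableHttpResponse response = CLIENT.execute(httpGet)) {
            String body = EntityUtils.toString(response.getEntity(), "UTF-8");
            return JSONObject.fromObject(body);
        } catch (Exception e) {
            throw new RuntimeException("Request failed: " + httpGet.getURI(), e);
        }
    }

    // Parse a numeric string, falling back to 0 on bad input.
    protected int changeStringToInt(String text) {
        try {
            return Integer.parseInt(text.trim());
        } catch (NumberFormatException e) {
            return 0;
        }
    }

    // Hook called when the run is finished (e.g. release resources, print a summary).
    protected static void testOver() {
        System.out.println("done");
    }
}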
package practise;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.client.methods.HttpGet;

import net.sf.json.JSONObject;
import source.ApiLibrary;
import source.Concurrent;

public class Crawler extends ApiLibrary {

    public static String host = "";
    public static Map<String, Integer> countrys = new HashMap<>();
    public static Map<String, Integer> citys = new HashMap<>();
    public static Map<String, Integer> address = new HashMap<>();
    public static Map<String, Integer> school = new HashMap<>();
    public static List<String> total = new ArrayList<>();

    public static void main(String[] args) {
        Crawler crawler = new Crawler();
        crawler.getCountry1(); // provinces
        Set<String> countryId = countrys.keySet();
        for (String name : countryId) {
            int id = countrys.get(name);
            crawler.getCountry2(id); // cities
            Set<String> cityId = citys.keySet();
            for (String city : cityId) {
                int cid = citys.get(city);
                crawler.getCountry3(cid); // counties
                Set<String> adresss = address.keySet();
                for (String adres : adresss) {
                    int aid = address.get(adres);
                    crawler.getCountry4(aid); // school names
                    Set<String> schol = school.keySet();
                    for (String sch : schol) {
                        String line = name + PART + city + PART + adres + PART + sch;
                        total.add(line);
                    }
                }
            }
        }
        Concurrent.saveRequestTimes(total);
        testOver();
    }

    /**
     * Query provinces
     */
    public void getCountry1() {
        String url = host + "/user/editinfo/getSchollCountryList";
        HttpGet httpGet = getHttpGet(url);
        // httpGet.addHeader("Cookie", cookies);
        // httpGet.addHeader("User-Agent", userangent);
        JSONObject response = getHttpResponseEntityByJson(httpGet);
        String[] country = response.getString("content").split("</a>");
        int size = country.length;
        for (int i = 0; i < size; i++) {
            String msg = country[i];
            int code = getCode(msg);
            String name = getName(msg);
            countrys.put(name, code);
        }
    }

    /**
     * Query cities
     *
     * @param id
     */
    public void getCountry2(int id) {
        String url = host + "/user/editinfo/getSchollCityList?region_id=" + id;
        HttpGet httpGet = getHttpGet(url);
        JSONObject response = getHttpResponseEntityByJson(httpGet);
        String[] ssString = response.getString("content").split("</a>");
        int size = ssString.length;
        citys.clear();
        for (int i = 0; i < size; i++) {
            String msg = ssString[i];
            int code = getCode(msg);
            String name = getName(msg);
            citys.put(name, code);
        }
    }

    /**
     * Query counties
     *
     * @param id
     */
    public void getCountry3(int id) {
        String url = host + "/user/editinfo/getSchollAddressList?region_id=" + id;
        HttpGet httpGet = getHttpGet(url);
        JSONObject response = getHttpResponseEntityByJson(httpGet);
        String[] ssString = response.getString("content").split("</a>");
        int size = ssString.length;
        address.clear();
        for (int i = 0; i < size; i++) {
            String msg = ssString[i];
            int code = getCode(msg);
            String name = getName(msg);
            address.put(name, code);
        }
    }

    /**
     * Query schools
     *
     * @param id
     */
    public void getCountry4(int id) {
        String url = host + "/user/editinfo/getSchoolNameList?region_id=" + id;
        HttpGet httpGet = getHttpGet(url);
        JSONObject response = getHttpResponseEntityByJson(httpGet);
        String[] ssString = response.getString("content").split("</a>");
        int size = ssString.length;
        school.clear();
        for (int i = 0; i < size; i++) {
            String msg = ssString[i];
            int code = getCode(msg);
            String name = getName(msg);
            school.put(name, code);
        }
    }

    /**
     * Extract the numeric code from an <a> fragment
     *
     * @param text
     * @return
     */
    public int getCode(String text) {
        int code = 0;
        Pattern pattern = Pattern.compile("\"\\d+\"");
        Matcher matcher = pattern.matcher(text);
        if (matcher.find()) {
            code = changeStringToInt(matcher.group(0).replace("\"", ""));
        }
        return code;
    }

    /**
     * Extract the display name from an <a> fragment
     *
     * @param text
     * @return
     */
    public String getName(String text) {
        String name = text.substring(text.lastIndexOf(">") + 1, text.length());
        return name;
    }
}
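Concurrent.saveRequestTimes(total) handles the database write, but that class is also part of the author's private framework. For reference only, a plain JDBC batch insert that would do an equivalent job might look like the sketch below; the connection string, credentials, table and column names are all placeholders, and PART is assumed to be a tab separator.

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.util.List;

// Hypothetical replacement for Concurrent.saveRequestTimes(total): a JDBC batch insert.
public class SchoolStore {

    public static void save(List<String> lines) throws Exception {
        String sql = "INSERT INTO school_info (province, city, county, school) VALUES (?, ?, ?, ?)";
        try (Connection conn = DriverManager.getConnection(
                "jdbc:mysql://localhost:3306/crawler?useUnicode=true&characterEncoding=utf8",
                "user", "password");
             PreparedStatement ps = conn.prepareStatement(sql)) {
            conn.setAutoCommit(false);
            for (String line : lines) {
                // Each line is "province PART city PART county PART school"; PART assumed to be a tab.
                String[] parts = line.split("\t");
                if (parts.length < 4) {
                    continue; // skip malformed rows
                }
                ps.setString(1, parts[0]);
                ps.setString(2, parts[1]);
                ps.setString(3, parts[2]);
                ps.setString(4, parts[3]);
                ps.addBatch();
            }
            ps.executeBatch();
            conn.commit();
        }
    }
}

Batching the inserts and committing once keeps the database round-trips low, which matters when 60,000+ rows are written in a single run.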
Below is a screenshot of the crawled data.