httpclient 爬蟲實例——爬取三級中學名

本人在使用 httpclient 的過程當中,忽然想到可以用它爬取一些數據,例如全國的中學名。當然這不是憑空想出來的,以前也做過這方面的爬蟲,不過是基於 Selenium 寫的 UI 腳本,效率非常慢,而且很不穩定,所以這次採取了接口的形式,果然效率提升了幾個檔次。一共 6 萬多條數據,用了 16 分鐘左右,其中包括數據庫的存儲。現在分享代碼供大家參考。關鍵信息已隱去,大家看一下思路就好了。

package practise;
 
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.client.methods.HttpGet;
import net.sf.json.JSONObject;
import source.ApiLibrary;
import source.Concurrent;
 
/**
 * Crawls a four-level hierarchy (province, city, county, school name) through
 * HTTP endpoints and collects one delimited line per school.
 *
 * <p>Each endpoint is expected to answer with a JSON object whose
 * {@code content} field is a string of {@code <a ...>name</a>} links; the
 * numeric code of an entry is the first double-quoted run of digits inside
 * the link and the name is the text after the last {@code >}.
 *
 * <p>State is kept in public static maps that are overwritten level by level
 * while {@link #main(String[])} walks the hierarchy, so this class is not
 * suitable for concurrent use.
 */
public class Crawler extends ApiLibrary {
	/** Base URL prepended to every endpoint path (empty in this listing). */
	public static String host = "";
	/** Province name to province code, filled by {@link #getCountry1()}. */
	public static Map<String, Integer> countrys = new HashMap<>();
	/** City name to city code for the province queried last. */
	public static Map<String, Integer> citys = new HashMap<>();
	/** County name to county code for the city queried last. */
	public static Map<String, Integer> address = new HashMap<>();
	/** School name to school code for the county queried last. */
	public static Map<String, Integer> school = new HashMap<>();
	/** Collected output lines: province, city, county and school joined by {@code PART}. */
	public static List<String> total = new ArrayList<>();

	/** Matches the first double-quoted run of digits; compiled once instead of per call. */
	private static final Pattern CODE_PATTERN = Pattern.compile("\"\\d+\"");
	/** Closing tag used to cut the response content into one fragment per link. */
	private static final String LINK_END = "</a>";

	/**
	 * Walks province, city, county and school in nested order, appends one line
	 * per school to {@link #total}, then hands the list to
	 * {@code Concurrent.saveRequestTimes} and calls {@code testOver()}.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		Crawler crawler = new Crawler();
		crawler.getCountry1();// provinces
		for (Map.Entry<String, Integer> province : countrys.entrySet()) {
			crawler.getCountry2(province.getValue());// cities
			for (Map.Entry<String, Integer> city : citys.entrySet()) {
				crawler.getCountry3(city.getValue());// counties
				for (Map.Entry<String, Integer> county : address.entrySet()) {
					crawler.getCountry4(county.getValue());// school names
					for (String schoolName : school.keySet()) {
						String line = province.getKey() + PART + city.getKey() + PART + county.getKey() + PART
								+ schoolName;
						total.add(line);
					}
				}
			}
		}
		Concurrent.saveRequestTimes(total);
		testOver();
	}

	/**
	 * Queries the province list and adds the entries to {@link #countrys}.
	 * Existing entries are kept; entries with the same name are overwritten.
	 */
	public void getCountry1() {
		// httpGet.addHeader("Cookie", cookies);
		// httpGet.addHeader("User-Agent", userangent);
		countrys.putAll(fetchLinks(host + "/user/editinfo/getSchollCountryList"));
	}

	/**
	 * Queries the cities of a province and replaces the content of {@link #citys}.
	 *
	 * @param id province code
	 */
	public void getCountry2(int id) {
		replaceAll(citys, fetchLinks(host + "/user/editinfo/getSchollCityList?region_id=" + id));
	}

	/**
	 * Queries the counties of a city and replaces the content of {@link #address}.
	 *
	 * @param id city code
	 */
	public void getCountry3(int id) {
		replaceAll(address, fetchLinks(host + "/user/editinfo/getSchollAddressList?region_id=" + id));
	}

	/**
	 * Queries the schools of a county and replaces the content of {@link #school}.
	 *
	 * @param id county code
	 */
	public void getCountry4(int id) {
		replaceAll(school, fetchLinks(host + "/user/editinfo/getSchoolNameList?region_id=" + id));
	}

	/**
	 * Extracts the numeric code from a link fragment.
	 *
	 * @param text link fragment, e.g. the text preceding one {@code </a>}
	 * @return the first double-quoted run of digits converted to an int, or
	 *         {@code 0} when the fragment contains none
	 */
	public int getCode(String text) {
		Matcher matcher = CODE_PATTERN.matcher(text);
		if (!matcher.find()) {
			return 0;
		}
		return changeStringToInt(matcher.group(0).replace("\"", ""));
	}

	/**
	 * Extracts the display name from a link fragment.
	 *
	 * @param text link fragment, e.g. the text preceding one {@code </a>}
	 * @return everything after the last {@code >}; the whole text when it
	 *         contains no {@code >}
	 */
	public String getName(String text) {
		// lastIndexOf returns -1 when absent, so +1 yields 0 and the whole text is kept.
		return text.substring(text.lastIndexOf(">") + 1);
	}

	/**
	 * Requests {@code url} and parses the {@code content} field of the JSON
	 * response into a name-to-code map.
	 *
	 * <p>Fragments whose name is blank are skipped: splitting empty content, or
	 * content with markup after the last link, otherwise produces a phantom
	 * entry with an empty name and code 0.
	 */
	private Map<String, Integer> fetchLinks(String url) {
		HttpGet httpGet = getHttpGet(url);
		JSONObject response = getHttpResponseEntityByJson(httpGet);
		Map<String, Integer> links = new HashMap<>();
		for (String fragment : response.getString("content").split(LINK_END)) {
			String name = getName(fragment);
			if (name.trim().isEmpty()) {
				continue;
			}
			links.put(name, getCode(fragment));
		}
		return links;
	}

	/** Replaces the content of {@code target} with {@code fresh}. */
	private static void replaceAll(Map<String, Integer> target, Map<String, Integer> fresh) {
		target.clear();
		target.putAll(fresh);
	}

}

下面是爬取到的數據截圖

技術類文章精選

非技術文章精選

大咖風采

點擊查看公衆號地圖

相關文章
相關標籤/搜索