使用httpclient爬取某狐招聘信息(爬蟲)

使用httpclient爬取某狐的招聘信息,也可以換成其他網站。

import java.io.IOException;
import java.util.Collection;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.ResponseHandler;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

/**
 * Fetches a recruitment listing page with Apache HttpClient and prints the
 * extracted postings that mention "Java". The target URL can be replaced
 * with another site (the extraction pattern would then need adjusting).
 *
 * @author David
 */
public class HttpUtil {
	/**
	 * Extraction pattern for a posting entry: three hyphen-separated segments
	 * built from characters in the range U+4E00-U+9FA5, where the second and
	 * third segments may be prefixed by ASCII word characters. Compiled once
	 * because the expression never changes.
	 */
	private static final Pattern POSTING_PATTERN = Pattern
			.compile("[\\u4e00-\\u9fa5]+-\\w*[\\u4e00-\\u9fa5]+-\\w*[\\u4e00-\\u9fa5]*");

	/** URL of the listing request issued by {@link #get()}. */
	String sohuURL = "http://hr.sohu.com/wt/sohu/web/templet1000/index/corpwebPosition1000sohu!getPostListByCondition?pc.rowSize=1000&recruitType=2&keyWord=&positionType=&workPlace=&releaseTime=&trademark=0&brandCode=1&comPart=&showComp=true&searchSuffix=";

	/** JUnit entry point that simply runs {@link #get()}. */
	@org.junit.Test
	public void test() {
		get();
	}

	/**
	 * Requests {@link #sohuURL} with browser-like headers and, on a 200
	 * response with a body, hands the UTF-8 decoded body to the parser.
	 * I/O and protocol failures (including a non-200 status) are reported via
	 * {@code printStackTrace()} and not rethrown.
	 */
	public void get() {
		HttpUriRequest request = new HttpGet(sohuURL);
		// Only one Accept header is sent; the original added a second,
		// narrower one that the "*/*" entry here already covers.
		request.addHeader("Accept",
				"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
		request.addHeader("Accept-Encoding", "gzip, deflate");
		request.addHeader("Accept-Language", "zh-CN,zh;q=0.8");
		request.addHeader("Connection", "keep-alive");
		request.addHeader("User-Agent",
				"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36");

		// try-with-resources closes the client (and its connections) on every path.
		try (CloseableHttpClient client = HttpClients.createDefault()) {
			// The handler returns the decoded body rather than the response
			// object, so nothing refers to the response after execute() returns.
			String body = client.execute(request, new ResponseHandler<String>() {
				@Override
				public String handleResponse(HttpResponse resp) throws ClientProtocolException, IOException {
					int status = resp.getStatusLine().getStatusCode();
					if (status != HttpStatus.SC_OK) {
						// Surface the failure instead of silently doing nothing.
						throw new ClientProtocolException("Unexpected response status: " + status);
					}
					HttpEntity entity = resp.getEntity();
					// A response may legitimately carry no entity.
					return entity == null ? null : EntityUtils.toString(entity, "UTF-8");
				}
			});
			if (body != null) {
				parseHtml(body);
			}
		} catch (IOException e) {
			// ClientProtocolException is an IOException, so one handler covers both.
			e.printStackTrace();
		}
	}

	/**
	 * Extracts all posting entries from the page text, de-duplicates and sorts
	 * them, and prints those containing "Java" to standard output.
	 *
	 * @param data raw page content; {@code null} or empty input prints nothing
	 */
	private void parseHtml(String data) {
		// Plain regular-expression extraction; an HTML parser could be
		// substituted for more robust parsing.
		if (data == null || data.isEmpty()) {
			return;
		}
		// A sorted set gives the same de-duplicated, ordered result the
		// original obtained from a TreeMap keyed by the match itself.
		Set<String> postings = new TreeSet<>();
		Matcher matcher = POSTING_PATTERN.matcher(data);
		while (matcher.find()) {
			postings.add(matcher.group());
		}
		for (String posting : postings) {
			if (posting.contains("Java")) {
				System.out.println(posting);
			}
		}
	}
}
相關文章
相關標籤/搜索