使用 HttpClient 爬取某狐的招聘信息,可以換成其他網站。
import java.io.IOException; import java.util.Collection; import java.util.TreeMap; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; import org.apache.http.HttpStatus; import org.apache.http.client.ClientProtocolException; import org.apache.http.client.ResponseHandler; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpUriRequest; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils; /** * 使用httpclient爬取某狐的招聘信息,能夠換成其餘網站 * * @author David */ public class HttpUtil { String sohuURL = "http://hr.sohu.com/wt/sohu/web/templet1000/index/corpwebPosition1000sohu!getPostListByCondition?pc.rowSize=1000&recruitType=2&keyWord=&positionType=&workPlace=&releaseTime=&trademark=0&brandCode=1&comPart=&showComp=true&searchSuffix="; @org.junit.Test public void test() { get(); } public void get() { CloseableHttpClient client = HttpClients.createDefault(); HttpUriRequest request = new HttpGet(sohuURL); request.addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"); request.addHeader("Accept-Encoding", "gzip, deflate"); request.addHeader("Accept-Language", "zh-CN,zh;q=0.8"); request.addHeader("Connection", "keep-alive"); request.addHeader("Accept", "text/plain, text/html"); request.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36"); try { HttpResponse execute = client.execute(request, new ResponseHandler<HttpResponse>() { @Override public HttpResponse handleResponse(HttpResponse resp) throws ClientProtocolException, IOException { if (resp.getStatusLine().getStatusCode() == HttpStatus.SC_OK) { HttpEntity entity = resp.getEntity(); String st = EntityUtils.toString(entity, "UTF-8"); parseHtml(st); } return resp; } }); } catch 
(ClientProtocolException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } private void parseHtml(String data) { // 僅簡單的使用正則表達式,能夠換成HTMLparser,HTMLclear來解析數據 Pattern compile = Pattern.compile("[\\u4e00-\\u9fa5]+-\\w*[\\u4e00-\\u9fa5]+-\\w*[\\u4e00-\\u9fa5]*"); Matcher matcher = compile.matcher(data); TreeMap<String, String> hashMap = new TreeMap<String, String>(); while (matcher.find()) { String group = matcher.group(); hashMap.put(group, group); } Collection<String> values = hashMap.values(); for (String string : values) { if (string.indexOf("Java") != -1) System.out.println(string); } } }