It's been two months at the new company. Averaging eleven hours a day with the constant overtime, I find myself missing the comfortable, cozy, easy life at my old employer. I haven't had time to read or write blog posts in ages, and I feel like I'm slipping. Yeah, I'm not happy about it.

Today, let me write down a few notes on scraping data.

Data scraping is commonplace these days. Plenty of people do it in Python; I'm admittedly still weak there, so I can only do it in Java. What follows is the serious part.
Things to pay attention to:

1. First, have a target: the page you want to scrape
2. The structure of the data on the target page
3. Whether the target site has an anti-crawler mechanism (i.e., it will ban your IP)
4. Parsing the data and writing it to the database
Getting an HttpClient
package com.zyt.creenshot.util;

import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.LayeredConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.springframework.stereotype.Component;

import javax.annotation.PostConstruct;
import javax.net.ssl.SSLContext;
import java.security.NoSuchAlgorithmException;

@Component
public class HttpConnectionManager {

    private PoolingHttpClientConnectionManager cm = null;

    @PostConstruct
    public void init() {
        LayeredConnectionSocketFactory sslsf = null;
        try {
            sslsf = new SSLConnectionSocketFactory(SSLContext.getDefault());
        } catch (NoSuchAlgorithmException e) {
            e.printStackTrace();
        }
        // Register plain and SSL socket factories so the pool can serve both http and https.
        Registry<ConnectionSocketFactory> socketFactoryRegistry = RegistryBuilder.<ConnectionSocketFactory>create()
                .register("https", sslsf)
                .register("http", new PlainConnectionSocketFactory())
                .build();
        cm = new PoolingHttpClientConnectionManager(socketFactoryRegistry);
        cm.setMaxTotal(200);          // max connections across all routes
        cm.setDefaultMaxPerRoute(20); // max connections per host
    }

    public CloseableHttpClient getHttpClient() {
        // Build a client backed by the shared connection pool.
        return HttpClients.custom()
                .setConnectionManager(cm)
                .build();
    }
}
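Before wiring this into the fetch helper below, it may help to see the pooled client used on its own. This is just a minimal sketch, with FetchExample being a hypothetical component of my own naming; the try-with-resources block is what closes the response and returns the connection to the pool.

package com.zyt.creenshot.util;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.util.EntityUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

// Hypothetical usage sketch: fetch one page with the pooled client.
@Component
public class FetchExample {

    @Autowired
    private HttpConnectionManager connManager;

    public String fetch(String url) throws Exception {
        CloseableHttpClient client = connManager.getHttpClient();
        HttpGet get = new HttpGet(url);
        // try-with-resources closes the response, which hands the
        // underlying connection back to the pool for reuse.
        try (CloseableHttpResponse response = client.execute(get)) {
            return EntityUtils.toString(response.getEntity(), "UTF-8");
        }
    }
}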
Page-fetching utility class. Many sites have anti-crawler mechanisms that limit how many requests you can make within a given window. If the data you're crawling changes constantly and freshness matters, you need proxy IPs.
package com.zyt.creenshot.util;

import org.apache.commons.collections.MapUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.util.EntityUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;

/**
 * @ClassName: DocumentHelper
 * @Description: <page-fetching utility class>
 * @Author: zhaiyutao
 * @Date: 2019/7/1 11:15
 * @Version: v1.0
 */
@Component
public class DocumentHelper {

    @Autowired
    HttpConnectionManager connManager;

    public String getProxyHttp(String url, String address, int port, String charset) {
        CloseableHttpResponse response = null;
        CloseableHttpClient httpClient = connManager.getHttpClient();
        try {
            // Build the GET request.
            HttpGet httpGet = new HttpGet(url);
            // Route the request through a proxy IP if one is given.
            httpGet = buildProxy(httpGet, address, port);
            Map<String, String> headerMap = new HashMap<String, String>();
            headerMap.put("Referer", "http://*********.com/");
            headerMap.put("Content-Type", "text/html; charset=utf-8");
            headerMap.put("User-Agent", UserAgentUtil.getRandomUserAgent());
            headerMap.put("Accept-Language", "zh-CN,zh;q=0.9");
            headerMap.put("Accept-Encoding", "gzip, deflate");
            // Attach the spoofed request headers.
            httpGet = buildRequestHeader(headerMap, httpGet);
            response = httpClient.execute(httpGet);
            response.addHeader("Content-Type", "text/html; charset=utf-8");
            // Read the response entity.
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                String content = EntityUtils.toString(entity);
                // EntityUtils defaults to ISO-8859-1 when the response declares no charset,
                // so re-decode the raw bytes with the caller-supplied charset (e.g. GBK).
                if (null != charset && !"".equals(charset)) {
                    content = new String(content.getBytes(StandardCharsets.ISO_8859_1), charset);
                }
                return content;
            }
        } catch (Exception e) {
            // Swallow the exception here; the caller retries, and errors are handled in one place.
            //log.error("proxy fetch failed url {} address {} port {}", url, address, port);
            return "";
        } finally {
            try {
                if (response != null) {
                    response.close();
                }
            } catch (Exception e) {
                // ignore close failures
            }
        }
        return "";
    }

    private static HttpGet buildProxy(HttpGet httpGet, String address, int port) {
        RequestConfig requestConfig = null;
        if (StringUtils.isNotEmpty(address)) {
            // Send the request through the given proxy IP.
            HttpHost proxy = new HttpHost(address, port);
            requestConfig = RequestConfig.custom()
                    .setProxy(proxy)
                    .setConnectTimeout(4000)
                    .setSocketTimeout(8000)
                    .setConnectionRequestTimeout(4000)
                    .build();
        } else {
            // No proxy: a direct request with the same timeouts.
            requestConfig = RequestConfig.custom()
                    .setConnectTimeout(4000)
                    .setSocketTimeout(8000)
                    .setConnectionRequestTimeout(4000)
                    .build();
        }
        httpGet.setConfig(requestConfig);
        return httpGet;
    }

    /**
     * Attach the request headers.
     * @param headerMap headers to set
     * @param httpGet request to decorate
     * @return the decorated request
     */
    private static HttpGet buildRequestHeader(Map<String, String> headerMap, HttpGet httpGet) {
        if (MapUtils.isNotEmpty(headerMap)) {
            for (Map.Entry<String, String> kv : headerMap.entrySet()) {
                httpGet.setHeader(kv.getKey(), kv.getValue());
            }
        }
        return httpGet;
    }
}
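Because getProxyHttp swallows exceptions and returns an empty string on failure, the caller is expected to retry, typically with a different proxy. Here is a minimal sketch of such a retry loop; the proxy addresses are placeholders I made up, and GBK is only an assumption about the target site's encoding.

package com.zyt.creenshot.util;

// Hypothetical helper showing how a caller might rotate proxies.
public class ProxyRetryExample {

    // Placeholder proxy addresses, not real endpoints.
    private static final String[][] PROXIES = {
            {"10.0.0.1", "8888"},
            {"10.0.0.2", "8888"}
    };

    public static String fetchWithRetry(DocumentHelper helper, String url) {
        for (String[] p : PROXIES) {
            String html = helper.getProxyHttp(url, p[0], Integer.parseInt(p[1]), "GBK");
            if (html != null && !html.isEmpty()) {
                return html; // first proxy that answers wins
            }
        }
        // Fall back to a direct request; buildProxy skips the proxy when the address is empty.
        return helper.getProxyHttp(url, "", 0, "GBK");
    }
}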
Request-header spoofing utility class
package com.zyt.creenshot.util;

import java.util.Random;

public class UserAgentUtil {

    private static final String[] USER_AGENT = {
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36", // Chrome
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36 OPR/60.0.3255.109", // Opera
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE", // 360
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36 Maxthon/5.2.7.3000", // Maxthon
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0", // Firefox
            "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50" // Safari
            //"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763" // IE/Edge
    };

    /**
     * Pick a random user-agent so requests don't all look identical.
     * @return a random user-agent string
     */
    public static String getRandomUserAgent() {
        Random random = new Random();
        int i = random.nextInt(USER_AGENT.length);
        return USER_AGENT[i];
    }
}
以下以爬取某個汽車網站數據爲例 固然表的建立須要本身根據須要設計 我就不貼具體表結構了
package com.zyt.creenshot.service.crawlerData.impl;

import com.zyt.creenshot.entity.CarBaseData;
import com.zyt.creenshot.mapper.CarBaseDataMapper;
import com.zyt.creenshot.service.crawlerData.ICrawlerData;
import com.zyt.creenshot.util.DocumentHelper;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import java.util.ArrayList;
import java.util.List;

/**
 * @ClassName: CrawlerDataImpl
 * @Description: <crawl the car site's base data>
 * @Author: zhaiyutao
 * @Date: 2019/7/8 17:48
 * @Version: v1.0
 */
@Component
@Slf4j
public class CrawlerDataImpl implements ICrawlerData {

    @Autowired
    private DocumentHelper documentHelper;

    @Autowired(required = false)
    private CarBaseDataMapper carBaseDataMapper;

    @Override
    public void crawlerCarBaseData() {
        String url = "***********URL to crawl*************";
        // getProxyHttp is an instance method: url, proxy address, proxy port, charset.
        String resultHtml = documentHelper.getProxyHttp(url, null, 0, "GBK");
        if (StringUtils.isEmpty(resultHtml)) {
            log.error("no data fetched from the site");
            return;
        }
        Document html = Jsoup.parse(resultHtml);
        // Parse the brand rows.
        Elements brandList = html.select("div[class=braRow]");
        if (!brandList.isEmpty()) {
            List<CarBaseData> listCar = new ArrayList<>();
            // Walk the top-level brands.
            for (Element brand : brandList) {
                Elements brandBig = brand.select("div[class=braRow-icon]");
                // Top-level brand name and logo.
                String brandName = brandBig.select("p").text().replace("?", "·");
                String brandPic = brandBig.select("img[src]").attr("src");
                Elements smallBrandList = brand.select("div[class=modA noBorder]");
                for (Element sb : smallBrandList) {
                    Elements brandItem = sb.select("div[class=thA]");
                    // Sub-brand name.
                    String brandSmallName = brandItem.select("a[href]").text();
                    Elements sbInner = sb.select("div[class=tbA ]");
                    for (Element in : sbInner) {
                        dealCarData(listCar, brandName, brandPic, brandSmallName, in);
                    }
                    Elements sbInnerNother = sb.select("div[class=tbA mt10 noBorder]");
                    for (Element inner : sbInnerNother) {
                        dealCarData(listCar, brandName, brandPic, brandSmallName, inner);
                    }
                }
            }
            // Flush whatever is left over after the 500-row batches.
            if (CollectionUtils.isNotEmpty(listCar)) {
                carBaseDataMapper.insertBatch(listCar);
            }
        }
    }

    private void dealCarData(List<CarBaseData> listCar, String brandName, String brandPic,
                             String brandSmallName, Element in) {
        // Strip the trailing "(...)" note from the series name; the paren is full-width.
        String carTypeName = in.select("p[class=stit]").text().split("(")[0];
        Elements li = in.select("li");
        for (Element element : li) {
            Element tit = element.select("p[class=tit]").get(0);
            Element price = element.select("p[class=price]").get(0);
            Elements carHref = tit.select("a[href]");
            String priceStr = price.text();
            if (!carHref.isEmpty()) {
                String href = carHref.attr("href");
                if (StringUtils.isEmpty(href)) {
                    continue;
                }
                String carName = carHref.attr("title");
                // The car id sits between the leading and trailing slashes of the href.
                String carId = StringUtils.substring(href, 1, href.length() - 1);
                CarBaseData carBaseData = new CarBaseData();
                carBaseData.setCarId(carId);
                carBaseData.setCarName(carName);
                carBaseData.setBrandName(brandName);
                carBaseData.setBrandPic(brandPic);
                carBaseData.setSubBrandName(brandSmallName);
                carBaseData.setCarType(carTypeName);
                carBaseData.setCarPrice(priceStr);
                listCar.add(carBaseData);
            }
            // Flush to the database every 500 rows to keep memory bounded.
            if (listCar.size() >= 500) {
                carBaseDataMapper.insertBatch(listCar);
                listCar.clear();
            }
        }
    }
}
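The post doesn't show CarBaseDataMapper itself. For completeness, here is a hedged sketch of what insertBatch might look like with MyBatis annotations; the table and column names are my assumptions, not the real schema.

package com.zyt.creenshot.mapper;

import com.zyt.creenshot.entity.CarBaseData;
import org.apache.ibatis.annotations.Insert;
import org.apache.ibatis.annotations.Mapper;

import java.util.List;

@Mapper
public interface CarBaseDataMapper {

    // Hypothetical batch insert; table and column names are guesses,
    // adjust them to your actual schema.
    @Insert("<script>" +
            "INSERT INTO car_base_data " +
            "(car_id, car_name, brand_name, brand_pic, sub_brand_name, car_type, car_price) VALUES " +
            "<foreach collection='list' item='c' separator=','>" +
            "(#{c.carId}, #{c.carName}, #{c.brandName}, #{c.brandPic}, #{c.subBrandName}, #{c.carType}, #{c.carPrice})" +
            "</foreach>" +
            "</script>")
    void insertBatch(List<CarBaseData> list);
}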
The crawled data: