Scraping website data with Java

I've been at this company for two months now. Averaging eleven-hour days with all the overtime, I keep thinking back to the comfortable, cozy life at my old employer. It's been ages since I had time to read or write blog posts, and I feel like I've regressed. Yeah, I'm not happy about it.

 

Today let me jot down a few notes on data scraping.

 

Data scraping is commonplace these days. Plenty of people do it in Python; I'm admittedly not that versatile, so Java it is. What follows is the serious part.

 

Things to pay attention to:

1. First, have a target: the page you want to scrape
2. The structure of the data on the target page
3. Whether the target site has an anti-crawler mechanism (i.e., whether it will ban your IP)
4. Parsing the data and writing it to the database

Getting an HttpClient

package com.zyt.creenshot.util;

import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.LayeredConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.springframework.stereotype.Component;

import javax.annotation.PostConstruct;
import javax.net.ssl.SSLContext;
import java.security.NoSuchAlgorithmException;

@Component
public class HttpConnectionManager {

    private PoolingHttpClientConnectionManager cm = null;

    @PostConstruct
    public void init() {
        // Socket factory for HTTPS, backed by the JVM's default SSL context.
        LayeredConnectionSocketFactory sslsf = null;
        try {
            sslsf = new SSLConnectionSocketFactory(SSLContext.getDefault());
        } catch (NoSuchAlgorithmException e) {
            e.printStackTrace();
        }

        Registry<ConnectionSocketFactory> socketFactoryRegistry = RegistryBuilder.<ConnectionSocketFactory>create()
                .register("https", sslsf)
                .register("http", new PlainConnectionSocketFactory())
                .build();
        // One shared pool: at most 200 connections overall, 20 per route (host).
        cm = new PoolingHttpClientConnectionManager(socketFactoryRegistry);
        cm.setMaxTotal(200);
        cm.setDefaultMaxPerRoute(20);
    }

    public CloseableHttpClient getHttpClient() {
        // Mark the manager as shared so closing a client doesn't shut the pool down.
        return HttpClients.custom()
                .setConnectionManager(cm)
                .setConnectionManagerShared(true)
                .build();
    }
}
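
Any other bean can now inject the manager and pull a client off the shared pool. A minimal usage sketch (the StatusChecker class below is purely illustrative, not part of the project):

package com.zyt.creenshot.util;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import java.io.IOException;

/** Illustrative caller: fetches the HTTP status of a URL via the pooled client. */
@Component
public class StatusChecker {

    @Autowired
    private HttpConnectionManager connManager;

    public int statusOf(String url) throws IOException {
        CloseableHttpClient client = connManager.getHttpClient();
        // Close only the response; the shared pool stays up for the next request.
        try (CloseableHttpResponse resp = client.execute(new HttpGet(url))) {
            return resp.getStatusLine().getStatusCode();
        }
    }
}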

 

The page-scraping utility class. Many sites have anti-crawler mechanisms that cap how many requests you can make per unit of time; if the data you're scraping changes constantly and freshness matters, you'll need proxy IPs (see the rotation sketch right after the class).

package com.zyt.creenshot.util;

import org.apache.commons.collections.MapUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.util.EntityUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import java.util.HashMap;
import java.util.Map;

/**
 * @ClassName: DocumentHelper
 * @Description: page-scraping utility class
 * @Author: zhaiyutao
 * @Date: 2019/7/1 11:15
 * @Version: v1.0
 */
@Component
public class DocumentHelper {

    @Autowired
    HttpConnectionManager connManager;

    public String getProxyHttp(String url, String address, int port, String charset) {
        CloseableHttpResponse response = null;
        CloseableHttpClient httpClient = connManager.getHttpClient();
        try {
            // Build the GET request.
            HttpGet httpGet = new HttpGet(url);
            // Attach the proxy (if any) and the timeouts.
            httpGet = buildProxy(httpGet, address, port);
            Map<String, String> headerMap = new HashMap<>();
            headerMap.put("Referer", "http://*********.com/");
            headerMap.put("Content-Type", "text/html; charset=utf-8");
            headerMap.put("User-Agent", UserAgentUtil.getRandomUserAgent());
            headerMap.put("Accept-Language", "zh-CN,zh;q=0.9");
            headerMap.put("Accept-Encoding", "gzip, deflate");
            // Apply the request headers.
            httpGet = buildRequestHeader(headerMap, httpGet);
            response = httpClient.execute(httpGet);
            // Read the body; the caller-supplied charset (e.g. "GBK") is used
            // as the default when the response doesn't declare one.
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                if (StringUtils.isNotEmpty(charset)) {
                    return EntityUtils.toString(entity, charset);
                }
                return EntityUtils.toString(entity);
            }
        } catch (Exception e) {
            // Swallow the exception and return an empty string; the caller
            // simply retries, and failures are handled in one place.
            //log.error("proxy request failed: url {} address {} port {}", url, address, port);
            return "";
        } finally {
            try {
                if (response != null) {
                    response.close();
                }
            } catch (Exception ignored) {
            }
        }
        return "";
    }

    private static HttpGet buildProxy(HttpGet httpGet, String address, int port) {
        RequestConfig requestConfig;
        if (StringUtils.isNotEmpty(address)) {
            // Route the request through the given proxy.
            HttpHost proxy = new HttpHost(address, port);
            requestConfig = RequestConfig.custom()
                    .setProxy(proxy)
                    .setConnectTimeout(4000)
                    .setSocketTimeout(8000)
                    .setConnectionRequestTimeout(4000)
                    .build();
        } else {
            // No proxy; just apply the timeouts.
            requestConfig = RequestConfig.custom()
                    .setConnectTimeout(4000)
                    .setSocketTimeout(8000)
                    .setConnectionRequestTimeout(4000)
                    .build();
        }
        httpGet.setConfig(requestConfig);
        return httpGet;
    }

    /**
     * Apply the given headers to the request.
     * @param headerMap headers to set
     * @param httpGet   request to decorate
     * @return the decorated request
     */
    private static HttpGet buildRequestHeader(Map<String, String> headerMap, HttpGet httpGet) {
        if (MapUtils.isNotEmpty(headerMap)) {
            for (Map.Entry<String, String> kv : headerMap.entrySet()) {
                httpGet.setHeader(kv.getKey(), kv.getValue());
            }
        }
        return httpGet;
    }

}
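
Rotation itself is then straightforward, since getProxyHttp already takes an address and port; all that's missing is something to hand out a different proxy per request. A minimal round-robin sketch (the ProxyPool class and its addresses are hypothetical; a real list would be refreshed from a proxy provider's API):

package com.zyt.creenshot.util;

import java.util.Arrays;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;

public class ProxyPool {

    /** Simple value holder for a proxy endpoint. */
    public static class ProxyAddr {
        public final String address;
        public final int port;

        public ProxyAddr(String address, int port) {
            this.address = address;
            this.port = port;
        }
    }

    // Placeholder entries; swap in real proxies from your provider.
    private final List<ProxyAddr> proxies = Arrays.asList(
            new ProxyAddr("10.0.0.1", 8080),
            new ProxyAddr("10.0.0.2", 8080));

    private final AtomicInteger cursor = new AtomicInteger();

    /** Hands out proxies round-robin so no single IP absorbs all the traffic. */
    public ProxyAddr next() {
        int i = Math.floorMod(cursor.getAndIncrement(), proxies.size());
        return proxies.get(i);
    }
}

Each request then becomes something like: ProxyPool.ProxyAddr p = pool.next(); documentHelper.getProxyHttp(url, p.address, p.port, "GBK");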

 

Request-header spoofing utility class

package com.zyt.creenshot.util;

import java.util.Random;

public class UserAgentUtil {

    private static final Random RANDOM = new Random();

    private static final String[] USER_AGENT = {
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36", // Chrome
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36 OPR/60.0.3255.109", // Opera
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE", // 360
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36 Maxthon/5.2.7.3000", // Maxthon
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0", // Firefox
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50" // Safari
        //"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763" // Edge
    };

    /**
     * Randomly pick a User-Agent so consecutive requests don't all advertise
     * the same browser.
     * @return a User-Agent string
     */
    public static String getRandomUserAgent() {
        return USER_AGENT[RANDOM.nextInt(USER_AGENT.length)];
    }

}

 

Below, as an example, we scrape data from a car website. The table design is of course up to you, so I won't paste the exact schema; a rough sketch of the pieces the crawler depends on follows.
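
Every field in this sketch is inferred from the setters used in the crawler below, and the MyBatis-style mapper with a multi-row insertBatch is an assumption, not the actual project code:

package com.zyt.creenshot.entity;

import lombok.Data;

/** Sketch only: fields inferred from the setters the crawler calls. */
@Data
public class CarBaseData {
    private String carId;        // id segment parsed out of the detail-page href
    private String carName;      // model name (the link's title attribute)
    private String brandName;    // top-level brand
    private String brandPic;     // brand logo URL
    private String subBrandName; // sub-brand / series
    private String carType;      // model family from the section title
    private String carPrice;     // price text exactly as shown on the page
}

And the matching mapper interface:

package com.zyt.creenshot.mapper;

import com.zyt.creenshot.entity.CarBaseData;
import org.apache.ibatis.annotations.Mapper;

import java.util.List;

/** Sketch only: assumes MyBatis, with insertBatch mapped to a multi-row INSERT. */
@Mapper
public interface CarBaseDataMapper {
    void insertBatch(List<CarBaseData> list);
}

The crawler itself: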

package com.zyt.creenshot.service.crawlerData.impl;

import com.zyt.creenshot.entity.CarBaseData;
import com.zyt.creenshot.mapper.CarBaseDataMapper;
import com.zyt.creenshot.service.crawlerData.ICrawlerData;
import com.zyt.creenshot.util.DocumentHelper;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import java.util.ArrayList;
import java.util.List;

/**
 * @ClassName: CrawlerDataImpl
 * @Description: crawls and stores basic car data from the target site
 * @Author: zhaiyutao
 * @Date: 2019/7/8 17:48
 * @Version: v1.0
 */
@Component
@Slf4j
public class CrawlerDataImpl implements ICrawlerData {

    @Autowired
    private DocumentHelper documentHelper;

    @Autowired(required = false)
    private CarBaseDataMapper carBaseDataMapper;


    @Override
    public void crawlerCarBaseData() {

        String url = "***********the URL to scrape*************";
        String resultHtml = documentHelper.getProxyHttp(url, null, 0, "GBK");
        if (StringUtils.isEmpty(resultHtml)) {
            log.error("no data scraped from the target site");
            return;
        }
        Document html = Jsoup.parse(resultHtml);
        // Parse the brand rows.
        Elements brandList = html.select("div[class=braRow]");
        if (!brandList.isEmpty()) {
            List<CarBaseData> listCar = new ArrayList<>();
            // Walk the top-level brands.
            for (Element brand : brandList) {
                Elements brandBig = brand.select("div[class=braRow-icon]");
                // Top-level brand name and logo; restore "·" where the page decodes it as "?".
                String brandName = brandBig.select("p").text().replace("?", "·");
                String brandPic = brandBig.select("img[src]").attr("src");

                Elements smallBrandList = brand.select("div[class=modA noBorder]");
                for (Element sb : smallBrandList) {
                    Elements brandItem = sb.select("div[class=thA]");
                    // Sub-brand (series) name.
                    String brandSmallName = brandItem.select("a[href]").text();

                    // The model blocks come wrapped in two flavours of container div.
                    Elements sbInner = sb.select("div[class=tbA ]");
                    for (Element in : sbInner) {
                        dealCarData(listCar, brandName, brandPic, brandSmallName, in);
                    }
                    Elements sbInnerNother = sb.select("div[class=tbA mt10 noBorder]");
                    for (Element inner : sbInnerNother) {
                        dealCarData(listCar, brandName, brandPic, brandSmallName, inner);
                    }
                }
            }
            // Flush whatever is left after the loop.
            if (CollectionUtils.isNotEmpty(listCar)) {
                carBaseDataMapper.insertBatch(listCar);
            }
        }
    }

    private void dealCarData(List<CarBaseData> listCar, String brandName, String brandPic, String brandSmallName, Element in) {
        // The section title looks like "name(...)"; String.split takes a regex,
        // so the "(" has to be escaped.
        String carTypeName = in.select("p[class=stit]").text().split("\\(")[0];
        Elements li = in.select("li");
        for (Element element : li) {
            Element tit = element.select("p[class=tit]").get(0);
            Element price = element.select("p[class=price]").get(0);
            Elements carHref = tit.select("a[href]");
            String priceStr = price.text();
            if (!carHref.isEmpty()) {
                String href = carHref.attr("href");
                if (StringUtils.isEmpty(href)) {
                    continue;
                }
                String carName = carHref.attr("title");
                // Drop the first and last character (the surrounding "/") to get the car id.
                String carId = StringUtils.substring(href, 1, href.length() - 1);
                CarBaseData carBaseData = new CarBaseData();
                carBaseData.setCarId(carId);
                carBaseData.setCarName(carName);
                carBaseData.setBrandName(brandName);
                carBaseData.setBrandPic(brandPic);
                carBaseData.setSubBrandName(brandSmallName);
                carBaseData.setCarType(carTypeName);
                carBaseData.setCarPrice(priceStr);
                listCar.add(carBaseData);
            }
            // Write in batches of 500 so a single INSERT never gets too large.
            if (listCar.size() >= 500) {
                carBaseDataMapper.insertBatch(listCar);
                listCar.clear();
            }
        }
    }
}
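
To actually run it, the service just needs a trigger. A hedged sketch using Spring's scheduler (the CrawlerTask class and the cron expression are made up, and @EnableScheduling must be enabled somewhere in the application):

package com.zyt.creenshot.task;

import com.zyt.creenshot.service.crawlerData.ICrawlerData;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

/** Hypothetical trigger: refresh the car base data every day at 03:00. */
@Component
public class CrawlerTask {

    @Autowired
    private ICrawlerData crawlerData;

    @Scheduled(cron = "0 0 3 * * ?")
    public void refreshCarBaseData() {
        crawlerData.crawlerCarBaseData();
    }
}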

The scraped data:
