java爬蟲案例學習

最近幾天比較空閒,學習了一下 Java 爬蟲,在此記錄自己在做這個案例的過程中遇到的問題和一些體會。
1.學習目標
         練習爬取京東的數據:圖片、價格、標題等等

2.學習過程

      1·開發工具
           JDK1.8
           IntelliJ IDEA
           IDEA自帶的Maven

       2.使用技術
           Spring Boot+Spring Data JPA
       3.數據庫準備

-- Table holding one row per crawled JD product variant (SKU).
-- `price` is DECIMAL(10,2) rather than BIGINT so that fractional prices
-- are stored exactly; an integer column would drop the digits after the
-- decimal point of the values the crawler saves.
CREATE TABLE `jd_item` (
  `id` bigint(10) NOT NULL AUTO_INCREMENT COMMENT '主鍵id',
  `spu` bigint(15) DEFAULT NULL COMMENT '商品集合id',
  `sku` bigint(15) DEFAULT NULL COMMENT '商品最小品類單元id',
  `title` varchar(100) DEFAULT NULL COMMENT '商品標題',
  `price` decimal(10,2) DEFAULT NULL COMMENT '商品價格',
  `pic` varchar(200) DEFAULT NULL COMMENT '商品圖片',
  `url` varchar(200) DEFAULT NULL COMMENT '商品詳情地址',
  `created` datetime DEFAULT NULL COMMENT '建立時間',
  `updated` datetime DEFAULT NULL COMMENT '更新時間',
  PRIMARY KEY (`id`),
  -- Secondary index: the crawler looks rows up by sku to detect duplicates.
  KEY `sku` (`sku`) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8 COMMENT='京東商品表';

        4.添加依賴

<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the JD crawler sample project. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <!-- Inherit build defaults from the Spring Boot parent POM. -->
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.0.2.RELEASE</version>
    </parent>
    <groupId>cn.itcast.crawler</groupId>
    <artifactId>itcast-crawler-jd</artifactId>
    <version>1.0-SNAPSHOT</version>

    <!-- Dependencies without an explicit <version> presumably take theirs from
         the parent's dependency management (verify against the parent POM). -->
    <dependencies>
        <!-- Spring MVC / web starter -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>

        <!-- Spring Data JPA starter -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-jpa</artifactId>
        </dependency>

        <!-- MySQL JDBC driver -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
        </dependency>

        <!-- Apache HttpClient, used to fetch pages and images -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
        </dependency>

        <!-- Jsoup HTML parser (version pinned explicitly here) -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.3</version>
        </dependency>

        <!-- General-purpose utilities (Apache Commons Lang 3) -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
        </dependency>
    </dependencies>
</project>

   5.添加配置文件

# DB configuration: JDBC driver, connection URL (local MySQL, schema "crawler")
# and the credentials used to connect.
spring.datasource.driverClassName=com.mysql.jdbc.Driver
spring.datasource.url=jdbc:mysql://127.0.0.1:3306/crawler
spring.datasource.username=root
spring.datasource.password=root

# JPA configuration: target database type, and print the executed SQL statements.
spring.jpa.database=MySQL
spring.jpa.show-sql=true

  6.代碼實現

1.pojo

/**
 * JPA entity mapped to the {@code jd_item} table; one instance describes one
 * crawled product variant.
 */
@Entity
@Table(name = "jd_item")
public class Item {
    // Primary key, generated by the database (identity / auto-increment).
    @Id
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    private Long id;
    // Standard Product Unit: id of the product group this variant belongs to.
    private Long spu;
    // Stock Keeping Unit: id of the smallest sellable variant.
    private Long sku;
    // Product title.
    private String title;
    // Product price.
    private Double price;
    // Product image (the crawler stores the saved image's file name here).
    private String pic;
    // URL of the product detail page.
    private String url;
    // Creation time.
    private Date created;
    // Last update time.
    private Date updated; 
// NOTE: the next line is the article's placeholder for the getter and setter
// of every field above (setSku, setSpu, getUrl, getCreated, ...); it is not Java.
set/get
}

2.編寫dao

/**
 * Repository for {@link Item} entities with a {@code Long} primary key.
 * Declares no methods of its own; everything is inherited from
 * {@link JpaRepository}.
 */
public interface ItemDao extends JpaRepository<Item, Long> {
}

3.編寫service接口

/**
 * Service-layer contract for querying and storing {@link Item} records.
 */
public interface ItemService {

    /**
     * Looks up items using the given item as the search criteria.
     *
     * @param item the item carrying the criteria to match
     * @return the matching items
     */
    List<Item> findAll(Item item);

    /**
     * Stores the given item.
     *
     * @param item the item to save
     */
    void save(Item item);
}

4.ItemServiceImpl實現類

/**
 * {@link ItemService} implementation backed by {@link ItemDao}.
 */
@Service
public class ItemServiceImpl implements ItemService {

    @Autowired
    private ItemDao itemDao;

    /**
     * Runs a query-by-example using the given item as the probe.
     *
     * @param item probe object holding the values to match
     * @return the items matching the probe
     */
    @Override
    public List<Item> findAll(Item item) {
        // Typed Example instead of the raw type, so no unchecked conversion is needed.
        Example<Item> example = Example.of(item);
        return this.itemDao.findAll(example);
    }

    /**
     * Saves the given item inside a transaction.
     *
     * @param item the item to persist
     */
    @Override
    @Transactional
    public void save(Item item) {
        this.itemDao.save(item);
    }
}

5.編寫引導類

/**
 * Bootstrap class of the crawler application.
 * {@code @EnableScheduling} switches on scheduled-task support.
 */
@EnableScheduling
@SpringBootApplication
public class Application {

    /**
     * Program entry point: hands control to Spring Boot.
     *
     * @param args command-line arguments, passed through unchanged
     */
    public static void main(String[] args) {
        SpringApplication.run(Application.class, args);
    }
}

6. 封裝HttpClient

/**
 * Small HTTP helper built on Apache HttpClient with a pooled connection manager.
 * Offers one method to fetch a response body as text and one to download an
 * image to disk.
 */
@Component
public class HttpUtils {

    private PoolingHttpClientConnectionManager cm;

    /**
     * Creates the shared connection pool: at most 200 connections in total and
     * at most 20 per route.
     */
    public HttpUtils() {
        this.cm = new PoolingHttpClientConnectionManager();

        // Maximum number of connections in the pool.
        cm.setMaxTotal(200);

        // Maximum number of concurrent connections per route (host).
        cm.setDefaultMaxPerRoute(20);
    }

    /**
     * Fetches the given URL and returns the response body decoded as UTF-8.
     *
     * @param url the address to request
     * @return the body text; an empty string when a 200 response carries no
     *         entity; {@code null} when the status is not 200 or the request fails
     */
    public String getHtml(String url) {
        // The client is backed by the shared pool and is deliberately NOT closed:
        // closing it would shut the connection manager down as well.
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();

        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(this.getConfig());

        // try-with-resources closes the response on every path.
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                String html = "";

                // EntityUtils.toString fails on a null entity, so check first.
                if (response.getEntity() != null) {
                    html = EntityUtils.toString(response.getEntity(), "UTF-8");
                }

                return html;
            }
        } catch (Exception e) {
            e.printStackTrace();
        }

        return null;
    }

    /**
     * Downloads the image at the given URL into {@code D:/images/} under a
     * random UUID-based file name that keeps the URL's extension.
     *
     * @param url the image address; the text from its last '.' is used as the extension
     * @return the generated file name, or {@code null} when the status is not 200,
     *         the response has no body, or the download fails
     */
    public String getImage(String url) {
        // Pool-backed client; deliberately not closed (see getHtml).
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();

        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(this.getConfig());

        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null) {
                // File extension taken from the URL, e.g. ".jpg".
                String extName = url.substring(url.lastIndexOf("."));
                // Random name so downloads never overwrite each other.
                String imageName = UUID.randomUUID().toString() + extName;

                // Close the file stream once the body is written; leaving it open
                // leaks a file handle per downloaded image.
                try (OutputStream outstream = new FileOutputStream(new File("D:/images/" + imageName))) {
                    response.getEntity().writeTo(outstream);
                }

                return imageName;
            }
        } catch (Exception e) {
            e.printStackTrace();
        }

        return null;
    }

    /**
     * Builds the per-request timeout settings.
     *
     * @return the request configuration applied to every GET
     */
    private RequestConfig getConfig() {
        RequestConfig config = RequestConfig.custom().setConnectTimeout(1000) // time allowed to establish the connection (ms)
                .setConnectionRequestTimeout(500) // time allowed to obtain a connection from the pool (ms)
                .setSocketTimeout(10000) // time allowed to wait for response data on the socket (ms)
                .build();

        return config;
    }

}

7. 實現數據抓取

/**
 * Scheduled crawler: walks JD search-result pages for phones, and for every
 * SKU not yet stored fetches its title, price and image and saves an
 * {@link Item}.
 */
@Component
public class ItemTask {

    @Autowired
    private HttpUtils httpUtils;
    @Autowired
    private ItemService itemService;

    public static final ObjectMapper MAPPER = new ObjectMapper();


    /**
     * Crawls the result pages 1, 3, 5, 7 and 9. Runs again 100 seconds after
     * the previous run has finished.
     *
     * @throws Exception if parsing or saving fails
     */
    @Scheduled(fixedDelay = 1000 * 100)
    public void process() throws Exception {
        // Search URL found by inspecting the site: "page" starts at 1 and the
        // next page is reached by adding 2.
        String url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&cid2=653&cid3=655&s=5760&click=0&page=";

        for (int i = 1; i < 10; i = i + 2) {
            String html = this.httpUtils.getHtml(url + i);

            // getHtml returns null when the request fails; Jsoup.parse rejects
            // null, so skip this page instead of aborting the whole run.
            if (html == null) {
                continue;
            }

            this.parseHtml(html);
        }
        System.out.println("執行完成");
    }


    /**
     * Parses one search-result page and stores every SKU that is not in the
     * database yet.
     *
     * @param html the page markup, must not be null
     * @throws Exception if the price JSON cannot be parsed or saving fails
     */
    private void parseHtml(String html) throws Exception {
        Document document = Jsoup.parse(html);

        // One <li> per product group (SPU).
        Elements spus = document.select("div#J_goodsList > ul > li");

        for (Element spuEle : spus) {
            // Left null when the attribute is blank or not numeric.
            Long spuId = parseLongOrNull(spuEle.attr("data-spu"));

            // One <img> per variant (SKU) of this product.
            Elements skus = spuEle.select("li.ps-item img");
            for (Element skuEle : skus) {
                Long skuId = parseLongOrNull(skuEle.attr("data-sku"));
                if (skuId == null) {
                    // Without a SKU there is nothing to look up or link to.
                    continue;
                }

                // A SKU that is already stored has been crawled before.
                Item param = new Item();
                param.setSku(skuId);
                List<Item> list = this.itemService.findAll(param);
                if (!list.isEmpty()) {
                    continue;
                }

                Item item = new Item();
                item.setSpu(spuId);
                item.setSku(skuId);
                item.setUrl("https://item.jd.com/" + skuId + ".html");
                item.setCreated(new Date());
                item.setUpdated(item.getCreated());

                // Title comes from the product detail page. If a fetch fails the
                // item is skipped without saving, so a later run retries it.
                String itemHtml = this.httpUtils.getHtml(item.getUrl());
                if (itemHtml == null) {
                    continue;
                }
                String title = Jsoup.parse(itemHtml).select("div.sku-name").text();
                item.setTitle(title);

                // Price comes from a separate JSON endpoint.
                String priceUrl = "https://p.3.cn/prices/mgets?skuIds=J_" + skuId;
                String priceJson = this.httpUtils.getHtml(priceUrl);
                if (priceJson == null) {
                    continue;
                }
                double price = MAPPER.readTree(priceJson).get(0).get("p").asDouble();
                item.setPrice(price);

                // Image address: swap the "/n9/" path segment for "/n1/".
                // The download happens last so that an item skipped above does
                // not leave an orphan image file behind.
                String pic = "https:" + skuEle.attr("data-lazy-img").replace("/n9/", "/n1/");
                System.out.println(pic);
                String picName = this.httpUtils.getImage(pic);
                item.setPic(picName);

                this.itemService.save(item);
            }
        }
    }

    /** Parses a decimal long, returning null for blank or non-numeric text. */
    private static Long parseLongOrNull(String text) {
        if (text == null) {
            return null;
        }
        try {
            return Long.valueOf(text.trim());
        } catch (NumberFormatException e) {
            return null;
        }
    }
}

3.結果

4.總結全文

在寫代碼的時候遇到了以下問題:首先打開了京東要抓取的頁面 url="xxxxxx",可是在抓取的時候總是抓取不到數據,循環裏面的 spus 總是爲 0。經過我多方查找資料,把上面的代碼修改爲以下代碼,就成功了:

package cn.itboxue.jd.task;

import cn.itboxue.jd.pojo.Item;
import cn.itboxue.jd.serivce.ItemService;
import cn.itboxue.jd.util.HttpUtils;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

import java.util.Date;
import java.util.List;

/**
 * Scheduled crawler (revised version): loads JD search-result pages for phones
 * directly with Jsoup, and for every SKU not yet stored fetches its title,
 * price and image and saves an {@link Item}.
 */
@Component
public class ItemTask {

    @Autowired
    private HttpUtils httpUtils;
    @Autowired
    private ItemService itemService;

    private static final ObjectMapper MAPPER =  new ObjectMapper();


    /**
     * Crawls the odd-numbered result pages from 1 to 99. Runs again 100 seconds
     * after the previous run has finished.
     *
     * @throws Exception if a page cannot be loaded, parsed or saved
     */
    @Scheduled(fixedDelay = 100 * 1000)
    public void itemTask() throws Exception {
        // Start address; the page number is appended. The "wq" parameter name
        // previously carried a URL-encoded piece of string-concatenation source
        // (%22%20+%20...%22, i.e. '" + "'); it is now a plain "wq=".
        String url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&s=113&click=0&page=";

        // Walk the search results page by page.
        for (int i = 1; i < 100; i = i + 2) {
            // maxBodySize(0) lifts Jsoup's body size limit so the page is not truncated.
            Document doc = Jsoup.connect(url + i).maxBodySize(0).get();
            Elements ulList = doc.select("ul[class='gl-warp clearfix']");
            Elements liList = ulList.select("li[class='gl-item']");

            this.parse(liList);
        }

        System.out.println("手機數據抓取完成!");
    }

    /**
     * Stores every SKU found under the given product list entries that is not
     * in the database yet.
     *
     * @param liList the product ({@code li.gl-item}) elements of one result page
     * @throws Exception if the price JSON cannot be parsed or saving fails
     */
    private void parse(Elements liList) throws Exception {
        for (Element spuEle : liList) {
            // Left null when the attribute is blank or not numeric.
            Long spu = parseLongOrNull(spuEle.attr("data-spu"));

            // One entry per variant (SKU) of this product.
            Elements skuEles = spuEle.select("li.ps-item");

            for (Element skuEle : skuEles) {
                Long sku = parseLongOrNull(skuEle.select("[data-sku]").attr("data-sku"));
                if (sku == null) {
                    // Without a SKU there is nothing to look up or link to.
                    continue;
                }

                // The same object first serves as the lookup probe (only sku set).
                Item item = new Item();
                item.setSku(sku);
                List<Item> list = this.itemService.findAll(item);

                if (!list.isEmpty()) {
                    // Already stored: do not save it again.
                    continue;
                }

                item.setSpu(spu);

                String itemUrl = "https://item.jd.com/" + sku + ".html";
                item.setUrl(itemUrl);

                // Price from the JSON endpoint. If a fetch fails the item is
                // skipped without saving, so a later run retries it.
                String priceJson = this.httpUtils.getHtml("https://p.3.cn/prices/mgets?skuIds=J_" + sku);
                if (priceJson == null) {
                    continue;
                }
                double price = MAPPER.readTree(priceJson).get(0).get("p").asDouble();
                item.setPrice(price);

                // Title from the product detail page.
                String itemInfo = this.httpUtils.getHtml(item.getUrl());
                if (itemInfo == null) {
                    continue;
                }
                String title = Jsoup.parse(itemInfo).select("div.sku-name").text();
                item.setTitle(title);

                // Image: first() is null when the entry has no matching <img>.
                // Downloaded last so a skipped item leaves no orphan file.
                Element imgEle = skuEle.select("img[data-sku]").first();
                if (imgEle != null) {
                    String picUrl = "https:" + imgEle.attr("data-lazy-img");
                    picUrl = picUrl.replace("/n9/", "/n1/");
                    String picName = this.httpUtils.getImage(picUrl);
                    item.setPic(picName);
                }

                item.setCreated(new Date());
                item.setUpdated(item.getCreated());

                this.itemService.save(item);
            }
        }
    }

    /** Parses a decimal long, returning null for blank or non-numeric text. */
    private static Long parseLongOrNull(String text) {
        if (text == null) {
            return null;
        }
        try {
            return Long.valueOf(text.trim());
        } catch (NumberFormatException e) {
            return null;
        }
    }

}

今天的筆記就做到這裏吧,希望以後想學習 Java 爬蟲的愛好者少走彎路,謝謝。

2019-04-19 21:07:56

作者:何秀好

相關文章
相關標籤/搜索