最近幾天很無聊,學習了一下 Java 爬蟲,在此記錄自己在做這個案例的過程中遇到的問題和一些體會。
1.學習目標
練習爬取京東的數據:圖片、價格、標題等等。
2.學習過程
1.開發工具
JDK1.8
IntelliJ IDEA
IDEA自帶的Maven
2.使用技術
Spring Boot+Spring Data JPA
3.數據庫準備(MySQL)
-- Table that stores one row per JD product variant (sku) collected by the crawler.
CREATE TABLE `jd_item` (
  `id` bigint(10) NOT NULL AUTO_INCREMENT COMMENT '主鍵id',
  `spu` bigint(15) DEFAULT NULL COMMENT '商品集合id',
  `sku` bigint(15) DEFAULT NULL COMMENT '商品最小品類單元id',
  `title` varchar(100) DEFAULT NULL COMMENT '商品標題',
  -- decimal rather than bigint: the Item entity maps this column to a Double,
  -- and an integer column would drop the fractional part of the price.
  `price` decimal(10,2) DEFAULT NULL COMMENT '商品價格',
  `pic` varchar(200) DEFAULT NULL COMMENT '商品圖片',
  `url` varchar(200) DEFAULT NULL COMMENT '商品詳情地址',
  `created` datetime DEFAULT NULL COMMENT '建立時間',
  `updated` datetime DEFAULT NULL COMMENT '更新時間',
  PRIMARY KEY (`id`),
  -- Secondary index on sku: the crawler looks rows up by sku to skip already-stored products.
  KEY `sku` (`sku`) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8 COMMENT='京東商品表';
4.添加依賴(pom.xml)
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the JD crawler; dependency versions without an explicit
     <version> are inherited from the Spring Boot parent POM. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.0.2.RELEASE</version>
    </parent>
    <groupId>cn.itcast.crawler</groupId>
    <artifactId>itcast-crawler-jd</artifactId>
    <version>1.0-SNAPSHOT</version>
    <dependencies>
        <!-- Spring MVC -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <!-- Spring Data JPA -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-jpa</artifactId>
        </dependency>
        <!-- MySQL JDBC driver -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
        </dependency>
        <!-- HttpClient: used to download pages and images -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
        </dependency>
        <!-- Jsoup: HTML parsing -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.3</version>
        </dependency>
        <!-- Utility library -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
        </dependency>
    </dependencies>
</project>
5.添加配置文件(application.properties)
# DB configuration: connection settings for the MySQL database that holds the jd_item table.
# JDBC driver class used to open the connection.
spring.datasource.driverClassName=com.mysql.jdbc.Driver
# JDBC URL: MySQL on 127.0.0.1, port 3306, database "crawler".
spring.datasource.url=jdbc:mysql://127.0.0.1:3306/crawler
# Credentials used for the connection.
spring.datasource.username=root
spring.datasource.password=root
# JPA configuration.
# Target database type for JPA.
spring.jpa.database=MySQL
# Enable logging of the SQL statements that are issued.
spring.jpa.show-sql=true
6.代碼實現
1.編寫pojo(數據庫實體類)
@Entity @Table(name = "jd_item") public class Item { //主鍵 @Id @GeneratedValue(strategy = GenerationType.IDENTITY) private Long id; //標準產品單位(商品集合) private Long spu; //庫存量單位(最小品類單元) private Long sku; //商品標題 private String title; //商品價格 private Double price; //商品圖片 private String pic; //商品詳情地址 private String url; //建立時間 private Date created; //更新時間 private Date updated; set/get }
2.編寫dao
/**
 * Repository for {@link Item} entities keyed by their {@code Long} id.
 * Declares no methods of its own; everything used by the service layer
 * ({@code findAll(Example)}, {@code save}) is inherited from {@code JpaRepository}.
 */
public interface ItemDao extends JpaRepository<Item,Long> { }
3.編寫service接口
public interface ItemService { //根據條件查詢數據 public List<Item> findAll(Item item); //保存數據 public void save(Item item); }
4.ItemServiceImpl實現類
/**
 * {@link ItemService} implementation that delegates to {@link ItemDao}.
 */
@Service
public class ItemServiceImpl implements ItemService {

    @Autowired
    private ItemDao itemDao;

    /**
     * Looks up items using the given object as a query-by-example probe.
     *
     * @param item probe whose populated properties are used as match criteria
     *             (presumably unset/null properties are ignored by the default
     *             matcher; see the Spring Data query-by-example documentation)
     * @return the matching items
     */
    @Override
    public List<Item> findAll(Item item) {
        // Typed Example/List instead of raw types, so no unchecked conversion is needed.
        Example<Item> example = Example.of(item);
        return this.itemDao.findAll(example);
    }

    /**
     * Persists the given item inside a transaction.
     *
     * @param item the item to store
     */
    @Override
    @Transactional
    public void save(Item item) {
        this.itemDao.save(item);
    }
}
5.編寫引導類
@SpringBootApplication //設置開啓定時任務 @EnableScheduling public class Application { public static void main(String[] args) { SpringApplication.run(Application.class, args); } }
6. 封裝HttpClient
@Component public class HttpUtils { private PoolingHttpClientConnectionManager cm; public HttpUtils() { this.cm = new PoolingHttpClientConnectionManager(); // 設置最大鏈接數 cm.setMaxTotal(200); // 設置每一個主機的併發數 cm.setDefaultMaxPerRoute(20); } //獲取內容 public String getHtml(String url) { // 獲取HttpClient對象 CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build(); // 聲明httpGet請求對象 HttpGet httpGet = new HttpGet(url); // 設置請求參數RequestConfig httpGet.setConfig(this.getConfig()); CloseableHttpResponse response = null; try { // 使用HttpClient發起請求,返回response response = httpClient.execute(httpGet); // 解析response返回數據 if (response.getStatusLine().getStatusCode() == 200) { String html = ""; // 若是response。getEntity獲取的結果是空,在執行EntityUtils.toString會報錯 // 須要對Entity進行非空的判斷 if (response.getEntity() != null) { html = EntityUtils.toString(response.getEntity(), "UTF-8"); } return html; } } catch (Exception e) { e.printStackTrace(); } finally { try { if (response != null) { // 關閉鏈接 response.close(); } // 不能關閉,如今使用的是鏈接管理器 // httpClient.close(); } catch (Exception e) { e.printStackTrace(); } } return null; } //獲取圖片 public String getImage(String url) { // 獲取HttpClient對象 CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build(); // 聲明httpGet請求對象 HttpGet httpGet = new HttpGet(url); // 設置請求參數RequestConfig httpGet.setConfig(this.getConfig()); CloseableHttpResponse response = null; try { // 使用HttpClient發起請求,返回response response = httpClient.execute(httpGet); // 解析response下載圖片 if (response.getStatusLine().getStatusCode() == 200) { // 獲取文件類型 String extName = url.substring(url.lastIndexOf(".")); // 使用uuid生成圖片名 String imageName = UUID.randomUUID().toString() + extName; // 聲明輸出的文件 OutputStream outstream = new FileOutputStream(new File("D:/images/" + imageName)); // 使用響應體輸出文件 response.getEntity().writeTo(outstream); // 返回生成的圖片名 return imageName; } } catch (Exception e) { e.printStackTrace(); } finally { try { if (response != null) { // 關閉鏈接 response.close(); 
} // 不能關閉,如今使用的是鏈接管理器 // httpClient.close(); } catch (Exception e) { e.printStackTrace(); } } return null; } //獲取請求參數對象 private RequestConfig getConfig() { RequestConfig config = RequestConfig.custom().setConnectTimeout(1000)// 設置建立鏈接的超時時間 .setConnectionRequestTimeout(500) // 設置獲取鏈接的超時時間 .setSocketTimeout(10000) // 設置鏈接的超時時間 .build(); return config; } }
7. 實現數據抓取
@Component public class ItemTask { @Autowired private HttpUtils httpUtils; @Autowired private ItemService itemService; public static final ObjectMapper MAPPER = new ObjectMapper(); //設置定時任務執行完成後,再間隔100秒執行一次 @Scheduled(fixedDelay = 1000 * 100) public void process() throws Exception { //分析頁面發現訪問的地址,頁碼page從1開始,下一頁oage加2 String url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&cid2=653&cid3=655&s=5760&click=0&page="; //遍歷執行,獲取全部的數據 for (int i = 1; i < 10; i = i + 2) { //發起請求進行訪問,獲取頁面數據,先訪問第一頁 String html = this.httpUtils.getHtml(url + i); //解析頁面數據,保存數據到數據庫中 this.parseHtml(html); } System.out.println("執行完成"); } //解析頁面,並把數據保存到數據庫中 private void parseHtml(String html) throws Exception { //使用jsoup解析頁面 Document document = Jsoup.parse(html); //獲取商品數據 Elements spus = document.select("div#J_goodsList > ul > li"); //遍歷商品spu數據 for (Element spuEle : spus) { //獲取商品spu Long spuId = Long.parseLong(spuEle.attr("data-spu")); //獲取商品sku數據 Elements skus = spuEle.select("li.ps-item img"); for (Element skuEle : skus) { //獲取商品sku Long skuId = Long.parseLong(skuEle.attr("data-sku")); //判斷商品是否被抓取過,能夠根據sku判斷 Item param = new Item(); param.setSku(skuId); List<Item> list = this.itemService.findAll(param); //判斷是否查詢到結果 if (list.size() > 0) { //若是有結果,表示商品已下載,進行下一次遍歷 continue; } //保存商品數據,聲明商品對象 Item item = new Item(); //商品spu item.setSpu(spuId); //商品sku item.setSku(skuId); //商品url地址 item.setUrl("https://item.jd.com/" + skuId + ".html"); //建立時間 item.setCreated(new Date()); //修改時間 item.setUpdated(item.getCreated()); //獲取商品標題 String itemHtml = this.httpUtils.getHtml(item.getUrl()); String title = Jsoup.parse(itemHtml).select("div.sku-name").text(); item.setTitle(title); //獲取商品價格 String priceUrl = "https://p.3.cn/prices/mgets?skuIds=J_"+skuId; String priceJson = this.httpUtils.getHtml(priceUrl); //解析json數據獲取商品價格 double price = MAPPER.readTree(priceJson).get(0).get("p").asDouble(); item.setPrice(price); //獲取圖片地址 String pic = "https:" + 
skuEle.attr("data-lazy-img").replace("/n9/","/n1/"); System.out.println(pic); //下載圖片 String picName = this.httpUtils.getImage(pic); item.setPic(picName); //保存商品數據 this.itemService.save(item); } } } }
3.結果
4.總結
在寫代碼的時候遇到了以下錯誤:首先打開了京東要抓取的頁面 url="xxxxxx",可是在抓取的時候總是抓取不到數據,循環裏面的 spus 的大小總是爲 0。經過我多方查找資料,把上面的 ItemTask 修改成以下代碼,就成功了:
package cn.itboxue.jd.task;

import cn.itboxue.jd.pojo.Item;
import cn.itboxue.jd.serivce.ItemService;
import cn.itboxue.jd.util.HttpUtils;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

import java.util.Date;
import java.util.List;

/**
 * Scheduled task that crawls JD phone search results with jsoup and stores
 * every product variant (sku) that is not in the database yet.
 */
@Component
public class ItemTask {

    @Autowired
    private HttpUtils httpUtils;

    @Autowired
    private ItemService itemService;

    private static final ObjectMapper MAPPER = new ObjectMapper();

    /**
     * Walks through the search-result pages and hands each product list to
     * {@link #parse(Elements)}. Runs again 100 seconds after the previous run
     * has finished.
     *
     * @throws Exception if a page cannot be fetched or a product cannot be processed
     */
    @Scheduled(fixedDelay = 100 * 1000)
    public void itemTask() throws Exception {
        // Start address; the page number is appended. The "wq" parameter is written
        // as a plain "wq=..." pair: the earlier literal contained the URL-encoded
        // remains of a pasted string concatenation (wq" + "=) in its place.
        String url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&s=113&click=0&page=";

        // Page numbers advance in steps of 2.
        for (int i = 1; i < 100; i = i + 2) {
            // maxBodySize(0): fetch the whole page (see jsoup Connection docs).
            Document doc = Jsoup.connect(url + i).maxBodySize(0).get();
            Elements ulList = doc.select("ul[class='gl-warp clearfix']");
            Elements liList = ulList.select("li[class='gl-item']");
            this.parse(liList);
        }

        System.out.println("手機數據抓取完成!");
    }

    /**
     * Extracts the product data from the given list entries and saves new skus.
     *
     * @param liList list entries of one search-result page, one per product group (spu)
     * @throws Exception if the price JSON cannot be read or saving fails
     */
    private void parse(Elements liList) throws Exception {
        for (Element spuEle : liList) {
            // May be null: entries without a numeric data-spu are stored without an spu.
            Long spu = parseId(spuEle.attr("data-spu"));

            Elements skuEles = spuEle.select("li.ps-item");
            for (Element skuEle : skuEles) {
                Long sku = parseId(skuEle.select("[data-sku]").attr("data-sku"));
                if (sku == null) {
                    // Without an sku there is nothing to look up or link to.
                    continue;
                }

                // Look the sku up first; an existing product is not stored again.
                Item item = new Item();
                item.setSku(sku);
                List<Item> list = this.itemService.findAll(item);
                if (!list.isEmpty()) {
                    continue;
                }

                item.setSpu(spu);

                String itemUrl = "https://item.jd.com/" + sku + ".html";
                item.setUrl(itemUrl);

                // Title and price are fetched before the image so that a failed request
                // does not leave a downloaded picture without a database row.
                // HttpUtils returns null on failure; the item is then left unsaved and
                // is picked up again on the next run.
                String itemInfo = this.httpUtils.getHtml(item.getUrl());
                if (itemInfo == null) {
                    continue;
                }
                String title = Jsoup.parse(itemInfo).select("div.sku-name").text();
                item.setTitle(title);

                String priceJson = this.httpUtils.getHtml("https://p.3.cn/prices/mgets?skuIds=J_" + sku);
                if (priceJson == null) {
                    continue;
                }
                double price = MAPPER.readTree(priceJson).get(0).get("p").asDouble();
                item.setPrice(price);

                // Picture: first() returns null when the entry has no matching <img>.
                Element imgEle = skuEle.select("img[data-sku]").first();
                if (imgEle != null) {
                    String picUrl = "https:" + imgEle.attr("data-lazy-img");
                    picUrl = picUrl.replace("/n9/", "/n1/");
                    String picName = this.httpUtils.getImage(picUrl);
                    item.setPic(picName);
                }

                item.setCreated(new Date());
                item.setUpdated(item.getCreated());

                this.itemService.save(item);
            }
        }
    }

    /**
     * Parses an id attribute value.
     *
     * @param raw attribute text, possibly empty
     * @return the numeric id, or {@code null} when the text is not a valid long
     */
    private static Long parseId(String raw) {
        try {
            return Long.valueOf(raw.trim());
        } catch (NumberFormatException e) {
            return null;
        }
    }
}
今天的筆記就做到這裏吧,希望以後想學習 Java 爬蟲的愛好者少走彎路,謝謝。
2019-04-19 21:07:56
作者:何秀好