新蛋詳情頁的價格字段是用圖片顯示的。雖然其它電商都已經認識到這是沒什麼卵用還浪費資源的行爲,但貌似新蛋不這樣認爲,因此嘗試爬取一下。
價格字段大概是這個樣子:
這個圖片也很純淨,識別率可以達到百分之百。
還是用上之前寫的一個小小工具庫:https://github.com/CC11001100/commons-simple-character-ocr
首先需要抓取一些圖片來生成標註數據,這裏選擇了智能手機分類下的前十頁,將前十頁商品的價格字段圖片爬下來生成標註數據,代碼如下:
package org.cc11001100.t1;

import cc11001100.ocr.OcrUtil;
import org.apache.http.client.fluent.Request;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.UUID;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

/**
 * Newegg: http://www.newegg.cn/
 *
 * <p>Collects the price images shown on product detail pages so they can be
 * used as raw material for building an OCR character dictionary.
 *
 * @author CC11001100
 */
public class NeweggCrawler {

    /** How many times {@link #download(String)} tries a URL before giving up. */
    private static final int MAX_DOWNLOAD_ATTEMPTS = 3;

    private static OcrUtil ocrUtil;

    static {
        ocrUtil = new OcrUtil();
    }

    /**
     * Collects price images from the first ten listing pages of the
     * smartphone category and stores each one as a PNG with a random
     * UUID file name.
     *
     * <p>Detail pages are fetched on a pool of ten worker threads. The method
     * returns once every queued task has finished.
     *
     * @param saveBasePath directory prefix the file name is appended to; it is
     *                     concatenated as-is, so it must end with a path separator
     */
    public static void grabTrainImage(String saveBasePath) {
        ExecutorService executorService = Executors.newFixedThreadPool(10);
        String url = "http://www.newegg.cn/SubCategory/1043-%d.htm";
        try {
            for (int i = 1; i <= 10; i++) {
                Document doc = getDocument(String.format(url, i));
                doc.select(".catepro li p.title a").forEach(detailPageLinkElt ->
                        executorService.execute(() ->
                                savePriceImage(detailPageLinkElt.attr("href"), saveBasePath)));
            }
        } finally {
            // Always stop accepting work, even if a listing page blew up; otherwise
            // the pool's non-daemon threads would keep the JVM alive forever.
            executorService.shutdown();
        }

        try {
            executorService.awaitTermination(10, TimeUnit.DAYS);
        } catch (InterruptedException e) {
            // Restore the flag so callers can still observe the interruption.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
    }

    /**
     * Fetches one detail page, locates its price image and writes that image
     * to disk as a PNG. Pages without a price image, and images that cannot be
     * decoded, are reported on stderr and skipped.
     *
     * @param detailPageUrl URL of the product detail page
     * @param saveBasePath  directory prefix for the output file
     */
    private static void savePriceImage(String detailPageUrl, String saveBasePath) {
        Document detailPage = getDocument(detailPageUrl);
        // "godds_info_data" is not a typo here: that is the class name the site itself uses.
        String imgLink = detailPage.select(".godds_info_data img[src~=PriceImage]").attr("src");
        if (imgLink.isEmpty()) {
            System.err.println("No price image found on " + detailPageUrl);
            return;
        }

        byte[] imgBytes = download(imgLink);
        try {
            // ImageIO.read yields null (not an exception) when no reader understands the bytes,
            // e.g. after a failed download returned an empty array.
            BufferedImage img = ImageIO.read(new ByteArrayInputStream(imgBytes));
            if (img == null) {
                System.err.println("Could not decode price image " + imgLink);
                return;
            }
            String savePath = saveBasePath + UUID.randomUUID().toString() + ".png";
            ImageIO.write(img, "png", new File(savePath));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads a page, decodes the bytes as gb2312 and parses the result with jsoup.
     * A failed download produces an empty document.
     *
     * @param url page to fetch
     * @return the parsed document
     * @throws IllegalStateException if the JVM does not provide the gb2312 charset
     */
    private static Document getDocument(String url) {
        byte[] responseBytes = download(url);
        try {
            return Jsoup.parse(new String(responseBytes, "gb2312"));
        } catch (UnsupportedEncodingException e) {
            // Fail with the real cause instead of handing a null string to the parser.
            throw new IllegalStateException("gb2312 charset is not available on this JVM", e);
        }
    }

    /**
     * Fetches a URL with HTTP GET, retrying on I/O failure.
     *
     * @param url resource to fetch
     * @return the response body, or an empty array if every attempt failed
     */
    private static byte[] download(String url) {
        for (int attempt = 1; attempt <= MAX_DOWNLOAD_ATTEMPTS; attempt++) {
            try {
                return Request.Get(url).execute().returnContent().asBytes();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return new byte[0];
    }

    /**
     * Grabs the raw price images, then hands the raw and char directories to
     * {@code OcrUtil.init} (presumably to split the images into single
     * characters for labelling; see the OCR library for details).
     */
    public static void main(String[] args) {
        grabTrainImage("E:/test/crawler/newegg/raw/");
        new OcrUtil().init("E:/test/crawler/newegg/raw/", "E:/test/crawler/newegg/char/");
    }

}
所有的價格圖片都是由下面這些字符組成的:
手動將每張圖片的文件名修改爲圖片所表示的意思:
時間有限,只增加一個測試方法用來測試結果是否正確,完整代碼如下:
package org.cc11001100.t1; import cc11001100.ocr.OcrUtil; import com.alibaba.fastjson.JSON; import com.alibaba.fastjson.JSONObject; import org.apache.http.client.fluent.Request; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import javax.imageio.ImageIO; import java.awt.image.BufferedImage; import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.util.UUID; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; /** * 新蛋: http://www.newegg.cn/ * * @author CC11001100 */ public class NeweggCrawler { private static OcrUtil ocrUtil; static { ocrUtil = new OcrUtil(); ocrUtil.loadDictionaryMap("E:/test/crawler/newegg/char/"); } /** * 在智能手機類下面收集一些圖片 */ public static void grabTrainImage(String saveBasePath) { ExecutorService executorService = Executors.newFixedThreadPool(10); String url = "http://www.newegg.cn/SubCategory/1043-%d.htm"; for (int i = 1; i <= 10; i++) { Document doc = getDocument(String.format(url, i)); doc.select(".catepro li p.title a").forEach(detailPageLinkElt -> { executorService.execute(() -> { String detailPageUrl = detailPageLinkElt.attr("href"); Document detailPage = getDocument(detailPageUrl); // 原類名即如此... 
String imgLink = detailPage.select(".godds_info_data img[src~=PriceImage]").attr("src"); byte[] imgBytes = download(imgLink); try { BufferedImage img = ImageIO.read(new ByteArrayInputStream(imgBytes)); String savePath = saveBasePath + UUID.randomUUID().toString() + ".png"; ImageIO.write(img, "png", new File(savePath)); } catch (IOException e) { e.printStackTrace(); } }); }); } try { executorService.shutdown(); executorService.awaitTermination(10, TimeUnit.DAYS); } catch (InterruptedException e) { e.printStackTrace(); } } /** * 測試抓取結果是否正確 * * @param detailPageUrl * @return */ public static JSONObject parse(String detailPageUrl) { JSONObject product = new JSONObject(); Document doc = getDocument(detailPageUrl); String imgLink = doc.select(".godds_info_data img[src~=PriceImage]").attr("src"); byte[] imgBytes = download(imgLink); BufferedImage img = null; try { img = ImageIO.read(new ByteArrayInputStream(imgBytes)); double price = Double.parseDouble(ocrUtil.ocr(img)); product.put("price", price); } catch (IOException e) { e.printStackTrace(); } String productTitle = doc.select("#productTitle").text(); product.put("title", productTitle); return product; } private static Document getDocument(String url) { byte[] responseBytes = download(url); String html = null; try { html = new String(responseBytes, "gb2312"); } catch (UnsupportedEncodingException e) { e.printStackTrace(); } return Jsoup.parse(html); } private static byte[] download(String url) { for (int i = 0; i < 3; i++) { try { return Request.Get(url).execute().returnContent().asBytes(); } catch (IOException e) { e.printStackTrace(); } } return new byte[0]; } public static void main(String[] args) { // grabTrainImage("E:/test/crawler/newegg/raw/"); // new OcrUtil().init("E:/test/crawler/newegg/raw/", "E:/test/crawler/newegg/char/"); String url = "http://www.newegg.cn/Product/A28-032-7Q5.htm"; System.out.println(JSON.toJSONString(parse(url), true)); } }