還是直接貼代碼說明比較實在。
感覺webmagic-selenium這個模塊有點雞肋,但還是有可借鑑之處。借鑑它寫了一個SeleniumDownloader,如下:
import org.openqa.selenium.By; import org.openqa.selenium.Cookie; import org.openqa.selenium.WebDriver; import org.openqa.selenium.WebElement; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.downloader.Downloader; import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.utils.UrlUtils; import java.util.Map; /** * @author taojw * */ public class SeleniumDownloader implements Downloader{ private static final Logger log=LoggerFactory.getLogger(SeleniumDownloader.class); private int sleepTime=3000;//3s private SeleniumAction action=null; private WebDriverPool webDriverPool=new WebDriverPool(); public SeleniumDownloader(){ } public SeleniumDownloader(int sleepTime,WebDriverPool pool){ this(sleepTime,pool,null); } public SeleniumDownloader(int sleepTime,WebDriverPool pool,SeleniumAction action){ this.sleepTime=sleepTime; this.action=action; if(pool!=null){ webDriverPool=pool; } } public SeleniumDownloader setSleepTime(int sleepTime) { this.sleepTime = sleepTime; return this; } public void setOperator(SeleniumAction action){ this.action=action; } @Override public Page download(Request request, Task task) { WebDriver webDriver; try { webDriver = webDriverPool.get(); } catch (InterruptedException e) { log.warn("interrupted", e); return null; } log.info("downloading page " + request.getUrl()); Page page = new Page(); try { webDriver.get(request.getUrl()); Thread.sleep(sleepTime); } catch (InterruptedException e) { e.printStackTrace(); } catch (Exception e) { webDriverPool.close(webDriver); page.setSkip(true); return page; } // WindowUtil.changeWindow(webDriver); WebDriver.Options manage = webDriver.manage(); Site site = task.getSite(); if (site.getCookies() != null) { for (Map.Entry<String, String> cookieEntry : site.getCookies() 
.entrySet()) { Cookie cookie = new Cookie(cookieEntry.getKey(), cookieEntry.getValue()); manage.addCookie(cookie); } } manage.window().maximize(); if(action!=null){ action.execute(webDriver); } SeleniumAction reqAction=(SeleniumAction) request.getExtra("action"); if(reqAction!=null){ reqAction.execute(webDriver); } WebElement webElement = webDriver.findElement(By.xpath("/html")); String content = webElement.getAttribute("outerHTML"); page.setRawText(content); page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, webDriver.getCurrentUrl()))); page.setUrl(new PlainText(webDriver.getCurrentUrl())); page.setRequest(request); webDriverPool.returnToPool(webDriver); return page; } @Override public void setThread(int thread) { } }
功能:
支持在Spider.setDownloader的時候添加鉤子SeleniumAction來實現自定義selenium的通用操作,增強了靈活性。
支持對每一個請求添加action參數,參數值爲SeleniumAction對象,進而可以對每一個請求實現自定義selenium操作,增強了靈活性。
import org.openqa.selenium.WebDriver;

/**
 * Hook for running custom Selenium operations against a {@link WebDriver}.
 *
 * @author taojw
 */
public interface SeleniumAction {
    /**
     * Runs this action using the given driver.
     *
     * @param driver the driver to operate on
     */
    void execute(WebDriver driver);
}
WebDriverPool實現:注意對WebDriver的池化來保證性能。
也是參考webmagic-selenium做了些修改。
import com.fh.util.FileUtil; import org.openqa.selenium.WebDriver; import org.openqa.selenium.phantomjs.PhantomJSDriver; import org.openqa.selenium.phantomjs.PhantomJSDriverService; import org.openqa.selenium.remote.DesiredCapabilities; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.concurrent.BlockingDeque; import java.util.concurrent.LinkedBlockingDeque; import java.util.concurrent.atomic.AtomicInteger; /** * @author taojw */ public class WebDriverPool { private Logger logger = LoggerFactory.getLogger(getClass()); private int CAPACITY = 5; private AtomicInteger refCount = new AtomicInteger(0); private static final String DRIVER_PHANTOMJS = "phantomjs"; /** * store webDrivers available */ private BlockingDeque<WebDriver> innerQueue = new LinkedBlockingDeque<WebDriver>( CAPACITY); private static String PHANTOMJS_PATH; private static DesiredCapabilities caps = DesiredCapabilities.phantomjs(); static { PHANTOMJS_PATH = FileUtil.getCommonProp("phantomjs.path"); caps.setJavascriptEnabled(true); caps.setCapability( PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY, PHANTOMJS_PATH); caps.setCapability("takesScreenshot", true); caps.setCapability( PhantomJSDriverService.PHANTOMJS_PAGE_CUSTOMHEADERS_PREFIX + "User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"); caps.setCapability(PhantomJSDriverService.PHANTOMJS_CLI_ARGS, "--load-images=no"); } public WebDriverPool() { } public WebDriverPool(int poolsize) { this.CAPACITY = poolsize; innerQueue = new LinkedBlockingDeque<WebDriver>(poolsize); } public WebDriver get() throws InterruptedException { WebDriver poll = innerQueue.poll(); if (poll != null) { return poll; } if (refCount.get() < CAPACITY) { synchronized (innerQueue) { if (refCount.get() < CAPACITY) { WebDriver mDriver = new PhantomJSDriver(caps); // 嘗試性解決:https://github.com/ariya/phantomjs/issues/11526問題 mDriver.manage().timeouts() .pageLoadTimeout(60, 
TimeUnit.SECONDS); // mDriver.manage().window().setSize(new Dimension(1366, // 768)); innerQueue.add(mDriver); refCount.incrementAndGet(); } } } return innerQueue.take(); } public void returnToPool(WebDriver webDriver) { // webDriver.quit(); // webDriver=null; innerQueue.add(webDriver); } public void close(WebDriver webDriver) { refCount.decrementAndGet(); webDriver.close(); webDriver.quit(); webDriver = null; } public void shutdown() { try { for (WebDriver driver : innerQueue) { close(driver); } innerQueue.clear(); } catch (Exception e) { // e.printStackTrace(); logger.warn("webdriverpool關閉失敗",e); } } }
修改後:
僅支持PhantomJS作爲瀏覽器驅動。
增加phantomjs相關配置
修改隊列大小控制邏輯
WindowUtil
注意這個loadAll方法的實現很巧妙哦,因爲涉及滾動加載頁面的時候,如果一下子滾到底部可能會造成中間部分沒有加載出來,這樣就不得不針對每一個頁面進行慢慢滾動。而loadAll採取的思路是直接獲取頁面可滾動大小,然後將瀏覽器窗口調成對應大小,刷新之後全部內容便加載出來了。
import org.apache.commons.io.FileUtils; import org.openqa.selenium.*; import java.io.File; import java.io.IOException; /** * @author taojw * */ public class WindowUtil { /** * 滾動窗口。 * @param driver * @param height */ public static void scroll(WebDriver driver,int height){ ((JavascriptExecutor)driver).executeScript("window.scrollTo(0,"+height+" );"); } /** * 從新調整窗口大小,以適應頁面,須要耗費必定時間。建議等待合理的時間。 * @param driver */ public static void loadAll(WebDriver driver){ Dimension od=driver.manage().window().getSize(); int width=driver.manage().window().getSize().width; //嘗試性解決:https://github.com/ariya/phantomjs/issues/11526問題 driver.manage().timeouts().pageLoadTimeout(60, TimeUnit.SECONDS); long height=(Long)((JavascriptExecutor)driver).executeScript("return document.body.scrollHeight;"); driver.manage().window().setSize(new Dimension(width, (int)height)); driver.navigate().refresh(); } public static void taskScreenShot(WebDriver driver,File saveFile){ File src=((TakesScreenshot)driver).getScreenshotAs(OutputType.FILE); try { FileUtils.copyFile(src, saveFile); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } public static void changeWindow(WebDriver driver){ // 獲取當前頁面句柄 String handle = driver.getWindowHandle(); // 獲取全部頁面的句柄,並循環判斷不是當前的句柄,就作選取switchTo() for (String handles : driver.getWindowHandles()) { if (handles.equals(handle)) continue; driver.switchTo().window(handles); } } }
至此對爬蟲框架的擴展告一段落。
/** * 店鋪銷售信息 * * @author taojw */ @Scope("prototype") @Component public class TaoBaoShopInfoProcessor implements PageProcessor { private static final Logger log = LoggerFactory .getLogger(TaoBaoShopInfoProcessor.class); @Autowired private TaoBaoShopInfoService service; private Site site = Site .me() .setCharset("UTF-8") .setCycleRetryTimes(3) .setSleepTime(3 * 1000) .addHeader("Connection", "keep-alive") .addHeader("Cache-Control", "max-age=0") .addHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0"); private AtomicBoolean isPageAdd = new AtomicBoolean(false); private static AtomicBoolean running = new AtomicBoolean(false); private WebDriverPool pool=new WebDriverPool(); @Override public Site getSite() { return this.site; } @Override public void process(Page page) { if (islistPage(page)) { List<String> urls = page.getHtml() .$("dl.item a.J_TGoldData", "href").all(); List<String> targetUrls = new ArrayList<String>(); for (String url : urls) { targetUrls.add(url.trim()); } page.addTargetRequests(targetUrls); if (isPageAdd.compareAndSet(false, true)) { // 分頁處理 String pageinfo = page.getHtml() .$(".pagination .page-info", "text").get(); int pageCount = Integer.valueOf(pageinfo.split("/")[1]); String cururl = page.getUrl().get(); //只抓前5頁 if(pageCount>5){ pageCount=5; } for (int i = 1; i < pageCount; i++) { String tmp = cururl + "&pageNo=" + (i + 1); page.addTargetRequest(tmp); } } return; } // 商品頁面 String curUrl = page.getUrl().get(); boolean isTaoBao=curUrl.startsWith("https://item.taobao.com"); boolean isTmall=curUrl.startsWith("https://detail.tmall.com"); String tmpspm = curUrl.split("\\?")[1].split("&")[0]; // spm碼 String spm = tmpspm.split("=")[1]; // 網店地址 String shopUrl=""; // 商品名稱 String name=""; // 價格 double price =0; // 30天交易總數 int sellCount=0; // 交易總價 double allPrice=0; if(isTaoBao){ shopUrl= page.getHtml() .xpath("//div[@class='tb-shop-name']/dl/dd/strong/a/@href") .get(); shopUrl = shopUrl.split("\\?")[0]; name = 
page.getHtml().xpath("//*[@id='J_Title']/h3/text()") .get(); try{ price=Double.valueOf(page.getHtml() .$("#J_PromoPriceNum", "text").get().split("-")[0].trim()); }catch(Exception e){ price=Double.valueOf(page.getHtml() .$("#J_StrPrice .tb-rmb-num", "text").get().split("-")[0].trim()); } sellCount = Integer.valueOf(page.getHtml() .$("#J_SellCounter", "text").get()); allPrice = Double.valueOf(price) * Double.valueOf(sellCount); }else if(isTmall){ shopUrl= page.getHtml() .xpath("//*[@id='side-shop-info']/div/h3/div/a/@href") .get(); shopUrl = shopUrl.split("\\?")[0]; name = page.getHtml().$(".tb-detail-hd h1","text") .get().trim(); price=Double.valueOf(page.getHtml() .$(".tm-price", "text").get().split("-")[0].trim()); sellCount = Integer.valueOf(page.getHtml() .$(".tm-count", "text").get().trim()); allPrice = Double.valueOf(price) * Double.valueOf(sellCount); } // 採集日期 // Timestamp recordDate=new Timestamp(new Date().getTime()); String recordDate = DateUtil.formatDate(new Date(), "yyyy-MM-dd"); log.debug(shopUrl + ":" + spm + ":" + name + ":" + price + ":" + sellCount + ":" + allPrice + ":" + recordDate); PageData pd = new PageData(); pd.put("id", UUID.randomUUID().toString()); pd.put("shopUrl", shopUrl); pd.put("spm", spm); pd.put("name", name); pd.put("price", price); pd.put("sellCount", sellCount); pd.put("allPrice", allPrice); pd.put("recordDate", recordDate); service.saveData(pd); } private boolean islistPage(Page page) { String tmp = page.getHtml().$("#J_PromoPrice").get(); if (StringUtils.isBlank(tmp)) { return true; } return false; } public void start() { if (running.compareAndSet(false, true)) { try { service.emptyTable(); List<String> urls = service.getShopUrl(); if (urls == null) { log.error("店鋪url獲取異常,終止抓取"); } String[] urlStrs=null; int size=50; // int size=urls.size(); if(urls.size()<size){ urlStrs=new String[urls.size()]; urlStrs=urls.toArray(urlStrs); }else{ urlStrs=new String[size]; for(int i=0;i<size;i++){ urlStrs[i]=urls.get(i).trim(); } } 
log.info("準備抓取,須要抓取的店鋪數爲{}", urls.size()); // String[] urlStrs = new String[urls.size()]; // "https://zhuzhuwo.taobao.com" urls.toArray(urlStrs) Spider spider = Spider.create(this) .setDownloader(new SeleniumDownloader(5000, pool, new TestAction())) .addUrl(urlStrs); // .addUrl("https://zhuzhuwo.taobao.com"); spider.thread(5).run(); log.info("淘寶店鋪銷售信息數據正常抓取完畢"); } finally { log.info("淘寶店鋪銷售信息數據抓取完畢,準備關閉webdriverpool"); pool.shutdown(); log.info("webdriverpool關閉完畢"); running.set(false); } } } public static void main(String[] args) { new TaoBaoShopInfoProcessor().start(); } private class TestAction implements SeleniumAction { @Override public void execute(WebDriver driver) { WebDriverWait wait = new WebDriverWait(driver, 10); // 商品頁,避免加載過多無用圖片信息。 if (driver.getCurrentUrl().startsWith("https://item.taobao.com/")|| driver.getCurrentUrl().startsWith("https://detail.tmall.com")) { // wait.until(ExpectedConditions.presenceOfElementLocated(By // .cssSelector("#J_PromoPriceNum"))); return; } // 店鋪首頁,點擊全部分類 if ((!(driver.getCurrentUrl().startsWith("https://item.taobao.com"))) && !(driver.getCurrentUrl().startsWith("https://detail.tmall.com")) && (!driver.getCurrentUrl().contains("search"))) { WebElement allcate = driver.findElement(By .cssSelector(".all-cats-trigger a")); Actions action = new Actions(driver); action.click(allcate).perform(); try { Thread.sleep(3000); } catch (InterruptedException e) { e.printStackTrace(); } } String url = driver.getCurrentUrl(); // 列表頁,加載全部 WindowUtil.loadAll(driver); url = driver.getCurrentUrl(); try { Thread.sleep(3000); // WindowUtil.taskScreenShot(driver, new File("d:\\data\\tb\\" + UUID.randomUUID().toString() + ".png")); // wait.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector(".pagination .page-info"))); } catch (InterruptedException e) { e.printStackTrace(); } } } }
因爲貓眼票房數據採用加密字體圖標,並且每一個數字對應的加密碼每次都變化。所以這次採用selenium加載頁面,截圖,摳圖(給每一個數字),考慮到貓眼票房數據的規則性,結合google的 Tesseract-OCR 訓練模型來識別我們摳出來的數字圖片。
ImageUtil 負責摳圖
import net.coobird.thumbnailator.Thumbnails; import net.coobird.thumbnailator.geometry.Position; import net.coobird.thumbnailator.geometry.Size; /** * @author taojw * */ public class ImageUtil { public static void crop(String srcfile,String destfile,ImageRegion region){ //指定座標 try { Thumbnails.of(srcfile) .sourceRegion(region.x, region.y, region.width, region.height) .size(region.width, region.height).outputQuality(1.0) //.keepAspectRatio(false) //不保持比例 .toFile(destfile); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } public static void main(String[] args) { crop("D:\\data\\111.png","D:\\data\\1112.png",new ImageRegion(66, 264, 422, 426)); } }
/**
 * Plain holder for a rectangle: an origin ({@code x}, {@code y}) plus a
 * {@code width} and a {@code height}. All fields are public and mutable.
 *
 * @author taojw
 */
public class ImageRegion {
    /** Origin coordinates of the rectangle. */
    public int x, y;

    /** Extent of the rectangle. */
    public int width, height;

    /**
     * Creates a region with the given origin and size.
     */
    public ImageRegion(int originX, int originY, int regionWidth, int regionHeight) {
        x = originX;
        y = originY;
        width = regionWidth;
        height = regionHeight;
    }
}
TesseractOcrUtil,調用tesseract進程,返回識別結果。
import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.util.UUID; import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.fh.util.FileUtil; /** * @author taojw * */ public class TesseractOcrUtil { private static final Logger log = LoggerFactory .getLogger(TesseractOcrUtil.class); private static final String tessPath; private static final String basePath; static { tessPath = FileUtil.getCommonProp("tesseract.path"); basePath = new File(tessPath).getParentFile().getAbsolutePath(); } public static String getByLangNum(String imagePath) { return get(imagePath, "num"); } public static String getByLangChi(String imagePath) { return get(imagePath, "chi_sim"); } public static String getByLangEng(String imagePath) { return get(imagePath, "eng"); } public static String get(String imagePath, String lang) { String outName = UUID.randomUUID().toString(); String outPath = basePath + File.separator + outName + ".txt"; // String cmd = tessPath + " " + imagePath + " " + outName + " -l " + lang; ProcessBuilder pb = new ProcessBuilder(); pb.directory(new File(basePath)); pb.command(tessPath,imagePath,outName,"-l",lang); pb.redirectErrorStream(true); Process process=null; String errormsg = ""; String res = null; try { process = pb.start(); // tesseract.exe 1.jpg 1 -l chi_sim int excode = process.waitFor(); if (excode == 0) { BufferedReader in = new BufferedReader(new InputStreamReader( new FileInputStream(outPath), "UTF-8")); res = in.readLine(); IOUtils.closeQuietly(in); } else { switch (excode) { case 1: errormsg = "Errors accessing files.There may be spaces in your image's filename."; break; case 29: errormsg = "Cannot recongnize the image or its selected region."; break; case 31: errormsg = "Unsupported image format."; break; default: errormsg = "Errors occurred."; } log.error("when ocr 
picture " + imagePath + " an error occured. " + errormsg); } } catch (IOException e) { e.printStackTrace(); log.warn("orc process occurs an io error",e); } catch (InterruptedException e) { e.printStackTrace(); log.warn("orc process was interrupt unexpected!",e); }finally{ FileUtils.deleteQuietly(new File(imagePath)); FileUtils.deleteQuietly(new File(outPath)); } if(res!=null){ res=res.trim(); } return res; } }
/** * @author taojw * */ public class MaoyanTest implements PageProcessor{ private static Site site=Site.me().setCharset("UTF-8").setUserAgent( "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); @Override public Site getSite() { return site; } @Override public void process(Page page) { } public void start() { Spider cnSpider = Spider.create(this).setDownloader(new SeleniumDownloader(5000,null,new TestAction())) // .addUrl("https://shop34068488.taobao.com/?spm=a230r.7195193.1997079397.2.JLFlPa") // .addUrl("http://piaofang.maoyan.com/company/cinema?date=2017-01-18&webCityId=288&cityTier=0&page=1&cityName=%E6%8F%AD%E9%98%B3"); .addUrl("http://piaofang.maoyan.com/company/cinema?date=2017-01-18&webCityId=84&cityTier=0&page=1&cityName=%E4%BF%9D%E5%AE%9A"); // .addPipeline(new JsonFilePipeline("D:\\data\\webmagicfile.json")) //SpiderMonitor.instance().register(cnSpider); cnSpider.run(); } public static void main(String[] args) { new MaoyanTest().start(); } private class TestAction implements SeleniumAction{ @Override public void execute(WebDriver driver) { WindowUtil.loadAll(driver); try { Thread.sleep(5000); //WebDriverWait wait = new WebDriverWait(driver, 10); //wait.until(ExpectedConditions.presenceOfElementLocated(By.id("J_PromoPriceNum"))); File src=((TakesScreenshot)driver).getScreenshotAs(OutputType.FILE); String srcfile="D:\\data\\"+UUID.randomUUID().toString()+".png"; FileUtils.copyFile(src, new File(srcfile)); List<WebElement> movielist=driver.findElements(By.xpath("//*[@id='cinema-tbody']/tr")); // movielist.remove(0); for(int i=1;i<movielist.size();i++){ int index=i+1; String movieName=driver.findElement(By.xpath("//*[@id='cinema-tbody']/tr["+index+"]/td[2]")).getText(); String pattern = "//*[@id='cinema-tbody']/tr["+index+"]/td[3]"; WebElement tel=driver.findElement(By.xpath(pattern)); Point loc=tel.getLocation(); Dimension d=tel.getSize(); String 
cop_path="D:\\data\\crop\\current_piaofang_"+movieName+".png"; ImageUtil.crop(srcfile, cop_path, new ImageRegion(loc.x, loc.y, d.width+10, d.height)); System.out.println(TesseractOcrUtil.getByLangNum(cop_path)); FileUtils.deleteQuietly(new File(srcfile)); } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (InterruptedException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } }
可供參考連接:
selenium系列文章:http://www.cnblogs.com/TankXi...
selenium api:http://seleniumhq.github.io/s...
tesseract-ocr樣本訓練: http://blog.csdn.net/firehood...
selenium多窗口切換:http://blog.csdn.net/meyoung0...