爬蟲需要抓取網站價格,與通常抓取網頁不同的是,抓取的內容是通過AJAX加載的,而且價格是通過CSS背景圖片顯示的。
每個數字對應一個樣式,如'p_h57_5':
.p_h57_5 { background: url('http://pic.c-ctrip.com/priceblur/h57/3713de5c594648529f39d031243966dd.gif') no-repeat -590px; padding: 0 6px; font-size: 18px; }
數字對應的樣式和對應的背景圖片(background image)都是動態改變的,需要獲取到每個房型的房價。雖然後來有了其它渠道獲取房價,這裏仍記錄一下用Selenium & Emgu抓取的方式。
流程:
1.Selenium訪問網址 2.全屏截圖 3.Selenium選擇器獲取房型等信息 4.Selenium選擇器獲取價格DOM元素,計算出價格元素的相對位置,截取價格圖片,使用Emgu識別價格並輸出
/// <summary>
/// Opens the hotel page in Chrome, takes a stitched full-page screenshot, and for every
/// room row prints the room type, the room type name, and the price recognised by OCR
/// from the price element's region of the screenshot.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // Each Chrome switch must be passed as its own argument; a single space-joined
    // string is forwarded to the browser as one (unrecognised) switch.
    ChromeOptions options = new ChromeOptions();
    options.AddArguments("--start-maximized", "--disable-popup-blocking");

    var driver = new ChromeDriver(options);
    try
    {
        driver.Navigate().GoToUrl("http://hotels.ctrip.com/hotel/992765.html");

        // The room table is loaded via AJAX; wait until it exists before reading the DOM.
        // A timeout propagates as an exception, as before.
        new WebDriverWait(driver, TimeSpan.FromSeconds(1)).Until(
            ExpectedConditions.ElementExists(By.ClassName("htl_room_table")));

        ReadOnlyCollection<IWebElement> elementsList = driver.FindElementsByCssSelector("tr[expand]");

        // Hide the currency symbol (<dfn>) so that only the digits remain in the price image.
        driver.ExecuteScript(@" var arr = document.getElementsByTagName('dfn'); for(var i=0;i<arr.length;i++){ arr[i].style.display = 'none'; } ");

        // Full-page screenshot; price images are cropped out of it below.
        using (Bitmap image2 = GetEntereScreenshot(driver))
        {
            if (image2 == null)
            {
                Console.Error.WriteLine("Full-page screenshot could not be captured; aborting.");
                return;
            }

            image2.Save(@"Z:\111.jpg");

            Console.WriteLine("{0,-20}{1,-20}{2,-20}", "房型", "類型", "房價");
            foreach (IWebElement row in elementsList)
            {
                // Not every row carries the room-type cell; treat it as optional.
                var roomType = "";
                try
                {
                    roomType = row.FindElement(By.CssSelector(".room_unfold")).Text;
                }
                catch (NoSuchElementException)
                {
                    // Row without a ".room_unfold" cell: keep the empty room type.
                }

                var roomTypeText = regRoomType.Match(roomType);
                var roomTypeName = row.FindElement(By.CssSelector("span.room_type_name")).Text;

                // Crop the price element out of the full-page screenshot and run OCR on it.
                using (Image priceImage = row.FindElement(By.CssSelector("span.base_price")).SnapshotV2(image2))
                {
                    var price = ORC_((Bitmap)priceImage);
                    Console.WriteLine("{0,-20}{1,-20}{2,-20}", roomTypeText.Value, roomTypeName, price);
                }
            }
        }

        Console.Read();
    }
    finally
    {
        // Always shut down the browser and the chromedriver process.
        driver.Quit();
    }
}
圖片識別方法:
// Shared Tesseract engine, created once from the local Emgu CV tessdata directory.
private static readonly Tesseract _ocr = new Tesseract(
    @"C:\Emgu\emgucv-windows-universal-cuda 2.9.0.1922\bin\tessdata",
    "eng",
    Tesseract.OcrEngineMode.OEM_TESSERACT_CUBE_COMBINED);

/// <summary>
/// Restricts OCR output to digits, since only numeric prices are recognised.
/// </summary>
static Program()
{
    _ocr.SetVariable("tessedit_char_whitelist", "0123456789");
}

/// <summary>
/// Runs OCR on the given bitmap and returns the recognised text.
/// </summary>
/// <param name="img">The image to recognise; may be null.</param>
/// <returns>
/// The recognised text, or an empty string when <paramref name="img"/> is null
/// or the recognition fails.
/// </returns>
public static string ORC_(Bitmap img)
{
    if (img == null)
    {
        return "";
    }

    try
    {
        // Both Emgu images are disposable wrappers; release them deterministically.
        using (Image<Bgr, byte> image = new Image<Bgr, byte>(img))
        using (Image<Gray, byte> gray = image.Convert<Gray, byte>())
        {
            _ocr.Recognize(gray);
            return _ocr.GetText();
        }
    }
    catch (Exception ex)
    {
        // Best effort: report the failure and signal it to the caller with "".
        Console.Error.WriteLine("OCR failed: " + ex.Message);
        return "";
    }
}
Selenium內置了截圖方法,但只能截取瀏覽器中當前顯示的內容,因此找到一個全屏截圖的方式(內置截圖+控制滾動條+圖片拼接):
/// <summary>
/// Captures the whole page by scrolling the viewport tile by tile, taking a screenshot of
/// each tile and stitching the tiles into one bitmap.
/// </summary>
/// <param name="_driver">
/// The driver showing the page; it must also implement <c>IJavaScriptExecutor</c> and
/// <c>ITakesScreenshot</c>.
/// </param>
/// <returns>
/// The stitched bitmap. If an error occurs, the error is written to standard error and the
/// result is null (failure before the bitmap was allocated) or a partially drawn bitmap.
/// </returns>
public static Bitmap GetEntereScreenshot(IWebDriver _driver)
{
    Bitmap stitchedImage = null;
    try
    {
        IJavaScriptExecutor js = (IJavaScriptExecutor)_driver;

        // Size of the full document and of the visible viewport, in CSS pixels.
        int totalWidth = ReadPageDimension(js, "return document.body.offsetWidth");
        int totalHeight = ReadPageDimension(js, "return document.body.parentNode.scrollHeight");
        int viewportWidth = ReadPageDimension(js, "return document.body.clientWidth");
        int viewportHeight = ReadPageDimension(js, "return window.innerHeight");

        // A non-positive step would make the tiling loops below run forever.
        if (viewportWidth <= 0 || viewportHeight <= 0)
        {
            throw new InvalidOperationException("The browser reported an empty viewport.");
        }

        // Split the page into viewport-sized tiles, row by row; the last tile of a row or
        // column is shrunk so that it does not extend past the page.
        List<Rectangle> rectangles = new List<Rectangle>();
        for (int top = 0; top < totalHeight; top += viewportHeight)
        {
            int tileHeight = Math.Min(viewportHeight, totalHeight - top);
            for (int left = 0; left < totalWidth; left += viewportWidth)
            {
                int tileWidth = Math.Min(viewportWidth, totalWidth - left);
                rectangles.Add(new Rectangle(left, top, tileWidth, tileHeight));
            }
        }

        stitchedImage = new Bitmap(totalWidth, totalHeight);
        using (Graphics g = Graphics.FromImage(stitchedImage))
        {
            Rectangle previous = Rectangle.Empty;
            foreach (Rectangle rectangle in rectangles)
            {
                // Scroll from the previous tile to the current one (nothing to do for the first).
                if (previous != Rectangle.Empty)
                {
                    int xDiff = rectangle.Right - previous.Right;
                    int yDiff = rectangle.Bottom - previous.Bottom;
                    js.ExecuteScript(String.Format(
                        System.Globalization.CultureInfo.InvariantCulture,
                        "window.scrollBy({0}, {1})", xDiff, yDiff));

                    // Give the page a moment to settle after scrolling.
                    System.Threading.Thread.Sleep(200);
                }

                var screenshot = ((ITakesScreenshot)_driver).GetScreenshot();

                // GDI+ requires the stream to stay open for the lifetime of the image, so the
                // stream and the image are kept alive together until the tile has been drawn.
                using (MemoryStream memStream = new MemoryStream(screenshot.AsByteArray))
                using (Image screenshotImage = Image.FromStream(memStream))
                {
                    // A shrunken edge tile sits at the bottom/right of the viewport, because
                    // the page was scrolled only by the size of that tile.
                    Rectangle sourceRectangle = new Rectangle(
                        viewportWidth - rectangle.Width,
                        viewportHeight - rectangle.Height,
                        rectangle.Width,
                        rectangle.Height);

                    g.DrawImage(screenshotImage, rectangle, sourceRectangle, GraphicsUnit.Pixel);
                }

                previous = rectangle;
            }
        }
    }
    catch (Exception ex)
    {
        // Best effort: report the problem and return whatever has been produced so far.
        Console.Error.WriteLine("GetEntereScreenshot failed: " + ex);
    }

    return stitchedImage;
}

// Runs a script that returns a number and converts the result to int; the driver may hand
// the value back as long or double depending on the expression.
private static int ReadPageDimension(IJavaScriptExecutor js, string script)
{
    return Convert.ToInt32(js.ExecuteScript(script), System.Globalization.CultureInfo.InvariantCulture);
}
最後是根據傳入的元素和全屏截圖,獲取價格元素的圖片:
/// <summary>
/// Crops the region occupied by <paramref name="element"/> out of a full-page screenshot.
/// </summary>
/// <param name="element">The element whose location and size define the crop region.</param>
/// <param name="bitmap">The full-page screenshot to crop from.</param>
/// <returns>A new image containing the part of the element that lies inside the screenshot.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="element"/> or <paramref name="bitmap"/> is null.
/// </exception>
/// <exception cref="ArgumentException">
/// The element has no area or lies completely outside the screenshot.
/// </exception>
public static Image SnapshotV2(this IWebElement element, Bitmap bitmap)
{
    if (element == null)
    {
        throw new ArgumentNullException("element");
    }

    if (bitmap == null)
    {
        throw new ArgumentNullException("bitmap");
    }

    // Location and Size are each fetched from the browser; read them once.
    Rectangle elementBounds = new Rectangle(element.Location, element.Size);

    // Clip against the screenshot: Bitmap.Clone fails (with OutOfMemoryException) when the
    // rectangle reaches outside the bitmap, which clamping the size alone does not prevent.
    Rectangle crop = Rectangle.Intersect(elementBounds, new Rectangle(Point.Empty, bitmap.Size));
    if (crop.Width <= 0 || crop.Height <= 0)
    {
        throw new ArgumentException("The element does not overlap the screenshot.", "element");
    }

    return bitmap.Clone(crop, bitmap.PixelFormat);
}
運行效果如下:
網站