Selenium&EmguCV實現爬蟲圖片識別

概述

爬蟲需要抓取網站價格。與一般的網頁抓取不同的是,要抓取的內容是透過 AJAX 加載的,而且價格是透過 CSS 背景圖片顯示的。

image

每個數字對應一個樣式,如 'p_h57_5':

/* Sample price class from the page: its background is a remote GIF drawn
   without repetition at a horizontal offset of -590px. */
.p_h57_5 {
background: url('http://pic.c-ctrip.com/priceblur/h57/3713de5c594648529f39d031243966dd.gif') no-repeat -590px;
padding: 0 6px;
font-size: 18px;
}

數字對應的樣式和對應的背景圖片都是動態改變的,而我們需要獲取每個房型的房價。雖然後來有了其它渠道獲取房價,這裏還是記錄一下用 Selenium 和 Emgu 抓取的方式。
流程:

1.Selenium訪問網址
2.全屏截圖
3.Selenium選擇器獲取房型等信息
4.Selenium選擇器獲取價格DOM元素,計算出價格元素的相對位置,截取價格圖片,使用Emgu識別價格並輸出

實現

static void Main(string[] args)
        {
            // Each Chrome switch must be its own argument; a single string holding
            // both switches is handed to the browser as one (unknown) switch.
            ChromeOptions options = new ChromeOptions();
            options.AddArguments("--start-maximized", "--disable-popup-blocking");
            var driver = new ChromeDriver(options);
            try
            {
                PrintRoomPrices(driver);
                // Keep the console (and the browser) open until a key is pressed.
                Console.Read();
            }
            finally
            {
                // Always shut down the browser and the chromedriver process,
                // even when scraping fails half way.
                driver.Quit();
            }
        }

        /// <summary>
        /// Opens the hotel page, takes a full-page screenshot, and prints one line
        /// per room row with the room type, the room name and the OCR-recognized price.
        /// </summary>
        /// <param name="driver">A started Chrome driver; it is not closed by this method.</param>
        /// <exception cref="InvalidOperationException">
        /// Thrown when the full-page screenshot could not be captured.
        /// </exception>
        private static void PrintRoomPrices(ChromeDriver driver)
        {
            driver.Navigate().GoToUrl("http://hotels.ctrip.com/hotel/992765.html");

            // The room table is loaded by AJAX; its presence marks the page as ready.
            // Until() returns as soon as the element exists, so a generous timeout
            // costs nothing on a fast page and avoids spurious timeouts on a slow one.
            new WebDriverWait(driver, TimeSpan.FromSeconds(10)).Until(
                ExpectedConditions.ElementExists(By.ClassName("htl_room_table")));

            ReadOnlyCollection<IWebElement> elementsList = driver.FindElementsByCssSelector("tr[expand]");

            // Hide the currency sign elements so only the digits remain in the price area.
            driver.ExecuteScript(@"
                var arr =  document.getElementsByTagName('dfn');
                for(var i=0;i<arr.length;i++){
                    arr[i].style.display = 'none';     
                }
            ");

            // Full-page screenshot; the price images are cropped out of it below.
            using (Bitmap fullPage = GetEntereScreenshot(driver))
            {
                if (fullPage == null)
                {
                    throw new InvalidOperationException("Full-page screenshot could not be captured.");
                }

                fullPage.Save(@"Z:\111.jpg");

                Console.WriteLine("{0,-20}{1,-20}{2,-20}", "房型", "類型", "房價");
                foreach (IWebElement row in elementsList)
                {
                    // Not every row carries a ".room_unfold" cell. FindElements returns
                    // an empty collection instead of throwing when it is missing.
                    ReadOnlyCollection<IWebElement> unfoldCells = row.FindElements(By.CssSelector(".room_unfold"));
                    string roomType = unfoldCells.Count > 0 ? unfoldCells[0].Text : "";

                    var roomTypeText = regRoomType.Match(roomType);

                    var roomTypeName = row.FindElement(By.CssSelector("span.room_type_name")).Text;

                    // Crop the price element out of the full-page screenshot and recognize it.
                    using (Image priceImage = row.FindElement(By.CssSelector("span.base_price")).SnapshotV2(fullPage))
                    {
                        var price = ORC_((Bitmap)priceImage);
                        Console.WriteLine("{0,-20}{1,-20}{2,-20}", roomTypeText.Value, roomTypeName, price);
                    }
                }
            }
        }

圖片識別方法

// Shared OCR engine, created once for the whole program from the "eng"
        // trained data found in the Emgu CV installation directory.
        private static Tesseract _ocr = new Tesseract(
            @"C:\Emgu\emgucv-windows-universal-cuda 2.9.0.1922\bin\tessdata",
            "eng",
            Tesseract.OcrEngineMode.OEM_TESSERACT_CUBE_COMBINED);

        // Runs after the field initializer above: sets the engine's character
        // whitelist to the ten decimal digits, since only prices are recognized.
        static Program()
        {
            _ocr.SetVariable("tessedit_char_whitelist", "0123456789");
        }
        /// <summary>
        /// Runs the shared OCR engine over the given bitmap and returns the text it reports.
        /// </summary>
        /// <param name="img">Bitmap to recognize; may be null.</param>
        /// <returns>
        /// The text returned by the OCR engine, or an empty string when
        /// <paramref name="img"/> is null or recognition throws.
        /// </returns>
        public static string ORC_(Bitmap img)
        {
            // "" signals that OCR could not be performed.
            if (img == null)
            {
                return "";
            }

            try
            {
                // Both Emgu images are disposed as soon as the text has been read.
                using (Image<Bgr, byte> color = new Image<Bgr, byte>(img))
                using (Image<Gray, byte> gray = color.Convert<Gray, byte>())
                {
                    _ocr.Recognize(gray);
                    return _ocr.GetText();
                }
            }
            catch (Exception)
            {
                // Deliberate best effort: any OCR failure is reported as "".
                return "";
            }
        }

Selenium內置的截圖方法只能截取瀏覽器中顯示的內容,因此找了一個全屏截圖的方式(內置截圖+控制滾動條,圖片拼接):

public static Bitmap GetEntereScreenshot(IWebDriver _driver)
        {
            // Captures the whole page by taking one viewport screenshot per tile,
            // scrolling between tiles, and stitching the tiles into a single bitmap.
            // Returns null when the page or viewport size cannot be determined or
            // when a failure occurs before the target bitmap exists. A failure after
            // that point is logged and the partially stitched bitmap is returned.
            // The page is left scrolled to the last tile.
            Bitmap stitchedImage = null;
            try
            {
                IJavaScriptExecutor js = (IJavaScriptExecutor)_driver;
                ITakesScreenshot camera = (ITakesScreenshot)_driver;

                // Convert.ToInt32 is used instead of an unboxing cast so the call
                // does not fail if the driver hands the number back as a double.
                int totalWidth = Convert.ToInt32(js.ExecuteScript("return document.body.offsetWidth"));
                int totalHeight = Convert.ToInt32(js.ExecuteScript("return  document.body.parentNode.scrollHeight"));

                // Size of the viewport
                int viewportWidth = Convert.ToInt32(js.ExecuteScript("return document.body.clientWidth"));
                int viewportHeight = Convert.ToInt32(js.ExecuteScript("return window.innerHeight"));

                // A zero step would make the tiling loops below run forever.
                if (totalWidth <= 0 || totalHeight <= 0 || viewportWidth <= 0 || viewportHeight <= 0)
                {
                    Console.Error.WriteLine("GetEntereScreenshot: page or viewport size is not positive.");
                    return null;
                }

                // The tiling below assumes the page starts at its top-left corner.
                js.ExecuteScript("window.scrollTo(0, 0)");

                // Split the page into viewport-sized tiles, row by row.
                List<Rectangle> tiles = new List<Rectangle>();
                for (int top = 0; top < totalHeight; top += viewportHeight)
                {
                    // The last row/column may be smaller than the viewport.
                    int tileHeight = Math.Min(viewportHeight, totalHeight - top);
                    for (int left = 0; left < totalWidth; left += viewportWidth)
                    {
                        int tileWidth = Math.Min(viewportWidth, totalWidth - left);
                        tiles.Add(new Rectangle(left, top, tileWidth, tileHeight));
                    }
                }

                stitchedImage = new Bitmap(totalWidth, totalHeight);
                using (Graphics g = Graphics.FromImage(stitchedImage))
                {
                    Rectangle previous = Rectangle.Empty;
                    foreach (Rectangle tile in tiles)
                    {
                        // Scroll from the previous tile to this one (not needed for the first tile).
                        if (previous != Rectangle.Empty)
                        {
                            int xDiff = tile.Right - previous.Right;
                            int yDiff = tile.Bottom - previous.Bottom;
                            js.ExecuteScript(String.Format(
                                System.Globalization.CultureInfo.InvariantCulture,
                                "window.scrollBy({0}, {1})", xDiff, yDiff));
                            // Give the page a moment to finish scrolling before the capture.
                            System.Threading.Thread.Sleep(200);
                        }

                        var screenshot = camera.GetScreenshot();

                        // The stream must stay open for as long as the Image created
                        // from it is in use, so both are disposed only after drawing.
                        using (MemoryStream memStream = new MemoryStream(screenshot.AsByteArray))
                        using (Image screenshotImage = Image.FromStream(memStream))
                        {
                            // A tile smaller than the viewport is read from the
                            // bottom/right part of the viewport screenshot.
                            Rectangle sourceRectangle = new Rectangle(
                                viewportWidth - tile.Width,
                                viewportHeight - tile.Height,
                                tile.Width,
                                tile.Height);

                            g.DrawImage(screenshotImage, tile, sourceRectangle, GraphicsUnit.Pixel);
                        }

                        previous = tile;
                    }
                }
            }
            catch (Exception ex)
            {
                // Best effort: report the failure instead of hiding it, and hand
                // back whatever has been stitched so far (possibly null).
                Console.Error.WriteLine("GetEntereScreenshot failed: " + ex);
            }
            return stitchedImage;
        }

最後是根據傳入的元素和全屏截圖,獲取價格元素的圖片:

public static Image SnapshotV2(this IWebElement element, Bitmap bitmap)
        {
            // Crops the area occupied by the element out of a full-page screenshot.
            // element: the element to cut out; its Location and Size are used as the crop area.
            // bitmap:  the screenshot to crop from.
            // Returns a new image holding the part of the element that lies inside
            // the screenshot, or null when no part of it does.
            // Throws ArgumentNullException when element or bitmap is null.
            if (element == null)
            {
                throw new ArgumentNullException("element");
            }
            if (bitmap == null)
            {
                throw new ArgumentNullException("bitmap");
            }

            // Clamp the crop area to the screenshot: Bitmap.Clone throws when the
            // requested rectangle reaches outside the source bitmap.
            Rectangle elementArea = new Rectangle(element.Location, element.Size);
            Rectangle bitmapArea = new Rectangle(0, 0, bitmap.Width, bitmap.Height);
            Rectangle crop = Rectangle.Intersect(elementArea, bitmapArea);

            if (crop.Width <= 0 || crop.Height <= 0)
            {
                return null;
            }

            return bitmap.Clone(crop, bitmap.PixelFormat);
        }

運行效果如下:
image

相關文章
相關標籤/搜索