I recently started a new project that needs a large amount of movie data. Maoyan Movies happens to have plenty of it, so I went to Maoyan to scrape the data.
1. First, look at the page URLs. It turns out the movies are simply numbered in sequence, which makes this easy.
2. Next, analyze the page itself; this time we mainly scrape the content inside the yellow boxes. Press F12 in the browser to inspect the elements; once those divs are extracted, the job is basically done.
The code is below.
Main function
using System;
using System.IO;
using System.Net;
using System.Text;
using System.Threading;
using HtmlAgilityPack;

static void Main(string[] args)
{
    int errorCount = 0;   // number of failed scrapes
    int start = 401;      // first movie id to crawl
    int count = 450;      // last movie id to crawl
    for (int i = start; i <= count; i++)
    {
        Thread.Sleep(2000); // wait two seconds between requests, don't put pressure on the server
        try
        {
            HtmlWeb web = new HtmlWeb();
            // https://maoyan.com/films/1
            web.OverrideEncoding = Encoding.UTF8;
            HtmlDocument doc = web.Load($"https://maoyan.com/films/{i}"); // the 1 in the url is replaced by i

            HtmlDocument htmlDoc = new HtmlDocument();
            string url = $"https://maoyan.com/films/{i}";

            // movie title
            HtmlNode MovieTitle = doc.DocumentNode.SelectSingleNode("//div[@class='movie-brief-container']/h1[@class='name']"); // div found by inspecting the page structure
            if (MovieTitle == null) // null means we were redirected to the verification page, so try the second method
            {
                string urlResponse = URLRequest(url);
                htmlDoc.LoadHtml(urlResponse);
                MovieTitle = htmlDoc.DocumentNode.SelectSingleNode("//div[@class='movie-brief-container']/h1[@class='name']");
                if (MovieTitle == null) // still null: the second method failed as well
                {
                    // at this point you have to open a browser and complete the verification manually,
                    // or analyse the verification page yourself and automate it
                }
            }
            string title = MovieTitle.InnerText;
            //Console.WriteLine(MovieTitle.InnerText);

            // movie poster
            HtmlNode MovieImgSrc = doc.DocumentNode.SelectSingleNode("//div[@class='celeInfo-left']/div[@class='avatar-shadow']/img[@class='avatar']");
            if (MovieImgSrc == null)
            {
                MovieImgSrc = htmlDoc.DocumentNode.SelectSingleNode("//div[@class='celeInfo-left']/div[@class='avatar-shadow']/img[@class='avatar']");
            }
            //Console.WriteLine(MovieImgSrc.GetAttributeValue("src", ""));
            string imgurl = MovieImgSrc.GetAttributeValue("src", "");

            // movie genres, region/runtime and release date
            HtmlNodeCollection MovieTypes = doc.DocumentNode.SelectNodes("//div[@class='movie-brief-container']/ul/li[@class='ellipsis']");
            if (MovieTypes == null)
            {
                MovieTypes = htmlDoc.DocumentNode.SelectNodes("//div[@class='movie-brief-container']/ul/li[@class='ellipsis']");
            }
            string types = "", artime = "", releasetime = "";

            foreach (var item in MovieTypes[0].ChildNodes)
            {
                if (item.InnerText.Trim() != "")
                {
                    //Console.WriteLine(item.InnerText.Trim());
                    types += item.InnerText.Trim() + "-";
                }
            }
            artime = MovieTypes[1].InnerText;
            releasetime = MovieTypes[2].InnerText;

            // synopsis
            string intro = "";
            HtmlNode introduction = doc.DocumentNode.SelectSingleNode("//div[@class='mod-content']/span[@class='dra']");
            if (introduction == null)
            {
                introduction = htmlDoc.DocumentNode.SelectSingleNode("//div[@class='mod-content']/span[@class='dra']");
            }
            //Console.WriteLine(introduction.InnerText);
            intro = introduction.InnerText;

            using (FileStream fs = new FileStream(@"d:\Sql.txt", FileMode.Append, FileAccess.Write))
            {
                fs.Lock(0, fs.Length);
                StreamWriter sw = new StreamWriter(fs);
                // note: single quotes inside title/intro are not escaped here, so a quote in the
                // scraped text will break the generated INSERT statement
                sw.WriteLine($"INSERT INTO Movies VALUES('{title}','{imgurl}','{types}','{artime}','{releasetime}','{intro.Trim()}');");
                fs.Unlock(0, fs.Length); // must be called before Flush(), otherwise an exception is thrown
                sw.Flush();
            }
        }
        catch (Exception ex)
        {
            errorCount++;
            Console.WriteLine(ex);
        }
    }
    Console.WriteLine($"Done. Succeeded: {count - start + 1 - errorCount}, failed: {errorCount}");
    Console.ReadLine();
}
The URLRequest method
static string URLRequest(string url)
{
    // build the request
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

    // use GET
    request.Method = "GET";
    request.Timeout = 6000; // 6 second timeout
    request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36";

    string responseContent = null;

    // get the response
    using (WebResponse response = request.GetResponse())
    {
        using (Stream stream = response.GetResponseStream())
        {
            // read the response stream
            using (StreamReader streamreader = new StreamReader(stream))
            {
                responseContent = streamreader.ReadToEnd();
            }
        }
    }

    return responseContent;
}
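As a side note, HttpWebRequest is considered legacy on newer .NET versions, where HttpClient is the usual choice. Below is a minimal sketch of the same GET request written with HttpClient; the name URLRequestAsync is mine, and this is only an illustration of the idea, not the code that produced the results in this post.

// Sketch only. Requires the System.Net.Http and System.Threading.Tasks namespaces.
// Reuse a single HttpClient instance instead of creating one per request.
static readonly HttpClient client = new HttpClient
{
    Timeout = TimeSpan.FromSeconds(6) // matches the 6000 ms timeout above
};

static async Task<string> URLRequestAsync(string url)
{
    using (var request = new HttpRequestMessage(HttpMethod.Get, url))
    {
        request.Headers.UserAgent.ParseAdd(
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36");
        using (HttpResponseMessage response = await client.SendAsync(request))
        {
            response.EnsureSuccessStatusCode(); // throws on 403 and other error status codes
            return await response.Content.ReadAsStringAsync();
        }
    }
}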
We first enter the for loop; replacing the 1 in the URL with i is what lets the crawler walk through every movie automatically.
To parse the HTML I use the third-party library HtmlAgilityPack, which you can find on NuGet.
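For anyone who has not used HtmlAgilityPack before, the sketch below is roughly all it takes to get started. SelectSingleNode takes an XPath expression and returns null when nothing matches, which is exactly the check the crawler above relies on.

// Install the package first, e.g.
//   dotnet add package HtmlAgilityPack
// or, in the Visual Studio Package Manager Console:
//   Install-Package HtmlAgilityPack
using System;
using HtmlAgilityPack;

class Demo
{
    static void Main()
    {
        var web = new HtmlWeb();
        HtmlDocument doc = web.Load("https://maoyan.com/films/1");

        // Returns null if the node is missing (for example on the verification page).
        HtmlNode name = doc.DocumentNode.SelectSingleNode(
            "//div[@class='movie-brief-container']/h1[@class='name']");
        Console.WriteLine(name?.InnerText ?? "node not found");
    }
}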
I turn the scraped data into SQL statements and append them to Sql.txt in the root of the D drive.
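One caveat about those generated INSERT statements: a single quote inside a title or synopsis will break the SQL. Below is a small sketch of the escaping I would add, assuming SQL Server style quoting; the SqlEscape helper is my own name and not part of the code above, and a parameterized SqlCommand would be safer still.

// Sketch: double any single quotes before embedding scraped text in a SQL literal.
static string SqlEscape(string value) => value?.Replace("'", "''") ?? "";

// Used when building the statement, e.g.:
// sw.WriteLine($"INSERT INTO Movies VALUES('{SqlEscape(title)}','{imgurl}','{types}'," +
//              $"'{artime}','{releasetime}','{SqlEscape(intro.Trim())}');");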
Here are the results.
In total I scraped 200 records.
Note: if the program throws a null reference exception, it means the expected div was not found, and that in turn means Maoyan has redirected you to its verification center page. You need to open the site in a browser and complete the verification, or switch to a different IP.
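If you would rather have the crawler skip those pages and keep going instead of throwing, the null check can be turned into an explicit skip that records the failed ids for a second pass. A minimal sketch of that idea follows; the retryIds list is something I am adding for illustration and is not part of the code above.

// Declared before the for loop (requires System.Collections.Generic):
var retryIds = new List<int>();

// Inside the loop, after both lookup attempts have failed:
if (MovieTitle == null)
{
    retryIds.Add(i); // remember this movie id for a later manual pass
    Console.WriteLine($"id {i}: redirected to the verification page, skipping");
    continue;        // move on to the next id
}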
One final reminder: fetch the data slowly, otherwise you will start getting 403 responses.
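The fixed two-second Thread.Sleep already helps with that; a randomized delay, with a longer pause after a failure, makes the traffic look a little less mechanical. A small sketch, not tuned against Maoyan specifically:

// Sketch: randomized delay between requests, with a longer back-off after a failure.
static readonly Random rng = new Random();

static void PoliteDelay(bool lastRequestFailed)
{
    int delayMs = rng.Next(2000, 5000);  // 2-5 seconds in the normal case
    if (lastRequestFailed)
        delayMs *= 10;                   // back off much longer after an error such as a 403
    Thread.Sleep(delayMs);
}

Calling PoliteDelay at the top of the loop in place of the fixed Thread.Sleep(2000) is the only change needed.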