上一篇《速賣通產品採集系列 之 產品採集分析》對速賣通產品採集做了分析,包含要採集哪些產品信息,以及如何採集這些產品信息。這一篇接着進行採集實戰,相關技術前篇已經說過,這裏不再贅述,直接建項目開始做。
一、建立解決方案,編寫採集代碼
1. 建立解決方案「CollectorSolution」,在其中新建名爲「Collector」的空 ASP.NET MVC 項目,解決方案結構圖如下:
2. 在「Collector」項目中,新增「CollectingController」控制器以及與之對應的視圖,並將原來的默認路由 Home -> Index 改爲 Collecting -> Index,截圖如下:
RouteConfig 修改如下:
1 using System.Web.Mvc; 2 using System.Web.Routing; 3
namespace Collector
{
    /// <summary>
    /// Registers the URL routes of the Collector MVC application.
    /// </summary>
    public class RouteConfig
    {
        /// <summary>
        /// Adds the ignore rule for <c>.axd</c> resources and the single "Default" route,
        /// whose default controller/action is <c>Collecting/Index</c>.
        /// </summary>
        /// <param name="routes">The route collection to populate.</param>
        public static void RegisterRoutes(RouteCollection routes)
        {
            routes.IgnoreRoute("{resource}.axd/{*pathInfo}");

            // Defaults applied when the URL omits controller, action or id.
            var routeDefaults = new
            {
                controller = "Collecting",
                action = "Index",
                id = UrlParameter.Optional
            };

            routes.MapRoute("Default", "{controller}/{action}/{id}", routeDefaults);
        }
    }
}
3. 分別新增「CollectionViewModel」、「CollectedProductViewModel」、「CollectedProductImageViewModel」三個視圖模型,以及一個存放正則表達式的結構體「ParseProductPatterns」,代碼分別如下:
1.> CollectionViewModel
1 using System.Collections.Generic; 2
namespace Collector.Models
{
    /// <summary>
    /// View model of the collecting page: the URL entered by the user and the
    /// products parsed from that page.
    /// </summary>
    public class CollectionViewModel
    {
        /// <summary>
        /// Creates the model with an empty product list so that
        /// <see cref="ProductViews"/> is never null right after construction.
        /// </summary>
        public CollectionViewModel()
        {
            this.ProductViews = new List<CollectedProductViewModel>();
        }

        /// <summary>Address of the product page to collect.</summary>
        public string CollectionUrl { get; set; }

        /// <summary>Products (one entry per variant) parsed from the page.</summary>
        public IEnumerable<CollectedProductViewModel> ProductViews { get; set; }
    }
}
2.> CollectedProductViewModel
1 using System.Collections.Generic; 2
namespace Collector.Models
{
    /// <summary>
    /// One collected product variant: name, prices, currency, colour, size and images.
    /// </summary>
    public class CollectedProductViewModel
    {
        /// <summary>
        /// Creates the model with an empty image list so that
        /// <see cref="ProductImages"/> is never null right after construction.
        /// </summary>
        public CollectedProductViewModel()
        {
            this.ProductImages = new List<CollectedProductImageViewModel>();
        }

        /// <summary>Product title.</summary>
        public string ProductName { get; set; }

        /// <summary>Regular price of the variant.</summary>
        public decimal ProductPrice { get; set; }

        /// <summary>Discounted price of the variant.</summary>
        public decimal ProductDiscountPrice { get; set; }

        /// <summary>Currency code of the prices.</summary>
        public string ProductCurrency { get; set; }

        /// <summary>Colour name of the variant.</summary>
        public string ProductColor { get; set; }

        /// <summary>Size name of the variant.</summary>
        public string ProductSize { get; set; }

        /// <summary>Images collected for the product.</summary>
        public IEnumerable<CollectedProductImageViewModel> ProductImages { get; set; }
    }
}
3.> CollectedProductImageViewModel
namespace Collector.Models
{
    /// <summary>
    /// A single collected product image together with its display order.
    /// </summary>
    public class CollectedProductImageViewModel
    {
        /// <summary>Address of the image.</summary>
        public string ImageUrl { get; set; }

        /// <summary>Sort position of the image.</summary>
        public int Sort { get; set; }
    }
}
4.> ParseProductPatterns
namespace Collector.Models
{
    /// <summary>
    /// Regular-expression patterns used to pull product data out of the raw HTML
    /// of a product page. All patterns use look-behind/look-ahead so that the
    /// match value is exactly the wanted text.
    /// </summary>
    /// <remarks>
    /// The fields are <c>readonly</c> so that no caller can replace a shared pattern
    /// at runtime. Literal dots in <c>window.runParams</c> are escaped; unescaped they
    /// would match any character.
    /// </remarks>
    public struct ParseProductPatterns
    {
        /// <summary>Text between the product-name <c>h1</c> tags.</summary>
        public static readonly string ProductNamePattern =
            "(?<=<h1 class=\"product-name\" itemprop=\"name\">).*?(?=</h1>)";

        /// <summary>JSON assigned to <c>var skuProducts</c>, up to <c>var skuAttrIds=</c>.</summary>
        public static readonly string ProductJsnPattern =
            @"(?<=var skuProducts=).*?(?=;\s*var skuAttrIds=)";

        /// <summary>JSON assigned to <c>window.runParams.imageBigViewURL</c>.</summary>
        public static readonly string ProductImageJsonPattern =
            "(?<=window\\.runParams\\.imageBigViewURL=).*?(?=;)";

        /// <summary>Quoted value assigned to <c>window.runParams.currencyCode</c>.</summary>
        public static readonly string ProductCurrencyPattern =
            "(?<=window\\.runParams\\.currencyCode=\").*?(?=\";)";

        /// <summary>
        /// Format string: <c>{0}</c> is the SKU id; matches the <c>title</c> of the
        /// <c>sku-1-*</c> anchor.
        /// </summary>
        public static readonly string ProductColorPattern =
            "(?<=<a data-role=\"sku\" data-sku-id=\"{0}\" id=\"sku-1-{0}\" title=\").*?(?=\")";

        /// <summary>
        /// Format string: <c>{0}</c> is the SKU id; matches the <c>span</c> text of the
        /// <c>sku-2-*</c> anchor.
        /// </summary>
        public static readonly string ProductSizePattern =
            "(?<=<a data-role=\"sku\" data-sku-id=\"{0}\" id=\"sku-2-{0}\" href=\"javascript:void\\(0\\)\"\\s+><span>).*?(?=</)";
    }
}
基本上都容易理解,我這裏就不再一一講解了。
4. 視圖佈局設計很簡單,如下圖:
採集地址 就是速賣通產品地址,這裏不支持店鋪和類型採集地址。表格就是採集產品信息展現。
5. 控制器和視圖代碼以下
1.> CollectingController
1 using System; 2 using System.Collections.Generic; 3 using System.Linq; 4 using System.Text.RegularExpressions; 5 using System.Web.Mvc; 6 using Collector.Models; 7 using Newtonsoft.Json.Linq; 8 using RestSharp; 9
namespace Collector.Controllers
{
    /// <summary>
    /// Downloads a product page from the URL posted by the user and parses the
    /// product variants (name, prices, currency, colour, size, images) out of its HTML.
    /// </summary>
    public class CollectingController : Controller
    {
        // GET: Collecting
        public ActionResult Index()
        {
            return View();
        }

        /// <summary>Collects the posted URL and renders the parsed products.</summary>
        /// <param name="collectionView">Model bound from the form; carries the URL.</param>
        [HttpPost]
        public ActionResult Index(CollectionViewModel collectionView)
        {
            collectionView = ColllectWithParse(collectionView);
            return View(collectionView);
        }

        /// <summary>
        /// Fetches <see cref="CollectionViewModel.CollectionUrl"/> and fills
        /// <see cref="CollectionViewModel.ProductViews"/> with the parsed products.
        /// </summary>
        /// <param name="collectionView">Model carrying the URL; may be null.</param>
        /// <returns>
        /// The same instance. It is returned untouched when it is null, when the URL is
        /// blank, or when the URL is not an absolute http/https address.
        /// </returns>
        public CollectionViewModel ColllectWithParse(CollectionViewModel collectionView)
        {
            if (collectionView == null || string.IsNullOrWhiteSpace(collectionView.CollectionUrl))
            {
                return collectionView;
            }

            // The URL comes straight from the form: trim the text-area whitespace and only
            // follow absolute http/https addresses, so the server is never asked to open
            // file:// or other schemes on behalf of the user.
            Uri productUri;
            var rawUrl = collectionView.CollectionUrl.Trim();
            var isWebAddress = Uri.TryCreate(rawUrl, UriKind.Absolute, out productUri)
                && (productUri.Scheme == Uri.UriSchemeHttp || productUri.Scheme == Uri.UriSchemeHttps);
            if (!isWebAddress)
            {
                return collectionView;
            }

            var client = new RestClient(productUri.AbsoluteUri);
            var request = new RestRequest(Method.GET);
            var response = client.Execute(request);

            collectionView.ProductViews = ParseProducts(response.Content);
            return collectionView;
        }

        /// <summary>
        /// Parses every SKU entry of the <c>skuProducts</c> JSON embedded in the page
        /// into a <see cref="CollectedProductViewModel"/>.
        /// </summary>
        /// <param name="productHtmlContent">Raw HTML of the product page; may be null or empty.</param>
        /// <returns>
        /// One model per SKU entry. The list is empty when the HTML is null/empty or
        /// does not contain the <c>skuProducts</c> JSON.
        /// </returns>
        public IEnumerable<CollectedProductViewModel> ParseProducts(string productHtmlContent)
        {
            var collectedProducts = new List<CollectedProductViewModel>();
            if (string.IsNullOrEmpty(productHtmlContent))
            {
                return collectedProducts;
            }

            // No match yields an empty string, which JArray.Parse would reject.
            var productJson = RegexMatchValue(ParseProductPatterns.ProductJsnPattern, productHtmlContent);
            if (string.IsNullOrWhiteSpace(productJson))
            {
                return collectedProducts;
            }

            var productName = RegexMatchValue(ParseProductPatterns.ProductNamePattern, productHtmlContent);
            var productCurrency = RegexMatchValue(ParseProductPatterns.ProductCurrencyPattern, productHtmlContent);
            var collectedImages = ParseProducImages(productHtmlContent);

            foreach (var sku in JArray.Parse(productJson))
            {
                // "skuPropIds" is read as "<colour id>,<size id>"; a missing value becomes
                // a single empty code, which simply matches no colour/size.
                var propIds = sku["skuPropIds"];
                var colorWithSizeCode = (propIds == null ? string.Empty : propIds.ToString()).Split(',');
                var priceJson = sku["skuVal"];

                collectedProducts.Add(new CollectedProductViewModel
                {
                    ProductName = productName,
                    ProductPrice = ReadPrice(priceJson, "skuPrice"),
                    ProductDiscountPrice = ReadPrice(priceJson, "actSkuPrice"),
                    ProductCurrency = productCurrency,
                    ProductColor = SetProductColorWithSize(ParseProductPatterns.ProductColorPattern, colorWithSizeCode.First(), productHtmlContent),
                    ProductSize = SetProductColorWithSize(ParseProductPatterns.ProductSizePattern, colorWithSizeCode.Last(), productHtmlContent),
                    ProductImages = collectedImages
                });
            }

            return collectedProducts;
        }

        /// <summary>
        /// Reads a price from the SKU price object. Returns 0 when the object, the key
        /// or the value is missing; otherwise parses with the invariant culture so the
        /// result does not depend on the server's regional settings.
        /// </summary>
        /// <exception cref="FormatException">The value is present but not a number.</exception>
        private static decimal ReadPrice(JToken priceJson, string key)
        {
            var priceToken = priceJson == null ? null : priceJson[key];
            if (priceToken == null)
            {
                return 0m;
            }

            // Format the underlying value with the invariant culture; JValue.ToString()
            // on a numeric value would use the current culture.
            var priceValue = priceToken as JValue;
            var priceText = priceValue == null
                ? priceToken.ToString()
                : Convert.ToString(priceValue.Value, System.Globalization.CultureInfo.InvariantCulture);
            if (string.IsNullOrWhiteSpace(priceText))
            {
                return 0m;
            }

            return decimal.Parse(
                priceText,
                System.Globalization.NumberStyles.Number,
                System.Globalization.CultureInfo.InvariantCulture);
        }

        /// <summary>
        /// Parses the image-URL JSON array embedded in the page. <c>Sort</c> is the
        /// zero-based position of the URL in that array.
        /// </summary>
        /// <returns>The images, or an empty list when the page has no image JSON.</returns>
        private IEnumerable<CollectedProductImageViewModel> ParseProducImages(string productHtmlContent)
        {
            var imagesJson = RegexMatchValue(ParseProductPatterns.ProductImageJsonPattern, productHtmlContent);
            if (string.IsNullOrWhiteSpace(imagesJson))
            {
                return new List<CollectedProductImageViewModel>();
            }

            var imageUrls = JArray.Parse(imagesJson).ToObject<List<string>>();

            // Materialised once: the same sequence is shared by every product.
            return imageUrls
                .Select((url, index) => new CollectedProductImageViewModel { ImageUrl = url, Sort = index })
                .ToList();
        }

        /// <summary>
        /// Substitutes the SKU code into a colour/size pattern and returns its match in
        /// <paramref name="input"/>. The code is regex-escaped so that page-supplied
        /// text cannot change the pattern.
        /// </summary>
        private string SetProductColorWithSize(string pattern, string colorWithSizeCode, string input)
        {
            var newPattern = string.Format(pattern, Regex.Escape(colorWithSizeCode ?? string.Empty));
            return RegexMatchValue(newPattern, input);
        }

        /// <summary>
        /// Returns the first match of <paramref name="pattern"/> in <paramref name="input"/>,
        /// or an empty string when there is no match or the input is null.
        /// </summary>
        private string RegexMatchValue(string pattern, string input, RegexOptions regexOptions = RegexOptions.IgnoreCase | RegexOptions.Singleline)
        {
            if (input == null)
            {
                return string.Empty;
            }

            return Regex.Match(input, pattern, regexOptions).Value;
        }
    }
}
2.> Collecting->Index
@model Collector.Models.CollectionViewModel
@*
    Collecting page: a form that posts the product URL to Collecting/Index and a
    table listing the products parsed from that page. The model is null on the
    initial GET, in which case the table is rendered with no rows.
*@
<!DOCTYPE html>

<html>
<head>
    <meta name="viewport" content="width=device-width" />
    <title></title>
    <!-- CSS goes in the document HEAD or added to your external stylesheet -->
    <style type="text/css">
        table.gridtable {
            font-family: verdana,arial,sans-serif;
            font-size: 11px;
            color: #333333;
            border-width: 1px;
            border-color: #666666;
            border-collapse: collapse;
        }

        table.gridtable th {
            border-width: 1px;
            padding: 8px;
            border-style: solid;
            border-color: #666666;
            background-color: #dedede;
        }

        table.gridtable td {
            border-width: 1px;
            padding: 8px;
            border-style: solid;
            border-color: #666666;
            background-color: #ffffff;
        }
    </style>
</head>
<body>
    <div>
        @using (Html.BeginForm("Index", "Collecting", FormMethod.Post))
        {
            <table>
                <tr>
                    <td>採集地址:</td>
                    <td>
                        @Html.TextAreaFor(m => m.CollectionUrl, 4, 0, new { style = "width:1500px;" })
                    </td>

                </tr>
                <tr><td colspan="2" style="text-align: right;"><input type="submit" value="開始採集" /></td></tr>
            </table>
        }
    </div>
    <div>
        <table class="gridtable">
            <thead>
                <tr>
                    <th width="5%">編號</th>
                    <th width="5%">圖片</th>
                    <th width="30%">產品名稱</th>

                    <th width="10%">產品單價</th>
                    <th width="10%">產品參考單價</th>
                    <th width="10%">產品幣別</th>
                    <th width="10%">產品顏色</th>
                    <th width="10%">產品大小</th>
                </tr>
            </thead>
            <tbody>
                @{
                    var rowNumber = 0;
                    // Fall back to an empty sequence instead of returning early: an early
                    // "return" here would stop rendering and leave the closing tags unwritten.
                    var productViews = Model != null && Model.ProductViews != null
                        ? Model.ProductViews
                        : Enumerable.Empty<Collector.Models.CollectedProductViewModel>();
                }
                @foreach (var collectedProduct in productViews)
                {
                    rowNumber++;
                    // A product may have no images; render an empty cell in that case.
                    var firstImage = collectedProduct.ProductImages == null
                        ? null
                        : collectedProduct.ProductImages.FirstOrDefault();
                    <tr>
                        <td align="center">@rowNumber</td>
                        <td>
                            @if (firstImage != null)
                            {
                                <img src="@firstImage.ImageUrl" width="60" height="60" />
                            }
                        </td>
                        <td>@collectedProduct.ProductName</td>
                        <td>@collectedProduct.ProductDiscountPrice</td>
                        <td>@collectedProduct.ProductPrice</td>
                        <td>@collectedProduct.ProductCurrency</td>
                        <td>@collectedProduct.ProductColor</td>
                        <td>@collectedProduct.ProductSize</td>
                    </tr>
                }

            </tbody>

        </table>
    </div>
</body>
</html>
這裏要說明的是,本篇只是採集的冰山一角的例子,所以沒有搞得很複雜,不論是前端還是後端,都沒有嚴格封裝,希望大家理解。另外本人不喜歡在代碼中加註釋,在我看來代碼就是註釋。
二、測試結果:將 MVC 項目部署到 IIS,端口號 1005,運行起來看效果。
1. 測試上一篇速賣通產品地址:
http://www.aliexpress.com/store/product/Yoga-Tops-Women-Women-Yoga-Shirts-Womens-Sportswear-Gym-Woman-Running-Shirt-Camisetas-Deporte-Mujer-Gym/1025110_32620359354.html?spm=a2g01.8032156.template-section-container.27.wcM8ES&sdom=3514.555719.493653.0_32620359354
效果截圖以下:
剛剛採集發現上一篇寫的這個產品地址,速賣通不打折,所以沒有了折扣價格。
2.再採集一個地址:
http://www.aliexpress.com/store/product/LEVEL-4-shock-Professional-running-intensive-training-without-rims-snow-sports-bra-open-front-zipper-style/1025110_32357688343.html?spm=2114.12010108.1000013.1.uvJqBj
截圖以下
這個產品的變體有不少,所以一個網頁還顯示不完。
源碼:https://github.com/haibozhou1011/Collector
總結:
好了,速賣通產品採集系列就全部結束了。總的來說,採集這個活用到的技術都是大家經常用的,關鍵在於前期分析,找出抓取產品信息的規則。每一個網站都有規律,大家留心觀察就會找到一些蛛絲馬跡,就會有所突破。希望大家如果有更好的採集方法,一定要和大家分享。