若有侵權,請聯繫作者刪除
水平有限,還望大牛指點
<!-- jsoup: the HTML fetching/parsing library imported by the TaobaoCatch class below (org.jsoup.*). -->
<!-- NOTE(review): TaobaoCatch also imports net.sf.json and org.apache.commons.lang; those artifacts are not declared in this snippet. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.8.3</version>
</dependency>
import com.sun.tools.doclets.formats.html.SourceToHTMLConverter; import net.sf.json.JSONArray; import net.sf.json.JSONObject; import org.apache.commons.lang.StringUtils; import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.net.URLEncoder; import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Created with Chenquan. * Description: 淘寶抓包 * Date: 2018-12-13 * Time: 15:12 */ public class TaobaoCatch { public static void main(String[] args) { int i = 0; /* String url = "https://acs.m.taobao.com/h5/mtop.taobao.wsearch.h5search/1.0/?jsv=2.3.16&appKey=12574478&t=1545023581359&sign=e3476c9041a75de0a9190da470204d93&api=mtop.taobao.wsearch.h5search&v=1.0&H5Request=true&ecode=1&type=jsonp&dataType=jsonp&callback=mtopjsonp1&data=%7B%22q%22%3A%22%E4%BB%99%E6%B6%B5%E5%86%85%E8%A1%A3%22%2C%22search%22%3A%22%E6%8F%90%E4%BA%A4%22%2C%22tab%22%3A%22all%22%2C%22sst%22%3A%221%22%2C%22n%22%3A20%2C%22buying%22%3A%22buyitnow%22%2C%22m%22%3A%22api4h5%22%2C%22token4h5%22%3A%22%22%2C%22abtest%22%3A%221%22%2C%22wlsort%22%3A%221%22%2C%22page%22%3A1%7D"; Connection con = Jsoup.connect(url); con.header("Cookie", "cna=TA+aFFGXQFUCAXQaRYGZVU8Q; t=efa81a9785cd86f885e13998b6d5f9cb; thw=cn; uc3=vt3=F8dByRzMU9X8Hvccr00%3D&id2=W8zLpWipxVFu&nk2=0PLo6GHZOM8%3D&lg2=V32FPkk%2Fw0dUvg%3D%3D; tracknick=%5Cu9648%5Cu94E81992; lgc=%5Cu9648%5Cu94E81992; _cc_=Vq8l%2BKCLiw%3D%3D; tg=0; enc=4rB%2FfKFx8DJKgPpoHlZjr824CEYw%2BlPaKBDWbFO4fnh6svGA97NoZNGERui4fOo2tXSnSVN1ygkfn5R5ekztTQ%3D%3D; hng=CN%7Czh-CN%7CCNY%7C156; mt=ci=0_1; _m_h5_tk=e501ac7690832934d663aef19ee36be5_1545033419107; _m_h5_tk_enc=5147579a652b4fb508dc886d59c37045; isg=BFVVgDOkpYNz64H7Z31pC9thZFHP-goqhI4h7tf6EUwbLnUgn6IZNGPv_DSYLiEc"); // con.header("referer", 
"https://item.taobao.com/item.htm "); Connection.Response resp = null; try { resp = con.method(Connection.Method.GET).ignoreContentType(true).execute(); } catch (IOException e) { e.printStackTrace(); } String body = resp.body(); // System.out.println(body); body = body.substring(12, body.length() - 1); JSONObject jb = JSONObject.fromObject(body); JSONArray jsonArray = jb.getJSONObject("data").getJSONArray("listItem"); // while(i<100){ i++; for (int j = 0; j < jsonArray.size(); j++) { JSONObject jsonObject = jsonArray.getJSONObject(j); String item_id = jsonObject.getString("item_id"); System.out.println("item_id: "+item_id); getAll(item_id); } // }*/ //傳連接上的產品id getAll("577996531297"); } public static void getAll(String item_id ) { try { Thread.sleep(2000);//一個休息5s,太快會被禁 } catch (InterruptedException e) { e.printStackTrace(); } System.out.println("開始時間:" + new Date()); Date dateStart = new Date(); Document doc = null; String id = ""; try { // int i = 0; // while(i < 10000){ // i++; String url = "https://item.taobao.com/item.htm?id="+item_id; id = getParamByUrl(url, "id"); doc = Jsoup.connect(url).ignoreContentType(true).get(); /* String url = "https://h5api.m.taobao.com/h5/mtop.taobao.detail.getdetail/6.0/?data=";//手機的html 5 頁面 ,爲了獲取庫存、價格 String enc = "{\"itemNumId\":\"582061497975\"}"; String gbk = URLEncoder.encode(enc, "utf-8"); String sds = url + gbk; System.out.println("庫存、價格"+sds); doc = Jsoup.connect(sds).ignoreContentType(true).get();*/ //設置請求頭 // Connection con = Jsoup.connect(url); // con.header("Cookie", " enc=1LWJWtPGgf6MF1NVsn2rbeRb3%2FU1%2Fk5ZiiedHbVedmxmfvUUWDPmFeyKeLYl7NVchBB19JCIVnX0eFv4otK9HA%3D%3D;" + // "x5sec=7b2264657461696c736b69703b32223a226235653133353933646637396131353230343663346139633633653038326465434c6a4e7a654146454e447739724732716644534b426f4c4f4455774d7a51304e7a4d794f7a453d227d;" ); // con.header("referer", "https://item.taobao.com/item.htm "); // Connection.Response resp=con.method(Connection.Method.GET).execute(); // 
Map<String,String> cookies = resp.cookies(); // Connection.Request request = con.request(); // String body = resp.body(); } catch (IOException e) { e.printStackTrace(); } if (doc.baseUri().contains("tmall")) { System.out.println("商品名稱:"+ doc.select("h1[data-spm=\"1000983\"]").text()); }else { System.out.println("商品名稱:" + doc.select("h3[class=\"tb-main-title\"]").text()); } Elements imgSrcElement = doc.select("#J_UlThumb > li"); for (Element element : imgSrcElement) { String imgSrc = ""; if (element.baseUri().contains("tmall")){ imgSrc = element.getElementsByTag("img").attr("src"); }else{ imgSrc = element.getElementsByTag("img").attr("data-src"); } imgSrc = imgSrc.replaceFirst("//img.alicdn.com/imgextra/", ""); imgSrc = imgSrc.substring(0, imgSrc.length() - 10); // imgSrc = imgSrc.replaceAll("_60x60q90.jpg",""); //處理掉沒必要要的數據 System.out.println("主圖url:" + imgSrc); } // 規格參數 Elements selectRules = doc.select(".J_TSaleProp"); List<List<String>> liHashMap = new ArrayList<>(); for (Element ulElement : selectRules) { String ul = ulElement.getElementsByTag("ul").attr("data-property"); System.out.println("ul:" + ul); List<String> liString = new ArrayList<>(); for (Element liElement : ulElement.getElementsByTag("li")) { String liDataValue = liElement.getElementsByTag("li").attr("data-value"); System.out.println("liDataValue: " + liDataValue); liString.add(liDataValue); String aStyle = liElement.getElementsByTag("a").attr("style"); if (StringUtils.isNotBlank(aStyle)) { aStyle = aStyle.replaceAll("background:url\\(", ""); aStyle = aStyle.substring(0, aStyle.length() - 29); // aStyle = aStyle.replaceAll("_40x40q90.jpg\\) center no-repeat;", ""); System.out.println("aStyle: " + aStyle); } String spanText = liElement.getElementsByTag("span").text(); if (StringUtils.isNotBlank(spanText)) { System.out.println("spanText: " + spanText); } } liHashMap.add(liString); } List<String> combination = test.combination(liHashMap); //獲取價格、庫存 Elements eles = doc.getElementsByTag("script"); for 
(Element ele : eles) { String s = ele.toString(); if (!ele.baseUri().contains("tmall")) {//淘寶 String rgex = ""; String subUtilSimple = ""; if (s.contains("skuMap")) { //獲取sku的id rgex = "skuMap(.*?)propertyMemoMap"; String skuId = s.replaceAll("\\s*", ""); // System.out.println(s); subUtilSimple = getSubUtilSimple(skuId, rgex); subUtilSimple = subUtilSimple.substring(1, subUtilSimple.length() - 1); // JSONObject jb = JSONObject.fromObject(subUtilSimple); JSONObject finalJb = jb; List<String> skuList = new ArrayList<>(); combination.forEach(p->{ JSONObject jsonObject = finalJb.getJSONObject(";" + p + ";"); if (!jsonObject.isNullObject()) { String o = jsonObject.getString("skuId"); System.out.println("sku的id: " + o); skuList.add(o); } }); String url = "https://h5api.m.taobao.com/h5/mtop.taobao.detail.getdetail/6.0/?data=";//手機的html 5 頁面 ,爲了獲取庫存、價格 String enc = "{\"itemNumId\":\"" + id + "\"}"; String substore = ""; String store = ""; try { String gbk = URLEncoder.encode(enc, "utf-8"); String sds = url + gbk; System.out.println("庫存、價格" + sds); doc = Jsoup.connect(sds).ignoreContentType(true).get(); store = doc.toString(); rgex = "sku2info(.*?)skuItem"; substore = getSubUtilSimple(store, rgex); substore = substore.substring(3, substore.length() - 3); String sub = substore.replaceAll("\\\\", "").replaceAll("\\s*", ""); JSONObject sb = JSONObject.fromObject(sub); skuList.stream().forEach(p->{ if (sb.has(p)) {//判斷是否有值,沒值不取,否則會報錯 String string = sb.getString(p); System.out.println("淘寶的價格庫存==============" + string); } }); } catch (Exception e) { System.out.println("報錯的地方store:" + store); // System.out.println("報錯的地方substore:" + substore); e.printStackTrace(); System.out.println("=====================================程序報錯,提早結束===================================================" ); return; } } if (s.contains("descUrl") && s.contains("counterApi")) { // System.out.println(s); //詳情連接 rgex = "protocol(.*?)desc\\.alicdn\\.com"; subUtilSimple = getSubUtilSimple(s, rgex); 
subUtilSimple = subUtilSimple.substring(14, subUtilSimple.length() - 7); System.out.println("詳情連接: " + subUtilSimple); try { doc = Jsoup.connect("http:" + subUtilSimple).get(); } catch (IOException e) { e.printStackTrace(); } Elements imgDetail = doc.getElementsByTag("img"); for (Element element : imgDetail) { String imgSrc = element.getElementsByTag("img").attr("src"); // imgSrc = imgSrc.replaceFirst("//img.alicdn.com/imgextra/",""); // imgSrc = imgSrc.replaceAll("_60x60q90.jpg",""); //處理掉沒必要要的數據 if (StringUtils.isNotBlank(imgSrc)) { System.out.println("詳情圖url:" + imgSrc); } } } }else {//天貓的 if (s.contains("TShop.Setup")) { // String rgex = "<bdocid>(.*?)</bdocid>"; String rgex = "skuMap(.*?)salesProp"; String subUtilSimple = getSubUtilSimple(s, rgex); subUtilSimple = subUtilSimple.substring(2, subUtilSimple.length() - 2); JSONObject jb = JSONObject.fromObject(subUtilSimple); List<String> skuList = new ArrayList<>(); combination.forEach(p->{ JSONObject jsonObject = jb.getJSONObject(";" + p + ";"); if (!jsonObject.isNullObject()) { String skuId = jsonObject.getString("skuId"); System.out.println(skuId); skuList.add(skuId); } }); //庫存、價格 String url = "https://h5api.m.taobao.com/h5/mtop.taobao.detail.getdetail/6.0/?data=";//手機的html 5 頁面 ,爲了獲取庫存、價格 String enc = "{\"itemNumId\":\"" + id + "\"}"; String substore = ""; String store = ""; try { String gbk = URLEncoder.encode(enc, "utf-8"); String sds = url + gbk; System.out.println("庫存、價格" + sds); doc = Jsoup.connect(sds).ignoreContentType(true).get(); store = doc.toString(); rgex = "sku2info(.*?)skuItem"; substore = getSubUtilSimple(store, rgex); substore = substore.substring(3, substore.length() - 3); String sub = substore.replaceAll("\\\\", "").replaceAll("\\s*", ""); JSONObject sb = JSONObject.fromObject(sub); skuList.stream().forEach(p->{ if (sb.has(p)) {//判斷是否有值,沒值不取,否則會報錯 String string = sb.getString(p); System.out.println("天貓的價格庫存==============" + string); } }); } catch (Exception e) { 
System.out.println("報錯的地方store:" + store); // System.out.println("報錯的地方substore:" + substore); e.printStackTrace(); System.out.println("=====================================程序報錯,提早結束===================================================" ); return; } //詳情連接 rgex = "httpsDescUrl(.*?)fetchDcUrl"; subUtilSimple = getSubUtilSimple(s, rgex); subUtilSimple = subUtilSimple.substring(3, subUtilSimple.length() - 3); System.out.println(subUtilSimple); try { doc = Jsoup.connect("http:"+subUtilSimple).get(); } catch (IOException e) { e.printStackTrace(); } Elements imgDetail = doc.getElementsByTag("img"); for (Element element :imgDetail) { String imgSrc = element.getElementsByTag("img").attr("src"); // imgSrc = imgSrc.replaceFirst("//img.alicdn.com/imgextra/",""); // imgSrc = imgSrc.replaceAll("_60x60q90.jpg",""); //處理掉沒必要要的數據 System.out.println("詳情圖url:"+imgSrc); } break; } } } System.out.println("結束時間:" + new Date()); Date dateEnd = new Date(); long number = dateEnd.getTime()-dateStart.getTime(); //而後在將毫秒轉換爲date類型就能夠了 System.out.println("時間差爲: "+number/1000); } /** * 返回單個字符串,若匹配到多個的話就返回第一個,方法與getSubUtil同樣 * * @param soap * @param rgex * @return */ public static String getSubUtilSimple(String soap, String rgex) { Pattern pattern = Pattern.compile(rgex);// 匹配的模式 Matcher m = pattern.matcher(soap); while (m.find()) { return m.group(1); } return ""; } /** * 獲取指定url中的某個參數 * * @param url * @param name * @return */ public static String getParamByUrl(String url, String name) { url += "&"; String pattern = "(\\?|&){1}#{0,1}" + name + "=[a-zA-Z0-9]*(&{1})"; Pattern r = Pattern.compile(pattern); Matcher m = r.matcher(url); if (m.find()) { // System.out.println(m.group(0)); return m.group(0).split("=")[1].replace("&", ""); } else { return null; } } }
import java.util.ArrayList;
import java.util.List;

/**
 * Created with Chenquan.
 * Description: builds every combination of elements drawn from several lists.
 * Date: 2018-12-16
 * Time: 10:27
 */
public class test {

    /**
     * Demo: prints all combinations of three 3-element lists.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        List<String> li = new ArrayList<>();
        li.add("aa");
        li.add("bb");
        li.add("cc");
        List<String> bi = new ArrayList<>();
        bi.add("ee");
        bi.add("rr");
        bi.add("tt");
        List<String> ci = new ArrayList<>();
        ci.add("yy");
        ci.add("uu");
        ci.add("ii");
        List<List<String>> list = new ArrayList<>();
        list.add(li);
        list.add(bi);
        list.add(ci);
        List<String> combination = combination(list);
        System.out.println(combination);
    }

    /**
     * Combines the elements of several lists: one element of every list, in
     * list order, joined with {@code ';'}.
     *
     * @param groups the lists to combine
     * @return all combinations in a new list; an empty list (never {@code null})
     *         when {@code groups} or any list in it is {@code null} or empty
     */
    public static List<String> combination(List<List<String>> groups) {
        if (invalid(groups) || invalid(groups.get(0))) {
            return new ArrayList<>();
        }
        // Copy the first group so the result never aliases the caller's list.
        List<String> combine = new ArrayList<>(groups.get(0));
        for (int i = 1; i < groups.size(); i++) {
            combine = cartesianProduct(combine, groups.get(i));
            if (combine.isEmpty()) {
                // An empty group makes every combination impossible.
                return combine;
            }
        }
        return combine;
    }

    /**
     * Combines the elements of two lists as {@code "s;t"} for every {@code s}
     * in {@code c1} and {@code t} in {@code c2}, with {@code c1} as the outer
     * order.
     *
     * @param c1 first list
     * @param c2 second list
     * @return all pairs in a new list; an empty list (never {@code null}) when
     *         either list is {@code null} or empty
     */
    public static List<String> cartesianProduct(List<String> c1, List<String> c2) {
        List<String> combine = new ArrayList<>();
        if (invalid(c1) || invalid(c2)) {
            return combine;
        }
        for (String s : c1) {
            for (String t : c2) {
                combine.add(String.format("%s;%s", s, t));
            }
        }
        return combine;
    }

    /**
     * Tells whether a list is unusable.
     *
     * @param c the list
     * @return {@code true} when {@code c} is {@code null} or empty
     */
    private static boolean invalid(List<?> c) {
        return c == null || c.isEmpty();
    }
}