爬蟲知識點整理
1、開發環境配置
1. jdk1.8.0_91
2. 數據庫 MongoDB 2.6 Standard
3. apache-activemq-5.13.2
4. commons-httpclient-3.0
5. 服務器 apache-tomcat-7.0.69
6. webmagic-core-0.5.2
2、項目需求
a) 從全國企業信息系統爬取到企業的工商註冊號,而後根據工商註冊號獲取該企業的詳細信息
b)預設大概的郵編號碼,而後從中國郵政官網獲取該郵編下的地址信息。
3、企業詳細信息的獲取
前置工做代碼以下
// Query the national enterprise publicity site's spot-check list, scrape the
// registration numbers out of the result page, and persist one detail-page
// crawl task per number into MongoDB.
CloseableHttpClient client = HttpClientUtil.getHttpClient();
HttpPost post = new HttpPost();
post.setURI(new URI("https://www.sgs.gov.cn/notice/search/ent_spot_check_list"));
// Form parameters expected by the search endpoint.
// NOTE(review): session.token is hard-coded and will expire — it should be
// obtained per session; confirm against the site's login flow.
List<BasicNameValuePair> formParams = new ArrayList<BasicNameValuePair>();
formParams.add(new BasicNameValuePair("captcha", ""));
formParams.add(new BasicNameValuePair("condition.pageNo", pageNo));
formParams.add(new BasicNameValuePair("condition.insType", "1"));
formParams.add(new BasicNameValuePair("session.token", "e07b51eb-8c11-4fea-9e3f-f9cb3c0b20e9"));
formParams.add(new BasicNameValuePair("condition.keyword", ""));
post.setEntity(new UrlEncodedFormEntity(formParams, "UTF-8"));

CloseableHttpResponse response = client.execute(post);
List<String> gongshangxinxi = new ArrayList<String>();
try {
    // FIX: the original only closed the response inside the 200 branch,
    // leaking the connection on every other status code.
    if (response.getStatusLine().getStatusCode() == 200) {
        HttpEntity resEntity = response.getEntity();
        if (resEntity != null) {
            // Parse the page and collect the text of every element with
            // class "center" (this is where the registration numbers live).
            String body = EntityUtils.toString(resEntity, "UTF-8");
            Document doc = Jsoup.parse(body);
            Elements content = doc.getElementsByClass("center");
            for (int i = 0; i < content.size(); i++) {
                // FIX: the original re-ran getElementsByClass("center") on an
                // element already selected by that class; text() is enough.
                gongshangxinxi.add(content.get(i).text());
            }
        }
        // Every even-indexed entry is a registration number — build one
        // detail-page crawl task for each of them.
        List<DataUrlCfgInfo> taskUrlCfgInfos = new ArrayList<DataUrlCfgInfo>();
        for (int i = 0; i < gongshangxinxi.size(); i++) {
            if (i % 2 == 0) {
                taskUrlCfgInfos.add(new DataUrlCfgInfo("全國公示網", "CMB00004",
                        "http://www.sgs.gov.cn/lz/etpsInfo.do?method=doSearch&searchType=2&keyWords=" + gongshangxinxi.get(i),
                        "", "html", "defaultResoureProcsor", "企業信息"));
            }
        }
        // Persist the batch of tasks to MongoDB.
        DataColctnTaskInfo dataColctnTaskInfo = new DataColctnTaskInfo("工商信息獲取", "CMB00004",
                "在全國公示網抓取企業信息數據", taskUrlCfgInfos);
        getMongoTemplate().insert(dataColctnTaskInfo);
        logger.info("添加任務:" + dataColctnTaskInfo.getTaskCode() + "成功");
        EntityUtils.consume(resEntity);
    }
} finally {
    response.close();
}
該段代碼主要是新建了一個post請求,添加了幾個請求參數,執行請求獲取到頁面信息,並將解析封裝好的信息插入到數據庫中。
httpClientUtil代碼以下
private static DefaultHttpClient httpClient; public static synchronized DefaultHttpClient getHttpClient() { if (null == httpClient) { // 初始化工做 try { KeyStore trustStore = KeyStore.getInstance(KeyStore .getDefaultType()); trustStore.load(null, null); SSLSocketFactoryEx ss = new SSLSocketFactoryEx(trustStore); SSLSocketFactory sf = ss; sf.setHostnameVerifier(SSLSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER); //容許全部主機的驗證 HttpParams params = new BasicHttpParams(); // HttpProtocolParams.setVersion(params); HttpProtocolParams.setContentCharset(params, HTTP.DEFAULT_CONTENT_CHARSET); HttpProtocolParams.setUseExpectContinue(params, true); // 設置鏈接管理器的超時 ConnManagerParams.setTimeout(params, 500000); // 設置鏈接超時 // HttpConnectionParams.s; // 設置socket超時 // HttpConnectionParams.SO_TIMEOUT; // 設置http https支持 SchemeRegistry schReg = new SchemeRegistry(); schReg.register(new Scheme("http", PlainSocketFactory .getSocketFactory(), 80)); schReg.register(new Scheme("https", sf, 443)); ClientConnectionManager conManager = new ThreadSafeClientConnManager( params, schReg); ((ThreadSafeClientConnManager)conManager).setMaxTotal(50); ((ThreadSafeClientConnManager)conManager).setDefaultMaxPerRoute(10);; httpClient = new DefaultHttpClient(conManager, params); httpClient.getParams().setParameter("http.socket.timeout", new Integer(500000)); } catch (Exception e) { e.printStackTrace(); return new DefaultHttpClient(); } } return httpClient; } static class SSLSocketFactoryEx extends SSLSocketFactory { SSLContext sslContext = SSLContext.getInstance("TLS"); public SSLSocketFactoryEx(KeyStore truststore) throws NoSuchAlgorithmException, KeyManagementException, KeyStoreException, UnrecoverableKeyException { super(truststore); TrustManager tm = new X509TrustManager() { @Override public java.security.cert.X509Certificate[] getAcceptedIssuers() { return null; } @Override public void checkClientTrusted( java.security.cert.X509Certificate[] chain, String authType) throws java.security.cert.CertificateException { 
} @Override public void checkServerTrusted( java.security.cert.X509Certificate[] chain, String authType) throws java.security.cert.CertificateException { } }; sslContext.init(null, new TrustManager[] { tm }, null); } @Override public Socket createSocket(Socket socket, String host, int port, boolean autoClose) throws IOException, UnknownHostException { return sslContext.getSocketFactory().createSocket(socket, host, port, autoClose); } @Override public Socket createSocket() throws IOException { return sslContext.getSocketFactory().createSocket(); } }
httpclientUtil類的主要做用是容許全部主機ssl的https的驗證,並對httpclient作一些配置例如timeout,線程大小等設置
後置工做代碼以下
// Pull the stored crawl tasks from the DB, run each one through the spider to
// obtain the registration number, then replay it as a POST against the
// detail page and return the resulting page content.
List<DataUrlCfgInfo> dataUrlCfgInfos = dataColctnRequest.getDataUrlCfgInfos();
// One request per task (this excerpt shows the body of that loop).
FutureColctdDataInfoGettingRequest request = new FutureColctdDataInfoGettingRequest(dataUrlCfgInfo);
request.putExtra("serialNo", serialNo);
futureTasks.add(request.getFutureTask());
// Hand the request to the webmagic spider.
getSpider().addRequest(new Request[]{request});
// Wait at most 5 minutes for the crawl result (the registration number).
ColctdDataInfo colctdDaraInfo = futureTask.get(5, TimeUnit.MINUTES);
SimpleHttpClientTool httClientTool = new SimpleHttpClientTool();
// FIX: the original called getColctdDataInfo() twice, performing the
// follow-up fetch twice; do it once and reuse the result.
ColctdDataInfo followUpInfo = httClientTool.getColctdDataInfo(colctdDaraInfo);
colctdDataInfos.add(followUpInfo);
DataColctnCreateFiles.createFile(followUpInfo, null); // persist the result to a file

HttpClient client = new HttpClient();
// NOTE(review): setHost() expects a bare hostname — "/lz/etpsInfo.do" is a
// path and does not belong here; confirm the target URL is actually set on
// the method returned by getPostMethod().
client.getHostConfiguration().setHost("www.sgs.gov.cn/lz/etpsInfo.do", 80, "http");
HttpMethod method = getPostMethod(colctdDataInfo.getContents()); // POST with the registration number
client.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);
Cookie[] cookies = client.getState().getCookies();
StringBuffer tmpcookies = new StringBuffer();
for (Cookie c : cookies) {
    tmpcookies.append(c.toString() + ";");
}
// The site rejects requests without a matching Referer — very important.
method.setRequestHeader("Referer", "http://www.sgs.gov.cn/lz/etpsInfo.do?method=doSearch");
method.setRequestHeader("User-Agent",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36");
int statusCode = client.executeMethod(method);
ColctdDataInfo colctdDataInfo1 = new ColctdDataInfo();
if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY) {
    // Redirected (typically to a login page) — report the target and give up.
    Header locationHeader = method.getResponseHeader("location");
    if (locationHeader != null) {
        System.out.println("SGSLogin:" + locationHeader.getValue());
    } else {
        System.err.println("Location field value is null.");
    }
    // FIX: the original never released the connection on the redirect path.
    method.releaseConnection();
    return null;
} else {
    try {
        String str = method.getResponseBodyAsString();
        List<String> list = new ArrayList<String>();
        list.add(str);
        colctdDataInfo1.setContents(list);
        colctdDataInfo1.setDataUrlCfgInfo(colctdDataInfo.getDataUrlCfgInfo());
        colctdDataInfo1.setVersionNo(colctdDataInfo.getVersionNo());
        colctdDataInfo1.setCreateDate(colctdDataInfo.getCreateDate());
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // FIX: release in finally so an IOException cannot leak the connection.
        method.releaseConnection();
    }
    return colctdDataInfo1;
}
這段代碼的大體流程就是首先從DB中取到爬取任務的信息,而後循環取出來以後,使用爬蟲工具爬取到結果(也就是工商註冊號),再把爬取結果做爲參數傳遞給一個post請求,繼續進行爬取獲取到頁面信息,將頁面信息保存在文件中。
4、郵編地址信息的獲取
前置代碼工做(拼裝郵編號,將郵編號碼插入db)以下
北京市爲例(郵編以10開頭市級部分最大26區級部分取到99)
// Build every candidate Beijing postcode: prefix "10" + city section 00..26
// + district section 00..99, returned as StringBuilders for the caller.
List<StringBuilder> shi = new ArrayList<StringBuilder>();
List<StringBuilder> location = new ArrayList<StringBuilder>();
// City section: inspection of Beijing postcodes shows the city part tops out at 26.
for (int city = 0; city <= 26; city++) {
    StringBuilder prefix = new StringBuilder();
    prefix.append(String.format("10%02d", city));
    shi.add(prefix);
}
// District section: enumerate 00..99 under each city prefix for full coverage.
for (StringBuilder prefix : shi) {
    for (int district = 0; district <= 99; district++) {
        StringBuilder full = new StringBuilder();
        full.append(prefix.toString()).append(String.format("%02d", district));
        location.add(full);
    }
}
return location;
而後就是拼裝好了插入db,代碼以下
List<StringBuilder> quanguo = getBeiJingPost(); //配置任務 List<DataUrlCfgInfo> taskUrlCfgInfos = new ArrayList<DataUrlCfgInfo>(); for(int i=0;i<quanguo.size();i++){ taskUrlCfgInfos.add(new DataUrlCfgInfo("中國郵政官網","CMB0000801",quanguo.get(i).toString(),"","html", "defaultResoureProcsor","郵編信息")); } //將任務插入db DataColctnTaskInfo dataColctnTaskInfo = new DataColctnTaskInfo("郵編地址信息","CMB0000801","在中國郵政官網抓取地址信息數據", taskUrlCfgInfos); getMongoTemplate().insert(dataColctnTaskInfo); logger.info("添加任務:"+dataColctnTaskInfo.getTaskCode()+"成功");
後置代碼部分
因爲驗證碼的緣由,我這邊作了簡單的匹配像素取值的方法,研究下來發現若是匹配樣圖的數量在400張左右的狀況下,經過率大概在百分之三十以上,仍是能夠的。
下載圖片代碼以下
HttpClient httpClient = new HttpClient(); GetMethod getMethod = new GetMethod( "http://www.cpdc.com.cn/web/api.php?op=checkcode&code_len=4&font_size=14&width=100&height=26&font_color=&background="); for (int i = 0; i < 100; i++) { try { // 執行getMethod int statusCode = httpClient.executeMethod(getMethod); if (statusCode != HttpStatus.SC_OK) { System.err.println("Method failed: " + getMethod.getStatusLine()); } // 讀取內容 InputStream inputStream = getMethod.getResponseBodyAsStream(); OutputStream outStream = new FileOutputStream(new File(DOWNLOAD_DIR, i + ".png")); IOUtils.copy(inputStream, outStream); outStream.close(); System.out.println("OK!"); } catch (Exception e) { e.printStackTrace(); } finally { // 釋放鏈接 getMethod.releaseConnection(); } }
下載好圖片以後對圖片進行處理,代碼以下
// Slice every downloaded captcha into per-digit images for manual labeling.
File dir = new File(DOWNLOAD_DIR);
File[] files = dir.listFiles(new ImageFileFilter("png"));
// FIX: listFiles() returns null when the directory is missing or unreadable;
// the original would NPE in the for-each below.
if (files == null) {
    files = new File[0];
}
int counter = 0;
for (File file : files) {
    BufferedImage image = ImageIO.read(file);
    // Strip interference lines/noise before splitting into digits.
    removeInterference(image);
    List<BufferedImage> digitImageList = splitImage(image);
    for (BufferedImage digit : digitImageList) {
        ImageIO.write(digit, "PNG", new File(TRAIN_DIR, "temp_" + counter++ + ".png"));
    }
}
System.out.println("生成供比對的圖片完畢,請到目錄中手工識別並重命名圖片,並刪除其它無關圖片!");
後置代碼部分
// Fetch a captcha image, recognize it via pixel matching, then page through
// the postcode query results using the captcha plus its session cookie.
ColctdDataInfo colctdDataInfo1 = new ColctdDataInfo();
String imageUrl = "http://www.cpdc.com.cn/web/api.php?op=checkcode&code_len=4&font_size=14&width=100&height=26&font_color=&background=&" + new Date().getTime();
HttpClient httpClient = new HttpClient(new MultiThreadedHttpConnectionManager());
httpClient.getParams().setParameter("http.protocol.content-charset", "utf-8");
httpClient.getParams().setContentCharset("utf-8");
httpClient.getParams().setSoTimeout(20000);
// The captcha image plus the cookie of the session that issued it.
ImageObject imageObject = getImage(imageUrl);
ImageProcess process = new ImageProcess();
String checkCode = process.getValidateCode(imageObject.getFile());
long time = new Date().getTime();
String makeUrl = makeUrl(dataUrlCfgInfo.getUrl(), checkCode, pageNo, time);
GetMethod getMethod = new GetMethod(makeUrl);
// Send the captcha session's cookie, otherwise the code will not validate.
getMethod.setRequestHeader("Cookie", imageObject.getCookies().toString());
int statusCode = httpClient.executeMethod(getMethod);
JSONObject json = (JSONObject) JSON.parse(IOUtils.toString(getMethod.getResponseBodyAsStream()));
// FIX: compare constant-first so a missing "checkcode" key cannot NPE
// (original: json.get("checkcode").equals(false)).
if (Boolean.FALSE.equals(json.get("checkcode"))) {
    // Captcha rejected — caller retries with a fresh image.
    getMethod.releaseConnection();
} else {
    int pageCount = JSON.parseObject(json.getString("pageinfo")).getIntValue("TOTALPAGE");
    if (pageCount > 0) {
        // Walk every result page with the same validated captcha + timestamp.
        for (int i = 1; i < pageCount + 1; i++) {
            String url = makeUrl(dataUrlCfgInfo.getUrl(), checkCode, i, time);
            GetMethod pageMethod = new GetMethod(url);
            pageMethod.setRequestHeader("Cookie", imageObject.getCookies().toString());
            try {
                httpClient.executeMethod(pageMethod);
                JSONObject json1 = (JSONObject) JSON.parse(IOUtils.toString(pageMethod.getResponseBodyAsStream()));
                System.out.println("抓取郵編地址成功!" + json1);
                colctdDataInfo1 = makeColctdDataInfo(json1, dataUrlCfgInfo, serialNo);
                DataColctnCreateFiles.createFile(colctdDataInfo1, null); // persist page to file
            } finally {
                // FIX: the original never released the per-page connection.
                pageMethod.releaseConnection();
            }
        }
    } else if (pageCount == 0) {
        colctdDataInfo1 = makeColctdDataInfo(json, dataUrlCfgInfo, serialNo);
        DataColctnCreateFiles.createFile(colctdDataInfo1, null); // persist page to file
    }
    getMethod.releaseConnection();
}
return colctdDataInfo1;
該段代碼主要流程是獲取到驗證碼的圖片,而後解析出具體驗證碼,將驗證碼做爲參數,另外將取得驗證碼操做的cookie保存下來,而後作一個get請求獲取到頁面信息,並保存下來頁面文件。
5、項目開發心得
單純從爬蟲這塊開發的話,首先要分析請求的類型,是get仍是post,而後看是不是http請求仍是https的請求。經過拼裝參數完成相關請求。其中有點就是關於動態繪製頁面信息的操做。因爲項目需求改動,以前關於動態繪製的代碼被刪除掉,這點我反而以爲是該項目我所學到的一些知識點吧,其中用到了selenium,並用到webDriver,new出一個chromeDriver,經過拼湊js,而後執行js,達到動態執行頁面的效果。其中我還大概研究了一下htmlUnit,這個是無瀏覽器執行頁面,也很方便。後續會進一步研究有關htmlUnit的部分。