Web Crawler Project: Knowledge Points


1. Development Environment

    1. jdk1.8.0_91

    2. Database: MongoDB 2.6 Standard

    3. apache-activemq-5.13.2

    4. commons-httpclient-3.0

    5. Application server: apache-tomcat-7.0.69

    6. webmagic-core-0.5.2

2. Project Requirements

    a) Crawl business registration numbers from the national enterprise information system, then use each registration number to fetch that enterprise's detailed information.

    b) Pre-generate a range of candidate postal codes, then fetch the address information under each code from the official China Post website.

3. Fetching Enterprise Details

    The preparation code is as follows:

CloseableHttpClient client = HttpClientUtil.getHttpClient();
HttpPost post = new HttpPost();
post.setURI(new URI("https://www.sgs.gov.cn/notice/search/ent_spot_check_list"));
// pageNo comes from the enclosing method; the session token was copied from a live session
BasicNameValuePair simcard1 = new BasicNameValuePair("captcha", "");
BasicNameValuePair simcard2 = new BasicNameValuePair("condition.pageNo", pageNo);
BasicNameValuePair simcard3 = new BasicNameValuePair("condition.insType", "1");
BasicNameValuePair simcard4 = new BasicNameValuePair("session.token", "e07b51eb-8c11-4fea-9e3f-f9cb3c0b20e9");
BasicNameValuePair simcard5 = new BasicNameValuePair("condition.keyword", "");
List<BasicNameValuePair> formParams = new ArrayList<BasicNameValuePair>();
formParams.add(simcard1);
formParams.add(simcard2);
formParams.add(simcard3);
formParams.add(simcard4);
formParams.add(simcard5);
UrlEncodedFormEntity uefEntity = new UrlEncodedFormEntity(formParams, "UTF-8");
post.setEntity(uefEntity);

CloseableHttpResponse response = client.execute(post);
List<String> gongshangxinxi = new ArrayList<String>();
String s = "";
if (response.getStatusLine().getStatusCode() == 200) {
    try {
        HttpEntity resEntity = response.getEntity();
        // Parse the page
        if (resEntity != null) {
            s = EntityUtils.toString(resEntity, "UTF-8");
            Document doc = Jsoup.parse(s);
            Elements content = doc.getElementsByClass("center");
            for (int i = 0; i < content.size(); i++) {
                String target = content.get(i).getElementsByClass("center").text();
                gongshangxinxi.add(target);
            }
        }
        // Build the crawl tasks (every other cell holds a registration number used as the search keyword)
        List<DataUrlCfgInfo> taskUrlCfgInfos = new ArrayList<>();
        for (int i = 0; i < gongshangxinxi.size(); i++) {
            if (i % 2 == 0) {
                taskUrlCfgInfos.add(new DataUrlCfgInfo("全國公示網", "CMB00004",
                        "http://www.sgs.gov.cn/lz/etpsInfo.do?method=doSearch&searchType=2&keyWords=" + gongshangxinxi.get(i),
                        "", "html", "defaultResoureProcsor", "企業信息"));
            }
        }
        // Insert the task list into the database
        DataColctnTaskInfo dataColctnTaskInfo = new DataColctnTaskInfo("工商信息獲取", "CMB00004",
                "在全國公示網抓取企業信息數據", taskUrlCfgInfos);
        getMongoTemplate().insert(dataColctnTaskInfo);
        logger.info("添加任務:" + dataColctnTaskInfo.getTaskCode() + "成功");
        EntityUtils.consume(resEntity);
    } finally {
        response.close();
    }
}

This code builds a POST request, adds a few form parameters, executes the request to obtain the page, parses the page, and inserts the assembled task information into the database.
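
To make the Jsoup extraction above more concrete, here is a small self-contained sketch. The HTML fragment is invented (the real page's markup is not reproduced in this post), but it illustrates how getElementsByClass("center") walks the matching cells, and why the task-building loop keeps only every other cell as a registration number.

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

public class CenterCellDemo {
    public static void main(String[] args) {
        // Hypothetical fragment of the result table; the real markup may differ.
        String html = "<table>"
                + "<tr><td class='center'>310000000000001</td><td class='center'>Example Co. Ltd</td></tr>"
                + "<tr><td class='center'>310000000000002</td><td class='center'>Another Co. Ltd</td></tr>"
                + "</table>";
        Document doc = Jsoup.parse(html);
        Elements cells = doc.getElementsByClass("center");
        for (int i = 0; i < cells.size(); i++) {
            // Cells alternate between registration number and a second column,
            // which is presumably why the loop above keeps only i % 2 == 0.
            System.out.println(i + " -> " + cells.get(i).text());
        }
    }
}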

The HttpClientUtil code is as follows:

private static DefaultHttpClient httpClient;

public static synchronized DefaultHttpClient getHttpClient() {
    if (null == httpClient) {
        // One-time initialisation
        try {
            KeyStore trustStore = KeyStore.getInstance(KeyStore.getDefaultType());
            trustStore.load(null, null);
            SSLSocketFactoryEx sf = new SSLSocketFactoryEx(trustStore);
            // Accept certificates from any host
            sf.setHostnameVerifier(SSLSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER);

            HttpParams params = new BasicHttpParams();
            HttpProtocolParams.setContentCharset(params, HTTP.DEFAULT_CONTENT_CHARSET);
            HttpProtocolParams.setUseExpectContinue(params, true);

            // Connection-manager timeout
            ConnManagerParams.setTimeout(params, 500000);

            // Register plain HTTP and the trust-all HTTPS scheme
            SchemeRegistry schReg = new SchemeRegistry();
            schReg.register(new Scheme("http", PlainSocketFactory.getSocketFactory(), 80));
            schReg.register(new Scheme("https", sf, 443));

            ThreadSafeClientConnManager conManager = new ThreadSafeClientConnManager(params, schReg);
            conManager.setMaxTotal(50);
            conManager.setDefaultMaxPerRoute(10);

            httpClient = new DefaultHttpClient(conManager, params);
            // Socket timeout
            httpClient.getParams().setParameter("http.socket.timeout", Integer.valueOf(500000));
        } catch (Exception e) {
            e.printStackTrace();
            return new DefaultHttpClient();
        }
    }
    return httpClient;
}

static class SSLSocketFactoryEx extends SSLSocketFactory {

    SSLContext sslContext = SSLContext.getInstance("TLS");

    public SSLSocketFactoryEx(KeyStore truststore)
            throws NoSuchAlgorithmException, KeyManagementException,
            KeyStoreException, UnrecoverableKeyException {
        super(truststore);

        // Trust manager that accepts every client and server certificate
        TrustManager tm = new X509TrustManager() {

            @Override
            public java.security.cert.X509Certificate[] getAcceptedIssuers() {
                return null;
            }

            @Override
            public void checkClientTrusted(
                    java.security.cert.X509Certificate[] chain, String authType)
                    throws java.security.cert.CertificateException {
            }

            @Override
            public void checkServerTrusted(
                    java.security.cert.X509Certificate[] chain, String authType)
                    throws java.security.cert.CertificateException {
            }
        };

        sslContext.init(null, new TrustManager[] { tm }, null);
    }

    @Override
    public Socket createSocket(Socket socket, String host, int port, boolean autoClose)
            throws IOException, UnknownHostException {
        return sslContext.getSocketFactory().createSocket(socket, host, port, autoClose);
    }

    @Override
    public Socket createSocket() throws IOException {
        return sslContext.getSocketFactory().createSocket();
    }
}

The main job of the HttpClientUtil class is to accept HTTPS connections from any host (trust-all SSL verification) and to apply some basic HttpClient configuration, such as timeouts and connection-pool sizes.
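
DefaultHttpClient, SSLSocketFactory and ThreadSafeClientConnManager were deprecated in later 4.x releases of HttpClient. For reference only, a roughly equivalent trust-all client built with the non-deprecated builder API might look like the sketch below (assuming HttpClient 4.4+ on the classpath; this is not the code the project used).

import javax.net.ssl.SSLContext;
import org.apache.http.conn.ssl.NoopHostnameVerifier;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.ssl.SSLContexts;

public class TrustAllClientSketch {
    public static CloseableHttpClient create() throws Exception {
        // Trust every certificate, mirroring the permissive X509TrustManager above
        SSLContext sslContext = SSLContexts.custom()
                .loadTrustMaterial(null, (chain, authType) -> true)
                .build();
        return HttpClients.custom()
                .setSSLSocketFactory(new SSLConnectionSocketFactory(sslContext, NoopHostnameVerifier.INSTANCE))
                .setMaxConnTotal(50)       // same pool sizes as the utility above
                .setMaxConnPerRoute(10)
                .build();
    }
}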

The follow-up code is as follows:

// Read the task information previously stored in the DB
List<DataUrlCfgInfo> dataUrlCfgInfos = dataColctnRequest.getDataUrlCfgInfos();
// For each task, wrap the single dataUrlCfgInfo in a request (loop body shown)
FutureColctdDataInfoGettingRequest request = new FutureColctdDataInfoGettingRequest(dataUrlCfgInfo);
request.putExtra("serialNo", serialNo);
futureTasks.add(request.getFutureTask());
// Hand the request to the crawler
getSpider().addRequest(new Request[]{request});

// Collect the crawl result (the business registration number)
ColctdDataInfo colctdDataInfo = futureTask.get(5, TimeUnit.MINUTES);
SimpleHttpClientTool httpClientTool = new SimpleHttpClientTool();
// Use the crawl result as the parameter for the next round of crawling
ColctdDataInfo secondLevelInfo = httpClientTool.getColctdDataInfo(colctdDataInfo);
colctdDataInfos.add(secondLevelInfo);
DataColctnCreateFiles.createFile(secondLevelInfo, null); // write the result to a file


HttpClient client = new HttpClient();
client.getHostConfiguration().setHost("www.sgs.gov.cn", 80, "http");
// Submit the registration number with a POST (getPostMethod is sketched after this block)
HttpMethod method = getPostMethod(colctdDataInfo.getContents());
client.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);
// Cookies are managed by the client state; collected here only for inspection
Cookie[] cookies = client.getState().getCookies();
StringBuffer tmpcookies = new StringBuffer();
for (Cookie c : cookies) {
    tmpcookies.append(c.toString() + ";");
}
// Setting the Referer header is very important
method.setRequestHeader("Referer", "http://www.sgs.gov.cn/lz/etpsInfo.do?method=doSearch");
method.setRequestHeader("User-Agent",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36");

int statusCode = client.executeMethod(method);
ColctdDataInfo colctdDataInfo1 = new ColctdDataInfo();

if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY) {
    // Pull the redirect target out of the response headers
    Header locationHeader = method.getResponseHeader("location");
    if (locationHeader != null) {
        System.out.println("SGSLogin:" + locationHeader.getValue());
    } else {
        System.err.println("Location field value is null.");
    }
    return null;
} else {
    try {
        String str = method.getResponseBodyAsString();
        List<String> list = new ArrayList<String>();
        list.add(str);
        colctdDataInfo1.setContents(list);
        colctdDataInfo1.setDataUrlCfgInfo(colctdDataInfo.getDataUrlCfgInfo());
        colctdDataInfo1.setVersionNo(colctdDataInfo.getVersionNo());
        colctdDataInfo1.setCreateDate(colctdDataInfo.getCreateDate());
        method.releaseConnection();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return colctdDataInfo1;
}

這段代碼的大體流程就是首先從DB中取到爬取任務的信息,而後循環取出來以後,使用爬蟲工具爬取到結果(也就是工商註冊號),再把爬取結果做爲參數傳遞給一個post請求,繼續進行爬取獲取到頁面信息,將頁面信息保存在文件中。

4. Fetching Postal-Code Address Information

   The preparation code (assembling the postal codes and inserting them into the DB) is as follows.

   Take Beijing as an example: its postal codes start with 10, the city-level part goes up to 26, and the district-level part is enumerated up to 99.

List<StringBuilder> shi = new ArrayList<StringBuilder>();
List<StringBuilder> location = new ArrayList<StringBuilder>();
// Looking at Beijing's postal codes, the city-level part goes up to 26
for (int j = 0; j < 27; j++) {
    StringBuilder sb = new StringBuilder();
    if (j < 10) {
        sb.append("10" + "0" + j);
    } else {
        sb.append("10" + j);
    }
    shi.add(sb);
}
// Enumerate the district-level part from 00 to 99 for full coverage
for (int i = 0; i < shi.size(); i++) {
    for (int j = 0; j < 100; j++) {
        StringBuilder sb = new StringBuilder();
        if (j < 10) {
            sb.append(shi.get(i) + "0" + j);
        } else {
            sb.append(shi.get(i).toString() + j);
        }
        location.add(sb);
    }
}
return location;
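
As a quick sanity check, the helper above (called getBeiJingPost() in the insertion code below) produces 27 x 100 = 2,700 candidate codes:

List<StringBuilder> codes = getBeiJingPost();
System.out.println(codes.size());                  // 2700
System.out.println(codes.get(0));                  // 100000
System.out.println(codes.get(codes.size() - 1));   // 102699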

Once the codes are assembled, they are inserted into the DB:

List<StringBuilder> quanguo = getBeiJingPost();
// Build the crawl tasks
List<DataUrlCfgInfo> taskUrlCfgInfos = new ArrayList<DataUrlCfgInfo>();
for (int i = 0; i < quanguo.size(); i++) {
    taskUrlCfgInfos.add(new DataUrlCfgInfo("中國郵政官網", "CMB0000801", quanguo.get(i).toString(),
            "", "html", "defaultResoureProcsor", "郵編信息"));
}
// Insert the task list into the DB
DataColctnTaskInfo dataColctnTaskInfo = new DataColctnTaskInfo("郵編地址信息", "CMB0000801",
        "在中國郵政官網抓取地址信息數據", taskUrlCfgInfos);
getMongoTemplate().insert(dataColctnTaskInfo);
logger.info("添加任務:" + dataColctnTaskInfo.getTaskCode() + "成功");

The follow-up part

Because of the captcha, I used a simple pixel-value matching approach. With roughly 400 labelled sample images, the recognition pass rate came out at a bit over 30 percent, which was acceptable.

The image-download code is as follows:

HttpClient httpClient = new HttpClient();
String checkcodeUrl = "http://www.cpdc.com.cn/web/api.php?op=checkcode&code_len=4&font_size=14&width=100&height=26&font_color=&background=";
for (int i = 0; i < 100; i++) {
    GetMethod getMethod = new GetMethod(checkcodeUrl);
    try {
        // Execute the GET request
        int statusCode = httpClient.executeMethod(getMethod);
        if (statusCode != HttpStatus.SC_OK) {
            System.err.println("Method failed: " + getMethod.getStatusLine());
            continue;
        }
        // Save the response body as a sample image
        InputStream inputStream = getMethod.getResponseBodyAsStream();
        OutputStream outStream = new FileOutputStream(new File(DOWNLOAD_DIR, i + ".png"));
        IOUtils.copy(inputStream, outStream);
        outStream.close();
        System.out.println("OK!");
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Release the connection
        getMethod.releaseConnection();
    }
}

After the images are downloaded, they are processed as follows:

File dir = new File(DOWNLOAD_DIR);
File[] files = dir.listFiles(new ImageFileFilter("png"));

int counter = 0;
for (File file : files) {
    BufferedImage image = ImageIO.read(file);
    // Strip the interference noise, then split the captcha into single characters
    removeInterference(image);
    List<BufferedImage> digitImageList = splitImage(image);
    for (int i = 0; i < digitImageList.size(); i++) {
        BufferedImage bi = digitImageList.get(i);
        ImageIO.write(bi, "PNG", new File(TRAIN_DIR, "temp_" + counter++ + ".png"));
    }
}
System.out.println("生成供比對的圖片完畢,請到目錄中手工識別並重命名圖片,並刪除其它無關圖片!");

The follow-up code is as follows:

ColctdDataInfo colctdDataInfo1 = new ColctdDataInfo();
String imageUrl = "http://www.cpdc.com.cn/web/api.php?op=checkcode&code_len=4&font_size=14&width=100&height=26&font_color=&background=&" + new Date().getTime();
HttpClient httpClient = new HttpClient(new MultiThreadedHttpConnectionManager());
httpClient.getParams().setParameter("http.protocol.content-charset", "utf-8");
httpClient.getParams().setContentCharset("utf-8");
httpClient.getParams().setSoTimeout(20000);

// Download the captcha image and work out the check code from it
ImageObject imageObject = getImage(imageUrl);
ImageProcess process = new ImageProcess();
String checkCode = process.getValidateCode(imageObject.getFile());
long time = new Date().getTime();
// pageNo comes from the enclosing method
String firstPageUrl = makeUrl(dataUrlCfgInfo.getUrl(), checkCode, pageNo, time);
GetMethod getMethod = new GetMethod(firstPageUrl);
// Replay the cookies from the captcha request so the check code matches the session
getMethod.setRequestHeader("Cookie", imageObject.getCookies().toString());
int statusCode = httpClient.executeMethod(getMethod);

JSONObject json = (JSONObject) JSON.parse(IOUtils.toString(getMethod.getResponseBodyAsStream()));

if (json.get("checkcode").equals(false)) {
    // The check code was rejected; give up on this attempt
    getMethod.releaseConnection();
} else {
    int pageCount = JSON.parseObject(json.getString("pageinfo")).getIntValue("TOTALPAGE");
    if (pageCount > 0) {
        for (int i = 1; i < pageCount + 1; i++) {
            String url = makeUrl(dataUrlCfgInfo.getUrl(), checkCode, i, time);
            GetMethod getMethod1 = new GetMethod(url);
            getMethod1.setRequestHeader("Cookie", imageObject.getCookies().toString());
            int statusCode1 = httpClient.executeMethod(getMethod1);
            JSONObject json1 = (JSONObject) JSON.parse(IOUtils.toString(getMethod1.getResponseBodyAsStream()));
            System.out.println("抓取郵編地址成功!" + json1);
            colctdDataInfo1 = makeColctdDataInfo(json1, dataUrlCfgInfo, serialNo);
            DataColctnCreateFiles.createFile(colctdDataInfo1, null); // write to a file
            getMethod1.releaseConnection();
        }
    } else if (pageCount == 0) {
        colctdDataInfo1 = makeColctdDataInfo(json, dataUrlCfgInfo, serialNo);
        DataColctnCreateFiles.createFile(colctdDataInfo1, null); // write to a file
    }
    getMethod.releaseConnection();
}
return colctdDataInfo1;

The main flow here: fetch the captcha image, work out the check code from it, pass the code along as a request parameter, keep the cookies obtained while fetching the captcha, then issue a GET request to retrieve the page data and save it to a file.
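
The getImage(...) helper and the ImageObject class are not shown either. The essential point is that the captcha download and the later data requests must share the same session, which is why the cookies returned with the image are kept and replayed in the Cookie header above. A sketch, with ImageObject assumed to be a simple file-plus-cookies holder:

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import org.apache.commons.httpclient.Cookie;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.io.IOUtils;

// Hypothetical reconstruction; the real helper is not shown in the post.
static ImageObject getImage(String imageUrl) throws IOException {
    HttpClient httpClient = new HttpClient();
    GetMethod get = new GetMethod(imageUrl);
    try {
        if (httpClient.executeMethod(get) != HttpStatus.SC_OK) {
            throw new IOException("captcha download failed: " + get.getStatusLine());
        }
        File file = File.createTempFile("checkcode", ".png");
        try (InputStream in = get.getResponseBodyAsStream();
             OutputStream out = new FileOutputStream(file)) {
            IOUtils.copy(in, out);
        }
        // The cookies negotiated while fetching the image identify the captcha session
        Cookie[] cookies = httpClient.getState().getCookies();
        return new ImageObject(file, cookies);  // assumed constructor
    } finally {
        get.releaseConnection();
    }
}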

5. Development Takeaways

Looking purely at the crawling side, the first step is always to analyse the request: is it a GET or a POST, and is it plain HTTP or HTTPS? The crawl itself then comes down to assembling the right parameters. One point worth mentioning is handling dynamically rendered pages. Because the requirements changed, the dynamic-rendering code was later removed from the project, yet it is the part I learned the most from: it used Selenium with WebDriver, creating a ChromeDriver, piecing together JavaScript and executing it so the page behaves as if driven by a user. I also briefly looked into HtmlUnit, which executes pages without a real browser and is also convenient; I plan to study HtmlUnit further later on.
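
As a reference for the dynamic-rendering approach mentioned above, a minimal Selenium + ChromeDriver sketch might look like this; the driver path and the target URL are placeholders, not values from the project.

import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;

public class DynamicPageDemo {
    public static void main(String[] args) {
        System.setProperty("webdriver.chrome.driver", "/path/to/chromedriver"); // placeholder path
        WebDriver driver = new ChromeDriver();
        try {
            driver.get("http://www.example.com/some-dynamic-page"); // placeholder URL
            // Piece together a JS snippet and execute it, then read back the rendered DOM
            // (in practice you would also wait for the page to finish re-rendering here)
            ((JavascriptExecutor) driver).executeScript("document.querySelector('form').submit();");
            String renderedHtml = driver.getPageSource();
            System.out.println(renderedHtml.length());
        } finally {
            driver.quit();
        }
    }
}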
