通過爬蟲爬取公共資源交易平臺(四川省)最近的招標信息
一:引入JSON的相關依賴
<dependency>
<groupId>net.sf.json-lib</groupId>
<artifactId>json-lib</artifactId>
<version>2.4</version>
<classifier>jdk15</classifier>
</dependency>
二:通過請求的url獲取URLConnection並連接服務器
package com.svse.pachong;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;

import org.apache.log4j.Logger;
/**
* 經過請求的url獲取URLConnection鏈接
* @author lenovo
* @date 2019年1月22日
* description:
*/
public class open_url_test {
public static Logger logger = Logger.getLogger(open_url_test.class);測試
public boolean openurl(String url_infor) throws Exception{
URL url = new URL(url_infor);
// 鏈接類的父類,抽象類
URLConnection urlConnection = url.openConnection();
// http的鏈接類
HttpURLConnection httpURLConnection = (HttpURLConnection) urlConnection;
/* 設定請求的方法,默認是GET(對於知識庫的附件服務器必須是GET,若是是POST會返回405。網站流程附件遷移功能裏面必須是POST,有所區分。)*/
httpURLConnection.setRequestMethod("GET");
// 設置字符編碼 httpURLConnection.setRequestProperty("Charset", "UTF-8");
// 打開到此 URL引用的資源的通訊連接(若是還沒有創建這樣的鏈接)。
int code = httpURLConnection.getResponseCode();
System.out.println("code:"+code); //鏈接成功 200
try {
InputStream inputStream = httpURLConnection.getInputStream();
System.out.println("鏈接成功");
logger.info("打開"+url_infor+"成功!");
return true;
}catch (Exception exception){
logger.info("打開"+url_infor+"失敗!");
return false;
}
}
}ui
三:通過爬取的url解析想要的數據,並以json的格式返回
package com.svse.pachong;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.Charset;
import net.sf.json.JSONException;
import net.sf.json.JSONObject;
/**
* 經過爬取的url解析想要的數據,並以json的格式返回
* @param urlString 須要爬取的網站url路徑
* @return 返回json結果的數據
* @throws IOException
* @throws JSONException
*/
public class readData {
    /**
     * Downloads the content at the given URL, decodes it as UTF-8 and parses
     * the resulting text as a JSON object.
     *
     * @param urlString URL of the resource to fetch
     * @return the JSON object parsed from the response text
     * @throws IOException if the URL cannot be opened or read
     * @throws JSONException if raised by the JSON parser
     */
    public static JSONObject readData(String urlString) throws IOException, JSONException {
        InputStream stream = new URL(urlString).openStream();
        try {
            InputStreamReader decoder = new InputStreamReader(stream, Charset.forName("UTF-8"));
            BufferedReader reader = new BufferedReader(decoder);
            StringBuilder text = new StringBuilder();
            // Drain the reader in chunks until end of stream.
            char[] chunk = new char[4096];
            for (int count = reader.read(chunk); count != -1; count = reader.read(chunk)) {
                text.append(chunk, 0, count);
            }
            return JSONObject.fromObject(text.toString());
        } finally {
            // Closing the underlying stream releases the connection.
            stream.close();
        }
    }
}
四:爬取入口
package com.svse.pachong;
import java.io.IOException;
import net.sf.json.JSONArray;
import net.sf.json.JSONException;
import net.sf.json.JSONObject;
/**
* 爬取的入口
* @author lenovo
* @date 2019年1月22日
* description:
*/
public class Main {
    // Listing endpoint to crawl. The query previously read "×=4×Start=×End=":
    // "&times" had been replaced by the multiplication sign (it is also an HTML
    // entity name), which removed the "&" separators and the parameter names.
    static String urlString = "http://www.scggzy.gov.cn/Info/GetInfoListNew?keywords=&times=4&timesStart=&timesEnd=&province=&area=&businessType=&informationType=&industryType=&page=1&parm=1534929604640";

    /** Upper bound on the number of entries printed. */
    private static final int MAX_RESULTS = 10;

    /**
     * Entry point: checks that the listing URL can be opened, downloads it as
     * JSON and prints selected fields of the first entries in its "data" array.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        open_url_test oUrl = new open_url_test();
        try {
            if (!oUrl.openurl(urlString)) {
                System.out.println("解析數據失敗!");
                return;
            }
            JSONObject ob = readData.readData(urlString);
            Object rawData = ob.get("data");
            if (rawData == null) {
                // No "data" entry in the response: nothing to list.
                System.out.println("解析數據失敗!");
                return;
            }
            // Parse the array text directly; stripping the outer brackets and
            // re-adding them leaves array text unchanged.
            JSONArray json2 = JSONArray.fromObject(rawData.toString());
            // Never index past the number of entries actually returned.
            int shown = Math.min(MAX_RESULTS, json2.size());
            for (int i = 0; i < shown; i++) {
                JSONObject jsonObject = json2.getJSONObject(i);
                System.out.println("--------------------------------------------");
                System.out.println("項目: " + jsonObject.get("Title"));
                System.out.println("時間: " + jsonObject.get("CreateDateStr"));
                System.out.println(jsonObject.get("TableName"));
                System.out.println(jsonObject.get("Link"));
                System.out.println(jsonObject.get("province") + " " + jsonObject.get("username")
                        + " " + jsonObject.get("businessType") + " " + jsonObject.get("NoticeType"));
            }
        } catch (Exception e) {
            // openurl declares Exception, so one handler covers the I/O and JSON
            // failures that were previously handled identically in three blocks.
            e.printStackTrace();
        }
    }
}
五:測試結果
至此,整個爬取的任務就結束了!