爬取網址:http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html
由於數據量比較大,若全部存儲爲單一 JSON 文件,會導致內存溢出。
因此按照每一個省(直轄市)分別進行存儲。
同時,由於數據需要通過遠程連接獲取,因此會將已經取得的網頁緩存到本地,以便下次使用。
package com.witwicky.jsoup;

import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.witwicky.vo.CrawlingVo;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;

/**
 * Crawls the 2018 administrative-division code pages and writes one JSON file
 * (plus a pretty-printed copy) per province into {@link #RESULT_SAVE_DIR}.
 *
 * <p>Every downloaded page is cached under {@link #BASE_SAVE_DIR}; a page that is
 * already cached is parsed from disk instead of being fetched again.
 */
public class Crawling {
    private static final String BASE_SAVE_DIR = "E:\\工做\\extract";
    private static final String RESULT_SAVE_DIR = "E:\\工做\\extract_result";

    /** Root that every page path passed to {@link #getElements} is appended to. */
    private static final String BASE_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/";

    /** Charset used both when writing files and when reading cached pages back. */
    private static final String CHARSET = "UTF-8";

    /** Row selectors for the levels below a province, outermost first; the last one is the leaf level. */
    private static final String[] LEVEL_SELECTORS = {
        "tr.citytr", "tr.countytr", "tr.towntr", "tr.villagetr"
    };

    private static final Random RANDOM = new Random();

    /**
     * Walks every province linked from {@code index.html}, crawls its sub-tree and
     * saves it as {@code <name>.json} and {@code <name>_pretty.json}.
     *
     * <p>Each file contains a JSON array holding only that province, so the tree of
     * a finished province can be garbage-collected before the next one starts.
     */
    public static void main(String[] args) throws Exception {
        Gson gsonPretty = new GsonBuilder().setPrettyPrinting().create();
        Gson gsonSimple = new GsonBuilder().create();

        Elements provinceLinks = getElements("index.html", "tr.provincetr > td > a");
        for (Element provinceLink : provinceLinks) {
            String provincePage = provinceLink.attr("href");
            String provinceName = provinceLink.text();

            List<CrawlingVo> cities = provincePage.isEmpty()
                    ? new ArrayList<CrawlingVo>()
                    : crawlLevel(provincePage, 0);

            // Keep the top-level JSON shape (an array) but limit it to the current province.
            List<CrawlingVo> provinceOnly = new ArrayList<CrawlingVo>();
            provinceOnly.add(new CrawlingVo(stripExtension(provincePage), provinceName, cities));

            save2File(gsonSimple.toJson(provinceOnly), provinceName + ".json", RESULT_SAVE_DIR);
            save2File(gsonPretty.toJson(provinceOnly), provinceName + "_pretty.json", RESULT_SAVE_DIR);
            System.out.println(provinceName + " is complete!");
        }
    }

    /**
     * Reads the rows of one page and recursively crawls the pages they link to.
     *
     * @param pagePath path of the page relative to {@link #BASE_URL}
     * @param level    index into {@link #LEVEL_SELECTORS} selecting the rows expected on this page
     * @return one entry per matching row, in page order; never {@code null}
     */
    private static List<CrawlingVo> crawlLevel(String pagePath, int level)
            throws IOException, InterruptedException {
        List<CrawlingVo> entries = new ArrayList<CrawlingVo>();
        boolean leafLevel = level == LEVEL_SELECTORS.length - 1;

        for (Element row : getElements(pagePath, LEVEL_SELECTORS[level])) {
            // Read the cell text itself rather than the anchor inside it, so that
            // rows without a link still get their code and name.
            String code = row.select("td:eq(0)").text();

            if (leafLevel) {
                // Leaf rows take their label from the third cell.
                entries.add(new CrawlingVo(code, row.select("td:eq(2)").text(), new ArrayList<CrawlingVo>()));
                continue;
            }

            String name = row.select("td:eq(1)").text();
            String href = row.select("td:eq(1) a").attr("href");
            // A row without a link has nothing below it to crawl.
            List<CrawlingVo> children = href.isEmpty()
                    ? new ArrayList<CrawlingVo>()
                    : crawlLevel(resolveRelative(pagePath, href), level + 1);
            entries.add(new CrawlingVo(code, name, children));
        }
        return entries;
    }

    /**
     * Resolves {@code href} against the directory part of {@code pagePath}.
     * For example {@code ("11/1101.html", "01/110101.html")} gives {@code "11/01/110101.html"}.
     */
    private static String resolveRelative(String pagePath, String href) {
        int lastSlash = pagePath.lastIndexOf('/');
        return lastSlash < 0 ? href : pagePath.substring(0, lastSlash + 1) + href;
    }

    /** Returns {@code fileName} up to (not including) its first dot, or unchanged if it has none. */
    private static String stripExtension(String fileName) {
        int dot = fileName.indexOf('.');
        return dot < 0 ? fileName : fileName.substring(0, dot);
    }

    /**
     * Loads a page (from the local cache if present, otherwise from the network)
     * and returns the elements matching {@code selector}.
     *
     * <p>Before a network fetch the thread sleeps 1-5 seconds; the fetched page is
     * then written to the cache.
     *
     * @param u        page path relative to {@link #BASE_URL}
     * @param selector jsoup CSS selector to apply to the page
     * @throws IOException          if the cached file cannot be read or the fetch fails
     * @throws InterruptedException if the thread is interrupted during the delay
     */
    private static Elements getElements(String u, String selector) throws IOException, InterruptedException {
        String url = BASE_URL + u;
        String cacheName = cleanName(url);

        File cached = new File(BASE_SAVE_DIR, cacheName);
        if (cached.exists()) {
            return Jsoup.parse(cached, CHARSET).select(selector);
        }

        Thread.sleep((RANDOM.nextInt(5) + 1) * 1000L);
        Document page = Jsoup.connect(url).get();
        save2File(page.toString(), cacheName, BASE_SAVE_DIR);
        return page.select(selector);
    }

    /** Turns a URL into a flat file name by replacing {@code \ / . :} with underscores. */
    private static String cleanName(String name) {
        return name
                .replace("\\", "_")
                .replace("/", "_")
                .replace(".", "_")
                .replace(":", "_");
    }

    /**
     * Writes {@code content} as UTF-8 to {@code saveDir/fileName}, creating the
     * directory if needed. An existing file is left untouched.
     *
     * <p>Failures are reported on standard error and do not propagate; a file that
     * could not be written completely is removed so it is not mistaken for a valid
     * cache entry later.
     */
    private static void save2File(String content, String fileName, String saveDir) {
        File dir = new File(saveDir);
        if (!dir.exists() && !dir.mkdirs() && !dir.isDirectory()) {
            System.err.println("Cannot create directory: " + dir);
            return;
        }

        File file = new File(dir, fileName);
        if (file.exists()) {
            return;
        }

        boolean written = false;
        OutputStream out = null;
        try {
            out = new BufferedOutputStream(new FileOutputStream(file));
            // Explicit charset: cached pages are read back as UTF-8 in getElements.
            out.write(content.getBytes(CHARSET));
            out.flush();
            written = true;
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (out != null) {
                try {
                    out.close();
                } catch (IOException e) {
                    written = false;
                    e.printStackTrace();
                }
            }
        }

        if (!written && file.exists() && !file.delete()) {
            System.err.println("Cannot remove incomplete file: " + file);
        }
    }
}
package com.witwicky.vo;

import java.util.List;

/**
 * One node of the crawled tree: a value, a display label and the nodes nested
 * beneath it.
 */
public class CrawlingVo {

    private String value;
    private String label;
    private List<CrawlingVo> children;

    /** Creates a node with all fields left {@code null}. */
    public CrawlingVo() {
    }

    /**
     * Creates a fully populated node.
     *
     * @param value    value stored for this node
     * @param label    label stored for this node
     * @param children nested nodes; the list is stored as given, not copied
     */
    public CrawlingVo(String value, String label, List<CrawlingVo> children) {
        this.value = value;
        this.label = label;
        this.children = children;
    }

    /** @return the value of this node */
    public String getValue() {
        return value;
    }

    /** @param value the new value of this node */
    public void setValue(String value) {
        this.value = value;
    }

    /** @return the label of this node */
    public String getLabel() {
        return label;
    }

    /** @param label the new label of this node */
    public void setLabel(String label) {
        this.label = label;
    }

    /** @return the list of nested nodes exactly as it was stored */
    public List<CrawlingVo> getChildren() {
        return children;
    }

    /** @param children the new list of nested nodes; stored as given, not copied */
    public void setChildren(List<CrawlingVo> children) {
        this.children = children;
    }
}