在平常開發工做中,有時候咱們須要去一些網站上抓取數據,要想抓取數據,就必須先了解網頁結構,根據具體的網頁結構,編寫對應的程序對數據進行採集。最近恰好有一個需求,須要更新收貨地址。因爲系統現有的收貨地址是很早之前的數據了,用戶在使用的過程當中反映找不到用戶所在地的地址信息,所以對現有地址數據的更新也就提上了日程。html
經過查找,最終找到了中華人民共和國國家統計局官網上有須要的地址數據,官方渠道,數據的準確性、完整性都有保障。本文采用國家統計局截止到2016年7月31日公佈的數據(目前最新的數據)爲例子進行演示,有關頁面結構的分析我這裏就很少說了(開發基本技能),直接上代碼吧,並附上完整的DEMO。java
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <groupId>crawler</groupId> <artifactId>crawler</artifactId> <version>0.0.1-SNAPSHOT</version> <build> <sourceDirectory>src</sourceDirectory> <plugins> <plugin> <artifactId>maven-compiler-plugin</artifactId> <version>3.5.1</version> <configuration> <source>1.6</source> <target>1.6</target> </configuration> </plugin> </plugins> </build> <dependencies> <dependency> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> <version>1.7.2</version> </dependency> <dependency> <groupId>net.sf.json-lib</groupId> <artifactId>json-lib</artifactId> <version>2.4</version> <classifier>jdk13</classifier><!--指定jdk版本--> </dependency> </dependencies> </project>
package crawler; import java.util.List; public class Location { String code; String name; List<Location> children; public Location(){ } public Location(String code, String name){ this.code = code; this.name = name; } public String getCode() { return code; } public void setCode(String code) { this.code = code; } public String getName() { return name; } public void setName(String name) { this.name = name; } public List<Location> getChildren() { return children; } public void setChildren(List<Location> children) { this.children = children; } }
package crawler; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import net.sf.json.JSONObject; public class TestMain { public static void main(String[] args) throws IOException { Document doc = Jsoup.connect("http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/201703/t20170310_1471429.html").get(); Element masthead = doc.select("div.xilan_con").first(). getElementsByClass("TRS_Editor").first(). getElementsByClass("TRS_PreAppend").first(); Elements allElements = masthead.getElementsByTag("p"); List<Location> provinceList = new ArrayList<Location>(); Location province = null; Location city = null; for(Element element : allElements){ String html = element.select("span[lang]").first().html(); String locationCode = TestMain.getLocationCode(html); String locationName = element.select("span[style]").last().html(); if(locationCode.endsWith("0000")){ //省或直轄市 province = new Location(locationCode, locationName); province.setChildren(new ArrayList<Location>()); provinceList.add(province); }else if(locationCode.endsWith("00")){ //市 city = new Location(locationCode, locationName); city.setChildren(new ArrayList<Location>()); province.getChildren().add(city); }else{ //縣或區 Location county = new Location(locationCode, locationName); city.getChildren().add(county); } } Location root = new Location("0", "root"); root.setChildren(provinceList); JSONObject jsonObj = JSONObject.fromObject(root); System.out.println(jsonObj.toString()); } public static String getLocationCode(String html){ String regEx="[^0-9]"; Pattern p = Pattern.compile(regEx); Matcher m = p.matcher(html); return m.replaceAll("").trim(); } }
歡迎轉載,轉載必須標明出處 node