基於【 springBoot+jsoup】一 || 爬取全國行政區劃數據

1、代碼演示

若是中途中斷,可進行篩選,過濾掉已拉取的省份數據後再繼續執行。

/**
 * REST controller that crawls the administrative-division listing published under
 * {@code http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/} and stores every level
 * (province, city, district/county, street/town, community/village) through
 * {@link ProvinceService}.
 *
 * <p>The crawl is triggered by {@code GET /start} and runs synchronously inside the request:
 * each inserted row's id is passed down as the parent id of the next level.
 *
 * <p>Page downloads below the province list are retried a bounded number of times
 * ({@link #MAX_FETCH_ATTEMPTS}) with a pause in between; if a page still cannot be loaded the
 * crawl is aborted with an exception instead of spinning forever. Provinces that were already
 * stored by an earlier, interrupted run are skipped via {@link #isAlreadyCrawled(String)}.
 *
 * @author kevin
 * @createTime 2019-11-18 19:37
 */
@RestController
public class CityController {

    /** Upper bound on download attempts for a single page before the crawl is aborted. */
    private static final int MAX_FETCH_ATTEMPTS = 5;

    /** Pause between two download attempts of the same page, in milliseconds. */
    private static final long RETRY_DELAY_MILLIS = 1000L;

    @Autowired
    private ProvinceService provinceService;
    @Autowired
    private HttpUtil httpUtil;

    /**
     * Starts the crawl: loads the index page, follows the first year link found in the
     * {@code center_list_contlist} element, then walks every province listed on that page.
     *
     * <p>Row CSS classes per level: {@code provincetr}, {@code citytr}, {@code countytr},
     * {@code towntr}, {@code villagetr}.
     *
     * @return {@code "fail"} when the index page or the year page cannot be loaded or does not
     *         contain the expected elements; {@code "spider crawl end."} after a complete run
     * @throws Exception if a lower-level page cannot be fetched after all retries, or if the
     *                   crawl is interrupted while waiting between retries
     */
    @GetMapping("/start")
    public ResultTemplate<String> spider() throws Exception {
        String url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/";
        String charset = "gb2312";
        Document rootDoc = httpUtil.get(url, charset);
        if (rootDoc == null) {
            return of("fail");
        }

        Elements yearLists = rootDoc.getElementsByClass("center_list_contlist");
        if (yearLists.isEmpty()) {
            return of("fail");
        }
        Elements yearLinks = yearLists.get(0).select("a");
        if (yearLinks.isEmpty()) {
            return of("fail");
        }
        // Link of the first listed year, e.g.
        // http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html
        String yearHref = yearLinks.get(0).attr("href");

        Document doc = httpUtil.get(yearHref, charset);
        if (doc == null) {
            // httpUtil.get signals a failed download with null; without this check the
            // province lookup below would fail with a NullPointerException.
            return of("fail");
        }

        // Walk every province row; each row holds several province links.
        for (Element provinceRow : doc.getElementsByClass("provincetr")) {
            for (Element provinceLink : provinceRow.select("a")) {
                String name = provinceLink.text();
                if (isAlreadyCrawled(name)) {
                    System.out.println("未執行市:" + name);
                    continue;
                }
                // e.g. 11.html -> code "11"
                String provinceHref = provinceLink.attr("href");
                DicProvince province = new DicProvince()
                        .setProvinceName(name)
                        .setProvinceCode(codeFromHref(provinceHref))
                        .setCountryId(1196612453660643329L)
                        .setCreateDate(LocalDateTime.now())
                        .setCreateUserid(1L)
                        .setCreateUsername("admin");
                System.out.println("開始時間:" + LocalDateTime.now());
                System.out.println("省名稱:" + name);
                Long id = provinceService.insertProvince(province);
                // e.g. http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/11.html
                getCites(resolveSibling(yearHref, provinceHref), charset, id);
            }
        }
        return of("spider crawl end.");
    }

    /**
     * Stores every city listed on a province page and descends into each city's district page.
     *
     * @param url        absolute URL of the province page
     * @param charset    character set used to decode the page
     * @param provinceId id of the parent province row
     */
    private void getCites(String url, String charset, Long provinceId) throws Exception {
        Document page = fetchWithRetry(url, charset);
        for (Element rowElement : page.getElementsByClass("citytr")) {
            RowInfo row = parseRow(rowElement);
            DicCity city = new DicCity()
                    .setCityName(row.name)
                    .setCityCode(row.code)
                    .setProvinceId(provinceId)
                    .setCreateDate(LocalDateTime.now())
                    .setCreateUserid(1L)
                    .setCreateUsername("admin");
            Long id = provinceService.insertCity(city);
            if (row.childHref != null) {
                // e.g. 11/1101.html, relative to the province page
                getDistrict(resolveSibling(url, row.childHref), charset, id);
            }
        }
    }

    /**
     * Stores every district/county listed on a city page and descends into each district's
     * street page. Rows without links (reported as "市轄區" in the log output) have no child
     * page: they are stored from their cell text and not descended into.
     *
     * @param url     absolute URL of the city page
     * @param charset character set used to decode the page
     * @param idDis   id of the parent city row
     */
    private void getDistrict(String url, String charset, Long idDis) throws Exception {
        Document page = fetchWithRetry(url, charset);
        for (Element rowElement : page.getElementsByClass("countytr")) {
            RowInfo row = parseRow(rowElement);
            if (row.childHref == null) {
                System.out.println("市轄區");
            }
            DicDistrict district = new DicDistrict()
                    .setDistrictName(row.name)
                    .setDistrictCode(row.code)
                    .setCityId(idDis)
                    .setCreateDate(LocalDateTime.now())
                    .setCreateUserid(1L)
                    .setCreateUsername("admin");
            Long id = provinceService.insertDistrict(district);
            if (row.childHref == null) {
                System.out.println("執行完畢");
            } else {
                getStreet(resolveSibling(url, row.childHref), charset, id);
            }
        }
    }

    /**
     * Stores every street/town listed on a district page and descends into each street's
     * community page.
     *
     * @param url     absolute URL of the district page
     * @param charset character set used to decode the page
     * @param idStr   id of the parent district row
     */
    private void getStreet(String url, String charset, Long idStr) throws Exception {
        Document page = fetchWithRetry(url, charset);
        for (Element rowElement : page.getElementsByClass("towntr")) {
            RowInfo row = parseRow(rowElement);
            DicStreet street = new DicStreet()
                    .setStreetName(row.name)
                    .setStreetCode(row.code)
                    .setDistrictId(idStr)
                    .setCreateDate(LocalDateTime.now())
                    .setCreateUserid(1L)
                    .setCreateUsername("admin");
            Long id = provinceService.insertStreet(street);
            if (row.childHref != null) {
                getCommunity(resolveSibling(url, row.childHref), charset, id);
            }
        }
    }

    /**
     * Stores every community/village listed on a street page. This is the lowest level; rows
     * carry three cells: code, classification code and name.
     *
     * @param url     absolute URL of the street page
     * @param charset character set used to decode the page
     * @param idPro   id of the parent street row
     */
    private void getCommunity(String url, String charset, Long idPro) throws Exception {
        Document page = fetchWithRetry(url, charset);
        for (Element rowElement : page.getElementsByClass("villagetr")) {
            Elements cells = rowElement.select("td");
            DicCommunity community = new DicCommunity()
                    .setCommunityName(cells.get(2).text())
                    .setCommunityCode(cells.get(0).text())
                    .setClassificationCode(cells.get(1).text())
                    .setStreetId(idPro)
                    .setCreateDate(LocalDateTime.now())
                    .setCreateUserid(1L)
                    .setCreateUsername("admin");
            provinceService.insertCommunity(community);
        }
    }

    /**
     * Tells whether a province was already stored by an earlier run and must be skipped.
     * Edit this list when resuming an interrupted crawl.
     */
    private static boolean isAlreadyCrawled(String provinceName) {
        return "北京市".equals(provinceName)
                || "天津市".equals(provinceName)
                || "河北省".equals(provinceName);
    }

    /**
     * Downloads a page, retrying up to {@link #MAX_FETCH_ATTEMPTS} times with a pause between
     * attempts. A {@code null} result from {@link HttpUtil#get} counts as a failed attempt.
     *
     * @return the parsed page, never {@code null}
     * @throws java.io.IOException  if every attempt failed
     * @throws InterruptedException if the thread is interrupted while waiting between attempts
     */
    private Document fetchWithRetry(String url, String charset) throws Exception {
        for (int attempt = 1; attempt <= MAX_FETCH_ATTEMPTS; attempt++) {
            if (attempt >= 3) {
                System.out.println("循環次數:" + attempt);
            }
            try {
                Document page = httpUtil.get(url, charset);
                if (page != null) {
                    return page;
                }
            } catch (java.io.IOException e) {
                System.out.println("請求網頁連接報錯");
            }
            if (attempt < MAX_FETCH_ATTEMPTS) {
                // Back off instead of hammering the server in a tight loop.
                Thread.sleep(RETRY_DELAY_MILLIS);
            }
        }
        throw new java.io.IOException(
                "Failed to fetch " + url + " after " + MAX_FETCH_ATTEMPTS + " attempts");
    }

    /**
     * Extracts code, name and child link from a city/district/street row.
     *
     * <p>When the row has at least two links, the second link carries the name and its href
     * yields the code. Otherwise the row has no child page and code and name are read from
     * the first and second table cell (the code is then the full cell text rather than the
     * shorter code taken from a link).
     */
    private static RowInfo parseRow(Element row) {
        Elements links = row.select("a");
        if (links.size() >= 2) {
            Element nameLink = links.get(1);
            String href = nameLink.attr("href");
            return new RowInfo(codeFromHref(href), nameLink.text(), href);
        }
        Elements cells = row.select("td");
        return new RowInfo(cells.get(0).text(), cells.get(1).text(), null);
    }

    /** Returns the file name of a relative href without its extension, e.g. 11/1101.html -> 1101. */
    private static String codeFromHref(String href) {
        int start = href.lastIndexOf('/') + 1;
        int dot = href.indexOf('.', start);
        return dot < 0 ? href.substring(start) : href.substring(start, dot);
    }

    /** Resolves a relative href against the directory of the page it was found on. */
    private static String resolveSibling(String pageUrl, String href) {
        return pageUrl.substring(0, pageUrl.lastIndexOf('/') + 1) + href;
    }

    /** Code, name and optional child-page link parsed from one table row. */
    private static final class RowInfo {
        private final String code;
        private final String name;
        /** Relative link to the next level's page, or {@code null} when the row has none. */
        private final String childHref;

        private RowInfo(String code, String name, String childHref) {
            this.code = code;
            this.name = name;
            this.childHref = childHref;
        }
    }

}

 

2、HttpUtil 工具類

/**
 * Small HTTP helper that downloads a page with {@link HttpURLConnection} and parses it with
 * jsoup.
 *
 * @author kevin
 * @createTime 2019-11-20 9:17
 */
@Component
public class HttpUtil {

    /** Browser-like user agent sent with every request. */
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36";

    /** Connect and read timeout, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 80000;

    /**
     * Downloads {@code url} with a GET request and parses the response body into a document.
     *
     * @param url     absolute URL to fetch; also used as the base URI of the parsed document
     * @param charset character set used to decode the response body
     * @return the parsed document, or {@code null} when reading or parsing the response fails
     *         (callers rely on this null signal to decide whether to retry)
     * @throws IOException if the URL is malformed or the connection cannot be set up
     */
    public Document get(String url, String charset) throws IOException {
        HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
        try {
            connection.setRequestMethod("GET");
            // Caching is allowed by default; turn it off so every call hits the server.
            connection.setUseCaches(false);
            // Request headers.
            connection.addRequestProperty("Connection", "close");
            connection.addRequestProperty("user-agent", USER_AGENT);
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);

            // getInputStream() performs the request; close the stream even if parsing fails.
            try (java.io.InputStream body = connection.getInputStream()) {
                return Jsoup.parse(body, charset, url);
            } catch (Exception e) {
                // Keep the null-on-failure contract, but record why the page was rejected.
                System.out.println("parse error: " + url + " (" + e + ")");
                return null;
            }
        } finally {
            // Release the underlying socket; the crawler opens a very large number of pages.
            connection.disconnect();
        }
    }

}

 

3、service 部分,根據需要自行定義數據庫表

/**
 * {@link ProvinceService} implementation that stores each administrative level through its
 * own mapper.
 *
 * <p>Every insert is attempted up to {@link #MAX_INSERT_ATTEMPTS} times. An attempt counts as
 * successful when the mapper reports exactly one affected row. If all attempts fail, an
 * {@link IllegalStateException} carrying the last cause is thrown instead of retrying forever.
 *
 * @author kevin
 * @createTime 2019-11-18 20:41
 */
@Service
public class ProvinceServiceImpl implements ProvinceService {

    /** Upper bound on attempts for a single insert before giving up. */
    private static final int MAX_INSERT_ATTEMPTS = 3;

    @Autowired
    private ProvinceMapper provinceMapper;
    @Autowired
    private CityMapper cityMapper;
    @Autowired
    private DistrictMapper districtMapper;
    @Autowired
    private StreetMapper streetMapper;
    @Autowired
    private CommunityMapper communityMapper;

    /**
     * Inserts a province row.
     *
     * @return the id read from the entity after the insert (presumably populated by the
     *         mapper on insert; confirm against the mapper configuration)
     * @throws IllegalStateException if the insert did not succeed within the allowed attempts
     */
    @Override
    public Long insertProvince(DicProvince dicProvince) {
        insertWithRetry(() -> provinceMapper.insert(dicProvince), "插入省數據失敗");
        return dicProvince.getProvinceId();
    }

    /**
     * Inserts a city row.
     *
     * @return the id read from the entity after the insert
     * @throws IllegalStateException if the insert did not succeed within the allowed attempts
     */
    @Override
    public Long insertCity(DicCity dicCity) {
        insertWithRetry(() -> cityMapper.insert(dicCity), "插入市數據失敗");
        return dicCity.getCityId();
    }

    /**
     * Inserts a district/county row.
     *
     * @return the id read from the entity after the insert
     * @throws IllegalStateException if the insert did not succeed within the allowed attempts
     */
    @Override
    public Long insertDistrict(DicDistrict dicDistrict) {
        insertWithRetry(() -> districtMapper.insert(dicDistrict), "插入區縣數據失敗");
        return dicDistrict.getDistrictId();
    }

    /**
     * Inserts a street/town row.
     *
     * @return the id read from the entity after the insert
     * @throws IllegalStateException if the insert did not succeed within the allowed attempts
     */
    @Override
    public Long insertStreet(DicStreet dicStreet) {
        insertWithRetry(() -> streetMapper.insert(dicStreet), "插入街道數據失敗");
        return dicStreet.getStreetId();
    }

    /**
     * Inserts a community/village row.
     *
     * @return the id read from the entity after the insert
     * @throws IllegalStateException if the insert did not succeed within the allowed attempts
     */
    @Override
    public Long insertCommunity(DicCommunity dicCommunity) {
        insertWithRetry(() -> communityMapper.insert(dicCommunity), "插入社區數據失敗");
        return dicCommunity.getCommunityId();
    }

    /**
     * Runs one insert call until it reports exactly one affected row, at most
     * {@link #MAX_INSERT_ATTEMPTS} times.
     *
     * @param call           the mapper insert to execute
     * @param failureMessage message printed on each failed attempt and used in the final error
     * @throws IllegalStateException if no attempt succeeded; the cause is the last exception
     *                               thrown by the mapper, or {@code null} when the mapper only
     *                               reported an unexpected row count
     */
    private static void insertWithRetry(InsertCall call, String failureMessage) {
        Exception lastError = null;
        for (int attempt = 1; attempt <= MAX_INSERT_ATTEMPTS; attempt++) {
            try {
                if (call.execute() == 1) {
                    return;
                }
            } catch (Exception e) {
                lastError = e;
                System.out.println(failureMessage);
                e.printStackTrace();
            }
        }
        throw new IllegalStateException(
                failureMessage + " (" + MAX_INSERT_ATTEMPTS + " attempts)", lastError);
    }

    /** A mapper insert that returns the number of affected rows. */
    @FunctionalInterface
    private interface InsertCall {
        int execute() throws Exception;
    }

}
相關文章
相關標籤/搜索