The fields kept after crawling are "標題" (title), "樓盤名稱" (community name), and "地址" (address), all extracted from the page HTML.
The listing pages follow this URL template, where {} is the page number:
https://suzhou.anjuke.com/sale/p{}
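As a quick illustration (matching the get_url_list method in the code below), the {} placeholder is filled in with str.format:

# Build the first two page URLs from the template, as get_url_list does below.
url_temp = "https://suzhou.anjuke.com/sale/p{}"
urls = [url_temp.format(i) for i in range(1, 3)]
print(urls)  # ['https://suzhou.anjuke.com/sale/p1', 'https://suzhou.anjuke.com/sale/p2']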
import csv

import requests
from lxml import etree


class Anjuke():
    def __init__(self):
        # URL template for the listing pages; {} is the page number
        self.url_temp = "https://suzhou.anjuke.com/sale/p{}"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}

    def get_url_list(self):
        # Crawl pages 1 and 2; widen the range to crawl more pages
        return [self.url_temp.format(i) for i in range(1, 3)]

    def parse_url(self, url):
        # Request a listing page and return the decoded HTML
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    def get_content_list(self, html_str):
        # Extract title, community name and address from every listing <li>
        html = etree.HTML(html_str)
        content_list = []
        div_list = html.xpath('//ul[@id="houselist-mod-new"]/li')
        for div in div_list:
            item = {}
            item["標題"] = div.xpath('.//div[@class="house-title"]/a/text()')
            item["標題"] = item["標題"][0].strip()
            # The comm-address text contains "<community>\xa0...\xa0<address>"
            item["樓盤名稱"] = div.xpath(
                './/div[@class="details-item"]/span[@class="comm-address"]/text()')
            item["樓盤名稱"] = item["樓盤名稱"][0].split("\xa0")[0].strip()
            item["地址"] = div.xpath(
                './/div[@class="details-item"]/span[@class="comm-address"]/text()')
            item["地址"] = item["地址"][0].split("\xa0")[-1].strip()
            content_list.append(item)
        return content_list

    def save_content_list(self, content_list):
        # Write all records to 信息.csv in one pass
        headers = ["標題", "樓盤名稱", "地址"]
        with open("信息.csv", "w", encoding="utf-8-sig", newline="") as fp:
            writer = csv.DictWriter(fp, headers)
            writer.writeheader()
            writer.writerows(content_list)
        # Debug: uncomment to print each title
        # for i in content_list:
        #     print(i["標題"])

    def run(self):
        # Collect items from every page, then save once so later pages
        # do not overwrite earlier ones
        url_list = self.get_url_list()
        content_list = []
        for url in url_list:
            html_str = self.parse_url(url)
            content_list.extend(self.get_content_list(html_str))
        self.save_content_list(content_list)


if __name__ == '__main__':
    anjuke = Anjuke()
    anjuke.run()
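To check the result, here is a minimal sketch, assuming the crawler above has run and written 信息.csv to the working directory; it reads the file back with csv.DictReader:

import csv

# Read 信息.csv back and print every saved record.
with open("信息.csv", encoding="utf-8-sig", newline="") as fp:
    for row in csv.DictReader(fp):
        print(row["標題"], row["樓盤名稱"], row["地址"])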