Here we need to import the requests library, the BeautifulSoup library, and the lxml library.
lxml is a parsing library for Python that supports both HTML and XML, offers XPath-based parsing, and is very efficient.
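As a quick illustration of the XPath support mentioned above, here is a minimal sketch (the HTML fragment is made up for the example):

import lxml.etree as etree

# Parse a small HTML fragment and pull out attribute values with XPath
doc = etree.HTML('<ul class="carlist"><li><a href="/detail/1">car A</a></li></ul>')
hrefs = doc.xpath('//ul[@class="carlist"]/li/a/@href')
print(hrefs)  # ['/detail/1']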
All of these libraries can be added through PyCharm's Python interpreter settings (or from the command line with pip install requests beautifulsoup4 lxml).
Open the target page and copy the request headers from the browser's developer tools.
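For example, after copying the User-Agent and Cookie values from the Network panel, they can be passed to requests like this (the header values below are placeholders; substitute your own):

import requests

# Values copied from the browser's developer tools; these are placeholders
headers = {
    'User-Agent': '...copied User-Agent string...',
    'Cookie': '...copied Cookie string...',
}
resp = requests.get('https://www.guazi.com/zhangzhou/buy/', headers=headers)
print(resp.status_code)  # 200 means the server accepted the request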
Below is the finished code; before running it, you will need to replace the request headers with your own.
import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
    'Cookie': 'antipas=145M204676TL713539C97; uuid=db5afeb9-3098-4470-89bb-1dc454285b62; cityDomain=zhangzhou; clueSourceCode=%2A%2300; user_city_id=80; ganji_uuid=6283474678349984657438; sessionid=39d4b3ef-4a48-49c7-da02-9255c73b1379; lg=1; Hm_lvt_bf3ee5b290ce731c7a4ce7a617256354=1606212665; cainfo=%7B%22ca_a%22%3A%22-%22%2C%22ca_b%22%3A%22-%22%2C%22ca_s%22%3A%22self%22%2C%22ca_n%22%3A%22self%22%2C%22ca_medium%22%3A%22-%22%2C%22ca_term%22%3A%22-%22%2C%22ca_content%22%3A%22-%22%2C%22ca_campaign%22%3A%22-%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22scode%22%3A%22-%22%2C%22keyword%22%3A%22-%22%2C%22ca_keywordid%22%3A%22-%22%2C%22display_finance_flag%22%3A%22-%22%2C%22platform%22%3A%221%22%2C%22version%22%3A1%2C%22client_ab%22%3A%22-%22%2C%22guid%22%3A%22db5afeb9-3098-4470-89bb-1dc454285b62%22%2C%22ca_city%22%3A%22zhangzhou%22%2C%22sessionid%22%3A%2239d4b3ef-4a48-49c7-da02-9255c73b1379%22%7D; preTime=%7B%22last%22%3A1606212670%2C%22this%22%3A1606212663%2C%22pre%22%3A1606212663%7D; Hm_lpvt_bf3ee5b290ce731c7a4ce7a617256354=1606212670'
}


# Collect the URL of every detail page on the listing page
def get_detail_urls(url):
    resp = requests.get(url, headers=headers)
    html = resp.content.decode('utf-8')
    soup = BeautifulSoup(html, 'lxml')
    lis = soup.find('ul', class_='carlist').find_all('li')
    detail_urls = []
    for li in lis:
        detail_url = 'https://www.guazi.com' + li.find('a')['href']
        detail_urls.append(detail_url)
    return detail_urls


# Parse one detail page and append a CSV row to the open file
def parse_detail_url(url, f):
    resp = requests.get(url, headers=headers)
    html = resp.content.decode('utf-8')
    soup = BeautifulSoup(html, 'lxml')
    # Car name
    name = ''.join(soup.find('div', class_='product-textbox').find('h1').stripped_strings)
    # Registration date (rendered as an image, so we store the image URL)
    time = soup.find('div', class_='product-textbox').find('ul', class_='assort').find('li', class_='one').find('img')['src']
    # Displayed mileage
    course = list(soup.find('div', class_='product-textbox').find('ul', class_='assort').find('li', class_='two').stripped_strings)
    # Engine displacement
    number = list(soup.find('div', class_='product-textbox').find('ul', class_='assort').find('li', class_='three').stripped_strings)
    # Gearbox
    speed = list(soup.find('div', class_='product-textbox').find('ul', class_='assort').find('li', class_='last').stripped_strings)
    # Full purchase price
    price = list(soup.find('div', class_='product-textbox').find('div', class_='pricebox').find('div', class_='price-main').stripped_strings)
    print(price)  # progress output
    f.write('{},{},{},{},{},{}\n'.format(name, time, ''.join(course), ''.join(number), ''.join(speed), ''.join(price)))


def main():
    base_url = 'https://www.guazi.com/zhangzhou/buy/'
    detail_urls = get_detail_urls(base_url)
    with open('guazi.csv', 'a', encoding='utf-8') as f:
        for detail_url in detail_urls:
            parse_detail_url(detail_url, f)


if __name__ == '__main__':
    main()
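Each run appends one row per car to guazi.csv. Here is a minimal sketch for reading the results back with the standard csv module:

import csv

with open('guazi.csv', encoding='utf-8') as f:
    for row in csv.reader(f):
        # Columns: name, registration-date image URL, mileage, displacement, gearbox, price
        print(row)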