# -*- coding: utf-8 -*-
# @Time    : 2018/12/27 17:06
# @Author  : zhangxinxin
# Thanks to Xiaoliu, Xiaogou and Xiaopang for their help
# @Email   : 778786617@qq.com
# @Software: PyCharm
"""Scrape one page of JD.com phone search results into a CSV file.

Product name, price and shop name are parsed from the search-result HTML.
Comment counts are rendered client-side by JavaScript, so they are fetched
separately from the comment-summary JSONP endpoint.
"""
import csv
import json

import requests
from lxml import etree

# Sample search URLs (the `page` and `s` query parameters drive pagination):
# https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&wq=%E6%89%8B%E6%9C%BA&pvid=3d144eda1ae5453e80e1debdea954230
# https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&page=1&s=1&click=0
# https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&page=3&s=60&click=0


class JdMessage(object):
    """Fetch JD phone listings plus their comment counts and save them as CSV."""

    def __init__(self):
        # JSONP endpoint returning comment counts for a fixed list of SKU ids
        # (the ids here match the products on the scraped search page).
        self.js_url = 'https://club.jd.com/comment/productCommentSummaries.action?referenceIds=7652027,5089253,5853579,100001172674,100000822981,7437780,7081550,7321794,100002338246,8895275,7694047,7421462,100000727128,5089275,5089267,8735304,7437564,100000503295,8514651,100000773889,7920226,100001906474,6735790,100000971366,7651931,100000982034,8790545,8051124,7479810,1861102&callback=jQuery3699627&_=1545911372294'
        # self.url = 'https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&page=1&s=1&click=0'
        self.url = 'https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&wq=%E6%89%8B%E6%9C%BA&pvid=3d144eda1ae5453e80e1debdea954230'
        self.index_url = 'https://www.jd.com/'
        self.html = ''            # parsed lxml tree of the search page
        self.js_content = ''      # requests.Response of the JSONP call, later the raw payload
        self.content_list = []    # rows of (name, price, comment count, shop)
        self.headers = {
            # 'authority': 'search.jd.com',
            'method': 'GET',
            'path': '/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&page=3&s=59&click=0',
            'scheme': 'https',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
            'x-requested-with': 'XMLHttpRequest',
            # NOTE: placeholder — replace with a real JD session cookie before running.
            'cookie': '您的cookie'
        }

    def get_html(self):
        """Download the search page and the comment-count JSONP response.

        On success, ``self.html`` holds the parsed page tree and
        ``self.js_content`` holds the JSONP response; network errors are
        caught (narrowly) and printed, leaving the attributes untouched.
        """
        try:
            page = requests.get(self.url, headers=self.headers)
            page.raise_for_status()
            self.js_content = requests.get(self.js_url, headers=self.headers)
            self.js_content.raise_for_status()
            self.html = etree.HTML(page.content.decode(encoding='utf-8'))
        except requests.RequestException as e:
            print(e)

    def parse_html(self):
        """Extract name/price/shop from the HTML and comment counts from the JSONP reply."""
        p_name = self.html.xpath('//li/div[@class="gl-i-wrap"]/div[@class="p-name p-name-type-2"]/a/@title')
        # Comment counts are loaded by JavaScript, so they are absent from the
        # page source; they come from the separate self.js_url request instead.
        p_price = self.html.xpath('//li/div[@class="gl-i-wrap"]/div[@class="p-price"]/strong/i/text()')
        p_shop = self.html.xpath('//div[@class="p-shop"]/span/a/text()')
        # Strip the JSONP wrapper "jQueryXXXX( ... )" down to the JSON payload.
        # split('(', 1) / rsplit(')', 1) tolerate parentheses inside the JSON
        # itself, unlike splitting on every occurrence.
        self.js_content = self.js_content.text.split('(', 1)[-1].rsplit(')', 1)[0]
        jd_json = json.loads(self.js_content)
        comment_list = [data['CommentCountStr'] for data in jd_json['CommentsCount']]
        # Materialize as a list (zip() alone would be a one-shot iterator).
        self.content_list = list(zip(p_name, p_price, comment_list, p_shop))

    def save_csv(self):
        """Write the header and the collected rows to JD_phone.csv.

        The file is opened exactly once; the context manager flushes and
        closes it deterministically, so no sleep is needed after writing.
        utf-8-sig keeps the Chinese text readable when opened in Excel.
        """
        with open('JD_phone.csv', 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.writer(f)
            writer.writerow(['商品名', '價格', '評論數', '店鋪名'])
            writer.writerows(self.content_list)
        print('信息寫入成功')

    def run(self):
        """Fetch, parse, and persist one page of results."""
        self.get_html()
        self.parse_html()
        self.save_csv()


if __name__ == '__main__':
    s = JdMessage()
    s.run()
結果圖:
爬取內容有限,但時間已晚,剩餘代碼之後抽時間寫。