A while ago I scraped JD.com's fresh-food pages, and I have been meaning to publish the code.
In brief: the product data is scraped with Selenium driving Chrome to render the dynamically generated pages, and the comments are loaded through JD's Ajax endpoint.
The details are explained below.
Required fields:
Product subcategory name (apples, oranges, etc.)
Product name (e.g. 煙臺紅富士蘋果 5kg 一級鉑金大果 單果230-320g 新鮮水果)
Total number of comments on the product
Positive-review rate
Star rating of each comment
Comment length
Number of likes on the comment
Number of replies to the comment
Comment text
Commenter's user level
Days between the comment being posted and the crawl (days)
For the subset of comments that carry follow-up reviews: the follow-up text, and the number of days between the follow-up and the initial review
Those are the requirements for this task.
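Of these, the first four (subcategory, name, total comment count, positive-review rate) are scraped from the Selenium-rendered product page; everything else comes out of the comments array of the Ajax response described below. For orientation, a single comment entry looks roughly like this, abbreviated to just the keys the scraper reads (the values here are made up):

comment = {
    'score': 5,                             # 評論星級
    'content': '蘋果很新鮮,個頭也大',        # 評論文本內容; its len() gives 評論長度
    'usefulVoteCount': 12,                  # 評論點贊數量
    'replyCount': 1,                        # 評論回覆數量
    'userLevelId': 105,                     # 評論者等級
    'creationTime': '2018-12-01 10:23:45',  # used for 評論發表距抓取的天數
    'afterUserComment': {                   # only present when there is a follow-up review
        'hAfterUserComment': {'content': '用了一週再來評價,依然滿意'},
        'created': '2018-12-08 09:00:00',   # used for 追評與初評相距時間
    },
}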
Most of the information on these pages is rendered dynamically, which is why Selenium is needed.
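As a minimal sketch of that idea (assuming chromedriver is on your PATH; the product URL and selector are the same ones the full script below uses):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Chrome()
wait = WebDriverWait(browser, 20)
browser.get('https://item.jd.com/3756271.html')
# a plain requests.get would miss this: the element only fills in after the JS runs
name = wait.until(EC.presence_of_element_located(
    (By.CSS_SELECTOR, 'div.itemInfo-wrap div.sku-name'))).text
print(name)
browser.quit()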
As you can see, to find the comment requests in DevTools you look not under the usual XHR filter but under JS; the requests whose names start with product carry the comment data:
Request URL: https://sclub.jd.com/comment/...
Request Method: GET
Status Code: 200
Remote Address: 117.148.129.129:443
Referrer Policy: no-referrer-when-downgrade
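Note that the response body is not bare JSON but JSONP: it is wrapped in the callback named in the query string, i.e. fetchJSON_comment98vv7490({...});. The script below simply slices off the first 26 and last 2 characters; as a sketch of a more tolerant alternative (my own variant, not what the original code does), a regex unwrap survives callback-name changes:

import json
import re

def strip_jsonp(text):
    # drop the "callbackName(" prefix and the ");" suffix, whatever the callback is called
    match = re.match(r'^[^(]*\((.*)\);?\s*$', text, re.S)
    return json.loads(match.group(1))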
Most of the parameters in this URL's query string are not actually required:
def make_url(baseurl, page=0, score=0, productId='3756271'):
    data1 = {
        'callback': 'fetchJSON_comment98vv7490',
        'productId': productId,
        'score': score,
        'sortType': '6',
        'page': page,
        'pageSize': '10',
        'isShadowSku': '0',
        # 'fold': '1',
    }
    url = baseurl + urlencode(data1)
    return url
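With the defaults, the function produces a URL like the following (on Python 3.7+, where dicts keep insertion order, the parameters come out in the order listed above):

baseurl = 'https://sclub.jd.com/comment/productPageComments.action?'
print(make_url(baseurl))
# https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv7490&productId=3756271&score=0&sortType=6&page=0&pageSize=10&isShadowSku=0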
The specifics are all reflected in the code.
Here comes the code, so hold on tight; if you'd rather not copy-paste, you can download it from my GitHub.
# https://www.jd.com/allSort.aspx
import requests
from pyquery import PyQuery as pq
from prettyprinter import cpprint
import json
from urllib.parse import urlencode
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time
import csv
import datetime
import sys


def get_ajax(url):
    headers = {
        'referer': 'https://item.jd.com/3756271.html',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
    }
    response = requests.get(url, headers=headers)
    # strip the JSONP wrapper: "fetchJSON_comment98vv7490(" is 26 chars, ");" is 2
    return json.loads(response.text[26:-2])


def make_url(baseurl, page=0, score=0, productId='3756271'):
    data1 = {
        'callback': 'fetchJSON_comment98vv7490',
        'productId': productId,
        'score': score,
        'sortType': '6',
        'page': page,
        'pageSize': '10',
        'isShadowSku': '0',
        # 'fold': '1',
    }
    url = baseurl + urlencode(data1)
    return url


def parse_json(rjson, url=None):
    for comment in rjson.get('comments'):
        item = {}
        item['url'] = url
        item['評論星級'] = comment.get('score')
        item['評論長度'] = len(comment.get('content'))
        item['評論點贊數量'] = comment.get('usefulVoteCount')
        item['評論回覆數量'] = comment.get('replyCount')
        item['評論文本內容'] = comment.get('content')
        item['評論者等級'] = comment.get('userLevelId')
        try:
            # days between the comment being posted and the crawl
            date1 = time.strptime(comment.get('creationTime'), "%Y-%m-%d %H:%M:%S")
            date2 = time.localtime(time.time())
            date1 = datetime.datetime(date1[0], date1[1], date1[2])
            date2 = datetime.datetime(date2[0], date2[1], date2[2])
            item['評論發表距抓取的天數(days)'] = str((date2 - date1).days)
        except Exception as error:
            print('error is >>>', error)
            item['評論發表距抓取的天數(days)'] = ''
        # 'afterUserComment' may be missing or null, so guard with `or {}`
        after = comment.get('afterUserComment') or {}
        after_content = (after.get('hAfterUserComment') or {}).get('content', '')
        if after_content == '此用戶未填寫評價內容':  # JD's placeholder for an empty follow-up
            item['追評文本內容'] = ''
        else:
            item['追評文本內容'] = after_content
        try:
            # days between the follow-up and the initial review
            date1 = time.strptime(after.get('created', ''), "%Y-%m-%d %H:%M:%S")
            date2 = time.strptime(comment.get('creationTime'), "%Y-%m-%d %H:%M:%S")
            date1 = datetime.datetime(date1[0], date1[1], date1[2])
            date2 = datetime.datetime(date2[0], date2[1], date2[2])
            item['追評與初評相距時間'] = str((date1 - date2).days)
        except Exception:
            item['追評與初評相距時間'] = ''
        if item['追評文本內容'] == '':
            item['追評與初評相距時間'] = ''
        yield item


def save_csv_merinfo(item):
    with open(FILENAME_MER, 'a', encoding=ENCODING, newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames_merinfo)
        # writer.writeheader()
        writer.writerow(item)


def save_csv_cominfo(item):
    with open(FILENAME_COM, 'a', encoding=ENCODING, newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames_cominfo)
        # writer.writeheader()
        writer.writerow(item)


def get_page(url):
    browser.get(url)
    submit = wait.until(EC.presence_of_element_located(
        (By.XPATH, '//div[contains(@class,"tab-main")]/ul/li[5]')))
    time.sleep(2)
    for i in range(30):  # scroll down in small steps so lazy-loaded parts render
        browser.execute_script("window.scrollBy(0,50)")
        time.sleep(0.1)
    submit.click()  # open the comment tab
    time.sleep(3)
    return browser.page_source


def parse_page(html, url):
    page_item = {}
    doc = pq(html, parser='html')
    page_item['url'] = url
    page_item['商品小分類名稱'] = doc('#crumb-wrap > div > div.crumb.fl.clearfix > div:nth-child(5) > a').text()
    page_item['商品名稱'] = doc('div.itemInfo-wrap div.sku-name').text()
    page_item['商品總評論數量'] = doc('#detail > div.tab-main.large > ul > li.current > s').text().replace('(', '').replace(')', '')
    page_item['商品好評率'] = doc('#comment > div.mc > div.comment-info.J-comment-info > div.comment-percent > div').text()
    return page_item


def csv_create():
    with open(FILENAME_MER, 'w', encoding=ENCODING, newline='') as f:
        writer = csv.writer(f)
        writer.writerow(fieldnames_merinfo)
    with open(FILENAME_COM, 'w', encoding=ENCODING, newline='') as f:
        writer = csv.writer(f)
        writer.writerow(fieldnames_cominfo)


def crawl_all_page_url():
    global ALL_PAGE_URL
    browser = webdriver.Chrome()
    wait = WebDriverWait(browser, 20)
    browser.get('https://www.jd.com/allSort.aspx')
    wait.until(EC.presence_of_element_located(
        (By.XPATH, '/html/body/div[5]/div[2]/div[1]/div[2]/div[2]/div[9]/div[2]/div[3]')))
    CASE = []
    for i in range(10):  # fruit (水果)
        initcase = '/html/body/div[5]/div[2]/div[1]/div[2]/div[2]/div[9]/div[2]/div[3]/dl[2]/dd/a[{}]'.format(i + 1)
        CASE.append(initcase)
    for i in range(4):  # pork and lamb (豬肉羊肉)
        initcase = '/html/body/div[5]/div[2]/div[1]/div[2]/div[2]/div[9]/div[2]/div[3]/dl[3]/dd/a[{}]'.format(i + 1)
        CASE.append(initcase)
    for i in range(8):  # seafood (海鮮水產)
        initcase = '/html/body/div[5]/div[2]/div[1]/div[2]/div[2]/div[9]/div[2]/div[3]/dl[4]/dd/a[{}]'.format(i + 1)
        CASE.append(initcase)
    for i in range(4):  # poultry and eggs (禽肉蛋白)
        initcase = '/html/body/div[5]/div[2]/div[1]/div[2]/div[2]/div[9]/div[2]/div[3]/dl[5]/dd/a[{}]'.format(i + 1)
        CASE.append(initcase)
    for i in range(6):  # frozen food (冷凍食品)
        initcase = '/html/body/div[5]/div[2]/div[1]/div[2]/div[2]/div[9]/div[2]/div[3]/dl[6]/dd/a[{}]'.format(i + 1)
        CASE.append(initcase)
    # highly extensible: only the range() bound and the dl[] index need changing
    for case in CASE:
        print('>>>>>>>>>')
        submit = wait.until(EC.element_to_be_clickable((By.XPATH, case)))
        submit.click()
        print(browser.current_url)
        handle = browser.current_window_handle
        handles = browser.window_handles
        for newhandle in handles:
            if newhandle != handle:
                browser.switch_to.window(newhandle)
                time.sleep(1.5)
                wait.until(EC.presence_of_element_located(
                    (By.XPATH, '//div[@id="plist"]/ul[contains(@class,"gl-warp")]')))
                doc = pq(browser.page_source, parser='html')
                for li in list(doc('div#plist ul.gl-warp li').items())[:10]:
                    res = 'https:' + str(li('div div.p-commit-n strong a').attr('href')).replace('#comment', '')
                    print(res)
                    ALL_PAGE_URL.append(res)
                time.sleep(1.5)
                browser.close()
                browser.switch_to.window(handle)


def load_all_page_url():
    global ALL_PAGE_URL
    with open(FILENAME_CACHE, 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        for item in reader:
            ALL_PAGE_URL.append(item[0])


if __name__ == '__main__':
    # setup >>>>>>>>>>
    browser = webdriver.Chrome()  # selenium-driven Chrome
    wait = WebDriverWait(browser, 20)
    MAXINDEX = 7  # max comment pages to request per score; to cap at roughly 500 comments set it to about 35, which yields slightly over 500 (the comment list is not infinitely paged)
    # user configuration ********************************
    TIMESLEEP = 2  # sleep interval between requests
    FILENAME_MER = 'merinfo_test.csv'  # product info output file
    FILENAME_COM = 'cominfo_test.csv'  # comment info output file
    FILENAME_CACHE = 'cache.csv'
    ENCODING = 'UTF-8'  # encoding of the saved CSVs
    # ***************************************************
    # CSV field names
    fieldnames_merinfo = ['url', '商品小分類名稱', '商品名稱', '商品總評論數量', '商品好評率']
    fieldnames_cominfo = ['url', '評論星級', '評論長度', '評論點贊數量', '評論回覆數量', '評論文本內容',
                          '評論者等級', '評論發表距抓取的天數(days)', '追評文本內容', '追評與初評相距時間']
    # <<<<<<<<<<<<<<<<<
    start = time.time()
    # csv_create()  # reset the output files

    # dedup: collect the urls that have already been saved
    URLSET = []
    with open(FILENAME_MER, 'r', encoding=ENCODING) as f:
        reader = csv.reader(f)
        for res in reader:
            URLSET.append(res[0])
    print('URLSET is', URLSET)

    # gather the product page urls
    ALL_PAGE_URL = []
    load_all_page_url()  # pick exactly one of these two: load_all_page_url reads the local cache.csv, which is faster, works offline and uses no network
    # crawl_all_page_url()

    for page_url in ALL_PAGE_URL:
        if page_url not in URLSET:
            URLSET.append(page_url)  # dedup on the fly
            try:
                html = get_page(page_url)  # fetch the page, rendered by selenium
                item_mer = parse_page(html, url=page_url)  # parse it with pyquery
                cpprint(item_mer)
                # fetch the comments via ajax
                Flag = 0  # counter
                ITEMS = []
                baseurl = 'https://sclub.jd.com/comment/productPageComments.action?'
                for score in [5, 3, 2, 1]:  # 0 = all comments, 5 = follow-ups, 3 = good, 2 = medium, 1 = bad
                    if score == 5:
                        MAXINDEX_TEMP = MAXINDEX
                    else:
                        MAXINDEX_TEMP = int(MAXINDEX / 7)  # keep the ratio at 7:1:1:1
                    for index in range(MAXINDEX_TEMP):
                        time.sleep(TIMESLEEP)
                        url = make_url(baseurl, page=index, score=score,
                                       productId=''.join(list(filter(str.isdigit, page_url))))  # build the url
                        try:
                            json_ = get_ajax(url)  # issue the ajax request
                            if len(json_.get('comments')) != 0:
                                for item in parse_json(json_, url=page_url):  # parse the json
                                    cpprint(item)
                                    ITEMS.append(item)
                                    Flag += 1
                            else:
                                break
                        except Exception as error:
                            print('ajax request failed: {}>>>'.format(error))
                            print('url is {}'.format(url))
                            print(str(datetime.datetime.now()))
                            sys.exit(0)  # exit on ajax errors to keep the saved data consistent
                # once a page's product info and comments are both scraped, save them
                save_csv_merinfo(item_mer)  # save the product info
                for item in ITEMS:  # save the comments
                    try:
                        save_csv_cominfo(item)
                    except Exception as error:
                        print(error)
                print('saved {} comments'.format(Flag))
            except Exception as error:
                print('page request failed: {}>>>'.format(error))
            print('finished one product page >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
            # time.sleep(TIMESLEEP)
    end = time.time()
    print('total time: {} seconds'.format(end - start))
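One note on the cache: load_all_page_url reads cache.csv, but nothing in the script above writes that file. A small hypothetical helper (not part of the original) that saves whatever crawl_all_page_url collected, in the one-url-per-row format load_all_page_url expects:

def save_all_page_url():
    # write every collected url into cache.csv, one per row
    with open(FILENAME_CACHE, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        for page_url in ALL_PAGE_URL:
            writer.writerow([page_url])

Call it once right after crawl_all_page_url(), and later runs can switch to load_all_page_url() and stay offline.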
I did write a fair number of comments in the code; if anything is unclear, ask in the comments below. That's it for now!