I had always used the requests library for scraping; this time I tried Selenium instead. It is not very efficient, but it does not run into anti-scraping restrictions. This article combines an expert's Selenium crawler with Excel append-write operations. There is still room for optimization; I plan to scrape more information and then run a word-cloud analysis.
# coding=utf-8
'''
Scrape JD product listings:
    request URL:
        https://www.jd.com/
    fields to extract:
        1. product detail-page link
        2. product name
        3. product price
        4. number of reviews
        5. seller
'''
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import xlrd
import xlwt
from xlutils.copy import copy


def write_excel_xls(path, sheet_name, value):
    index = len(value)                       # number of rows to write
    workbook = xlwt.Workbook()               # create a new workbook
    sheet = workbook.add_sheet(sheet_name)   # add a sheet to it
    for i in range(0, index):
        for j in range(0, len(value[i])):
            sheet.write(i, j, value[i][j])   # write cell (row i, column j)
    workbook.save(path)                      # save the workbook
    print("xls file written successfully!")


def write_excel_xls_append(path, value):
    index = len(value)                             # number of rows to append
    workbook = xlrd.open_workbook(path)            # open the existing workbook
    sheets = workbook.sheet_names()                # list all sheet names
    worksheet = workbook.sheet_by_name(sheets[0])  # take the first sheet
    rows_old = worksheet.nrows                     # rows already present
    new_workbook = copy(workbook)                  # convert the read-only xlrd object into a writable xlwt one
    new_worksheet = new_workbook.get_sheet(0)      # first sheet of the copy
    for i in range(0, index):
        for j in range(0, len(value[i])):
            # append below the existing data, i.e. start writing at row i + rows_old
            new_worksheet.write(i + rows_old, j, value[i][j])
    new_workbook.save(path)                        # save the workbook
    print("xls rows appended successfully!")


def read_excel_xls(path):
    workbook = xlrd.open_workbook(path)            # open the workbook
    sheets = workbook.sheet_names()
    worksheet = workbook.sheet_by_name(sheets[0])  # take the first sheet
    for i in range(0, worksheet.nrows):
        for j in range(0, worksheet.ncols):
            print(worksheet.cell_value(i, j), "\t", end="")  # dump the sheet row by row
        print()


def get_good(driver):
    value = []
    # scroll the page via JS so the lazily-loaded items render
    js_code = '''
        window.scrollTo(0, 5000);
    '''
    driver.execute_script(js_code)
    # wait for the data to load
    time.sleep(2)

    # 3. collect every product element on the result page
    # good_div = driver.find_element_by_id('J_goodsList')
    good_list = driver.find_elements_by_class_name('gl-item')
    for good in good_list:
        # product detail-page link
        good_url = good.find_element_by_css_selector(
            '.p-img a').get_attribute('href')
        # product name
        good_name = good.find_element_by_css_selector(
            '.p-name em').text.replace("\n", "--")
        # product price
        good_price = good.find_element_by_class_name(
            'p-price').text.replace("\n", ":")
        # number of reviews
        good_commit = good.find_element_by_class_name(
            'p-commit').text.replace("\n", " ")

        # good_content = f'''
        # link:    {good_url}
        # name:    {good_name}
        # price:   {good_price}
        # reviews: {good_commit}
        # '''
        # print(good_content)
        # with open('jd.txt', 'a', encoding='utf-8') as f:
        #     f.write(good_content)

        value.append([good_url, good_name, good_price, good_commit])
    return value


if __name__ == '__main__':
    good_name = input('Product to search for: ').strip()
    num = int(input('Number of pages to scrape: '))
    driver = webdriver.Chrome()
    driver.implicitly_wait(10)
    # 1. request the JD home page
    driver.get('https://www.jd.com/')
    # 2. type the product name and press Enter to search
    input_tag = driver.find_element_by_id('key')
    input_tag.send_keys(good_name)
    input_tag.send_keys(Keys.ENTER)
    time.sleep(2)
    # sort the results by review count ('評論數' is the sort link text on the page)
    driver.find_element_by_link_text('評論數').click()
    time.sleep(2)

    book_name_xls = good_name + '.xls'
    sheet_name_xls = good_name
    value_title = [["link", "name", "price", "reviews"], ]
    write_excel_xls(book_name_xls, sheet_name_xls, value_title)

    for i in range(0, num):
        value = get_good(driver)
        write_excel_xls_append(book_name_xls, value)
        next_tag = driver.find_element_by_class_name('pn-next')
        next_tag.click()
        time.sleep(2)

    read_excel_xls(book_name_xls)
    driver.quit()  # quit() shuts down both the browser window and the driver process
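A note on API drift: the `find_element_by_*` helpers used throughout the script were deprecated in Selenium 3 and removed in Selenium 4. The script above runs as-is only on the older releases it was written against; on a current Selenium, the same lookups go through the `By` locator API. A minimal sketch of the equivalents, assuming the JD page structure is unchanged:

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get('https://www.jd.com/')

# Selenium 4 equivalents of the deprecated helpers used above
input_tag = driver.find_element(By.ID, 'key')                # was find_element_by_id('key')
good_list = driver.find_elements(By.CLASS_NAME, 'gl-item')   # was find_elements_by_class_name('gl-item')
sort_link = driver.find_element(By.LINK_TEXT, '評論數')       # was find_element_by_link_text('評論數')
if good_list:
    url = good_list[0].find_element(
        By.CSS_SELECTOR, '.p-img a').get_attribute('href')   # was find_element_by_css_selector

Only the locator calls change; `get_attribute`, `.text`, `send_keys`, and the rest of the flow stay the same.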
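The read-copy-write dance in `write_excel_xls_append` exists because xlwt can only create new .xls files, never reopen one. If the legacy .xls format is not a hard requirement, openpyxl appends to an .xlsx workbook in place. A minimal sketch of that alternative (openpyxl is not used in the original script, so treat this as an untested substitute, not the author's method):

from openpyxl import load_workbook

def write_xlsx_append(path, rows):
    wb = load_workbook(path)   # open the existing .xlsx workbook
    ws = wb.active             # first (active) sheet
    for row in rows:
        ws.append(row)         # append each row after the last used row
    wb.save(path)

# usage: same list-of-row-lists shape as in the original script,
# e.g. write_xlsx_append('phone.xlsx', [[url, name, price, reviews]])

This removes xlrd and xlutils from the dependency list entirely, since openpyxl handles both reading and writing.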