1.本次代碼是在python2上運行通過的,python3的只需改2行代碼,用到的其它python模塊:selenium、requests、bs4
2.爬取目標網站,個人博客:https://home.cnblogs.com/u/yoyoketang
爬取內容:爬個人博客的全部粉絲的名稱,並保存到txt
3.因爲博客園的登陸是需要人機驗證的,因此是無法直接用帳號密碼登陸,需藉助selenium登陸
1.大前提:先手工操作瀏覽器,登陸個人博客,並記住密碼
(保證關掉瀏覽器後,下次打開瀏覽器訪問個人博客時候是登陸狀態)
2.selenium默認啓動瀏覽器是一個空的配置,默認不加載配置緩存文件,這裏先得找到對應瀏覽器的配置文件地址,以火狐瀏覽器爲例
3.使用driver.get_cookies()方法獲取瀏覽器的cookies
# coding:utf-8
import requests
from selenium import webdriver
from bs4 import BeautifulSoup
import re
import time

# Path of the Firefox profile directory to load (the one that is already
# logged in to the blog).
profile_directory = r'C:\Users\admin\AppData\Roaming\Mozilla\Firefox\Profiles\yn80ouvt.default'

# Load the profile.
profile = webdriver.FirefoxProfile(profile_directory)
# Start the browser with that profile.
driver = webdriver.Firefox(profile)
try:
    driver.get("https://home.cnblogs.com/u/yoyoketang/followers/")
    time.sleep(3)  # fixed pause after navigation, before reading cookies
    cookies = driver.get_cookies()  # cookies held by the browser session
    print(cookies)
finally:
    # Close the browser even when loading the page or reading cookies fails,
    # so a failed run does not leave a Firefox process behind.
    driver.quit()
(注:要是這裏腳本啓動瀏覽器後,打開的博客頁面是未登陸的,後面內容都不用看了,先檢查配置文件是不是寫錯了)
1.瀏覽器的cookies獲取到後,接下來用requests去建一個session,在session裏添加登陸成功後的cookies
# Create the requests session that later requests will go through.
s = requests.session()

# Copy each browser cookie (name/value pair) into a cookie jar ...
jar = requests.cookies.RequestsCookieJar()
for cookie in cookies:
    jar.set(cookie["name"], cookie["value"])

# ... and merge that jar into the session's own cookies.
s.cookies.update(jar)
1.因爲個人粉絲的數據是分頁展現的,這裏一次只能請求到45個,因此先獲取粉絲總數,而後計算出總的頁數
# Request the first page of the followers list.
r1 = s.get("https://home.cnblogs.com/u/yoyoketang/relation/followers")
soup = BeautifulSoup(r1.content, "html.parser")

# The follower count is read from the text of the element with class
# "current_nav": the regex captures whatever sits inside the parentheses.
fensinub = soup.find_all(class_="current_nav")
print(fensinub[0].string)
# Backslashes are doubled so the literal has no invalid escape sequences;
# the resulting pattern is unchanged.
num = re.findall(u"個人粉絲\\((.+?)\\)", fensinub[0].string)
print(u"個人粉絲數量:%s" % str(num[0]))

# 45 followers per page. Round up instead of "divide, then add one", so an
# exact multiple of 45 does not produce an extra page; always at least 1.
ye = max(1, (int(num[0]) + 44) // 45)
print(u"總共分頁數:%s" % str(ye))
# Save the follower names from the first page (already parsed into soup).
fensi = soup.find_all(class_="avatar_name")
for tag in fensi:
    # Drop newlines and spaces from the tag text.
    name = tag.string.replace("\n", "").replace(" ", "")
    print(name)
    with open("name.txt", "a") as f:  # append
        # Python 2 form: encode the unicode name to UTF-8 bytes before writing.
        f.write(name.encode("utf-8") + "\n")

# Fetch and save the names from page 2 up to the last page.
for page in range(2, ye + 1):
    r2 = s.get("https://home.cnblogs.com/u/yoyoketang/relation/followers?page=%s" % str(page))
    # Parse the page that was just fetched (r2); parsing r1 here would
    # re-save the first page's names for every page.
    soup = BeautifulSoup(r2.content, "html.parser")
    fensi = soup.find_all(class_="avatar_name")
    for tag in fensi:
        name = tag.string.replace("\n", "").replace(" ", "")
        print(name)
        with open("name.txt", "a") as f:  # append
            f.write(name.encode("utf-8") + "\n")
# coding:utf-8
import requests
from selenium import webdriver
from bs4 import BeautifulSoup
import re
import time

# Path of the Firefox profile directory to load (the one that is already
# logged in to the blog).
profile_directory = r'C:\Users\admin\AppData\Roaming\Mozilla\Firefox\Profiles\yn80ouvt.default'

s = requests.session()  # session shared by every request below
url = "https://home.cnblogs.com/u/yoyoketang"


def get_cookies(url):
    '''Start Firefox through selenium and return the browser's cookies.

    url: base address of the blog home; "/followers" is appended to it.
    Returns the list given by driver.get_cookies(), or None when anything
    fails (the error is printed instead of raised).
    '''
    try:
        # Load the profile and start the browser with it.
        profile = webdriver.FirefoxProfile(profile_directory)
        driver = webdriver.Firefox(profile)
        try:
            driver.get(url+"/followers")
            time.sleep(3)  # fixed pause after navigation, before reading cookies
            cookies = driver.get_cookies()
            print(cookies)
            return cookies
        finally:
            # Close the browser on failure too, so no Firefox process leaks.
            driver.quit()
    except Exception as msg:
        print(u"啓動瀏覽器報錯了:%s" %str(msg))


def add_cookies(cookies):
    '''Copy the given browser cookies into the shared session s.

    cookies: iterable of dicts with "name" and "value" keys.
    Errors are printed, not raised.
    '''
    try:
        c = requests.cookies.RequestsCookieJar()
        for i in cookies:
            c.set(i["name"], i['value'])
        s.cookies.update(c)  # merge the jar into the session's cookies
    except Exception as msg:
        print(u"添加cookies的時候報錯了:%s" % str(msg))


def get_ye_nub(url):
    '''Return the number of follower pages (45 followers per page).

    url: base address of the blog home.
    Returns 1 when the count cannot be fetched or parsed (the error is
    printed).
    '''
    try:
        r1 = s.get(url+"/relation/followers")
        soup = BeautifulSoup(r1.content, "html.parser")
        # The follower count is read from the text of the element with class
        # "current_nav": the regex captures what sits inside the parentheses.
        fensinub = soup.find_all(class_="current_nav")
        print(fensinub[0].string)
        # Backslashes are doubled so the literal has no invalid escape
        # sequences; the resulting pattern is unchanged.
        num = re.findall(u"個人粉絲\\((.+?)\\)", fensinub[0].string)
        print(u"個人粉絲數量:%s"%str(num[0]))
        # Round up instead of "divide, then add one", so an exact multiple
        # of 45 does not produce an extra page; always at least 1.
        ye = max(1, (int(num[0]) + 44) // 45)
        print(u"總共分頁數:%s"%str(ye))
        return ye
    except Exception as msg:
        print(u"獲取粉絲頁數報錯了,默認返回數量1 :%s"%str(msg))
        return 1


def save_name(nub):
    '''Fetch follower page number nub and append its names to name.txt.

    nub: 1-based page number; values <= 1 request the first page.
    Errors are printed, not raised.
    '''
    try:
        if nub <= 1:
            url_page = url+"/relation/followers"
        else:
            url_page = url+"/relation/followers?page=%s" % str(nub)
        print(u"正在抓取的頁面:%s" %url_page)
        # Certificate verification is left on, as in get_ye_nub: the session
        # carries login cookies.
        r2 = s.get(url_page)
        soup = BeautifulSoup(r2.content, "html.parser")
        fensi = soup.find_all(class_="avatar_name")
        # Open the file once per page rather than once per name.
        # Python 2 form. On Python 3 use
        # open("name.txt", "a", encoding="utf-8") and f.write(name+"\n").
        with open("name.txt", "a") as f:  # append
            for i in fensi:
                # Drop newlines and spaces from the tag text.
                name = i.string.replace("\n", "").replace(" ","")
                print(name)
                f.write(name.encode("utf-8")+"\n")
    except Exception as msg:
        print(u"抓取粉絲名稱過程當中報錯了 :%s"%str(msg))


if __name__ == "__main__":
    cookies = get_cookies(url)
    add_cookies(cookies)
    n = get_ye_nub(url)
    for i in range(1, n+1):
        save_name(i)
---------------------------------python接口自動化完整版-------------------------
全書購買地址 https://yuedu.baidu.com/ebook/585ab168302b3169a45177232f60ddccda38e695
作者:上海-悠悠 QQ交流羣:588402570
也可以關注下我的個人公衆號: