Recommended tutorial blog: https://cuiqingcai.com/1052.html
Example 1: a crawler that logs in to Broadview (member.broadview.com.cn)
# -*- coding:utf-8 -*-
"""
Crawler that logs in to Broadview (member.broadview.com.cn).
"""
import requests
import pyquery

s = requests.Session()
header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-US,en;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Host': 'member.broadview.com.cn',
    'Pragma': 'no-cache',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Ubuntu Chromium/63.0.3239.132 Chrome/63.0.3239.132 Safari/537.36'
}
mycookies = {}


def login():
    # Fetch the login page first, so the session picks up its initial cookies
    # and we can read the hidden fields of the login form.
    url = "http://member.broadview.com.cn/log-in"
    params = {
        'returnUrl': 'http://www.broadview.com.cn/'
    }
    login_page = s.get(url=url, headers=header, params=params)
    # print(login_page.cookies.get_dict())
    # print(login_page.headers)
    # with open("1.txt", "w") as file:
    #     file.write(login_page.text)

    # The login form is the second <form> on the page; collect all of its <input> fields.
    pq = pyquery.PyQuery(login_page.text)('form')
    inputs = pq.eq(1).find('input')
    # with open("2.txt", "a") as file:
    #     for a in inputs.items():
    #         file.write(a.outer_html() + '\n')

    # Build the POST body from the form fields, then fill in the credentials.
    datas = {}
    for a in inputs.items():
        datas[a.attr('name')] = a.attr('value')
    datas['Email'] = 'email'          # replace with your account email
    datas['Password'] = 'password'    # replace with your password
    # print(datas)

    url2 = "http://member.broadview.com.cn/log-in"
    params2 = {
        'ReturnUrl': 'http://www.broadview.com.cn/'
    }
    login2 = s.post(url=url2, headers=header, data=datas,
                    params=params2, allow_redirects=False)
    # print(login2.headers)
    # print(login2.cookies)
    return login2.cookies


def mycount():
    # After logging in we visit the main site, so the Host header changes.
    header['Host'] = "www.broadview.com.cn"
    login3 = s.get(url='http://www.broadview.com.cn/',
                   headers=header, cookies=mycookies)
    # with open("login.html", "w") as file:
    #     file.write(login3.text)
    login4 = s.get(url='http://www.broadview.com.cn/user/notification',
                   headers=header, cookies=mycookies)
    # with open("login2.html", "w") as file:
    #     file.write(login4.text)


if __name__ == '__main__':
    mycookies = login()
    mycount()
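Once login() has succeeded, the session cookies can also be saved to disk and loaded back on a later run, so repeated runs do not have to log in every time. The snippet below is only a sketch I am adding here, not part of the original script; the cookies.json file name is just an example.

import json
import requests

def save_cookies(session, path="cookies.json"):
    # Flatten the session's CookieJar into a plain dict and write it out as JSON.
    with open(path, "w") as f:
        json.dump(requests.utils.dict_from_cookiejar(session.cookies), f)

def load_cookies(session, path="cookies.json"):
    # Rebuild the CookieJar from the saved dict and attach it to the session.
    with open(path) as f:
        session.cookies = requests.utils.cookiejar_from_dict(json.load(f))

For example, you could call save_cookies(s) right after login() and load_cookies(s) at the start of a later run, skipping the form scraping when the saved cookies are still valid.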
Example 2: an oschina crawler that fetches my own blog pages
# -*- coding:utf-8 -*- """ oschina 爬蟲 爬本身的博客信息 """ import requests import pyquery import hashlib import time sessionr = requests.Session() mycookies = {} headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate, br', 'Accept-Language': 'en-US,en;q=0.5', 'Cache-Control': 'max-age=0', 'Connection': 'keep-alive', 'Host': 'www.oschina.net', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:58.0) Gecko/20100101 Firefox/58.0' } def login(): """ 密碼錯誤 登陸屢次會出現驗證碼 因此保證本身的賬號密碼正確 """ datas = { 'email': 'email', 'pwd': hashlib.sha1("pwd".encode("utf-8")).hexdigest(), 'verifyCode': '', 'save_login': '0', } url = 'https://www.oschina.net/action/user/hash_login?from=' logina = sessionr.post(url=url, headers=headers, data=datas) # print("------------------------------------") # print(logina.text) # print("------------------------------------") # print(logina.headers) # print("------------------------------------") # print(logina.cookies) # print("------------------------------------") return logina.cookies def mainPage(): loginb = sessionr.post(url='https://www.oschina.net/?nocache='+str(time.time()).split(".")[0], headers=headers, cookies=mycookies) # with open("loginb", "w") as file: # file.write(loginb.text) return loginb.text def myBlogHtml(mainPageText): # with open("main.html", "w") as file: # file.write(mainPageText) #獲取到class爲blog的內容 也就是個人博客 myblog = pyquery.PyQuery(mainPageText)(".blog") # print("---------------------------------------") # print(myblog.outer_html) # print("---------------------------------------") # 獲取到個人博客連接地址 # print(myblog.find('a').attr("href")) # print("---------------------------------------") bloghtml = sessionr.get(myblog.find('a').attr("href"), headers=headers, cookies=mycookies, allow_redirects=False) # print("------------------------------------") # print(bloghtml.headers['Location']) # print("------------------------------------") # Host發生了變化 headers['Host'] = 'my.oschina.net' # 須要從headers['Location']獲取訪問 本身的連接地址 urlblog = bloghtml.headers['Location'] #獲取到個人博客html內容 具體的就不解析了 bloghtml = sessionr.get(url=urlblog, headers=headers, cookies=mycookies) with open("blog.html", "w") as file: file.write(bloghtml.text) if __name__ == "__main__": # 登陸 獲取到cookie mycookies = login() # 得到首頁的html內容 mainPageText = mainPage() # 得到個人博客的html內容 myBlogHtml(mainPageText)