1. A cookie usage example
import urllib.request
import urllib.parse

'''Visit the Renren profile page carrying a cookie:
1. Log in to Renren in a browser
2. On the next request, capture the traffic and grab the Cookie header it sends
3. Write code that sends that cookie along
4. If that is not enough, send every request header from the capture (last resort)'''

url = 'http://www.renren.com/971302264/profile'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    'Cookie': 'anonymid=jxczgs3yw3oby9; _de=9718742970B17AD7ABC87CAAA6A740CC;'
              ' p=176166a1bb4a1d1a163443225f52e24e4; first_login_flag=1; ln_uact=18404904721; '
              'ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; '
              't=21d77ab67402235d4282cf725f991aab4; societyguester=21d77ab67402235d4282cf725f991aab4; '
              'id=971302264; xnsid=6d1019cd; ver=7.0; loginfrom=null; JSESSIONID=abcOB4RHNlyeq8Dv_7sUw; '
              'jebe_key=2819f31f-79cc-428e-b61e-8b968e2beda4%7C920b82268747e02c45f3056eeda651c7%7C1561538325515%7C1%7C1561538325729; '
              'jebe_key=2819f31f-79cc-428e-b61e-8b968e2beda4%7C920b82268747e02c45f3056eeda651c7%7C1561538325515%7C1%7C1561538325732; wp_fold=0'
}

req = urllib.request.Request(url, headers=headers)   # request object carrying the copied headers
rep = urllib.request.urlopen(req)                    # send the GET request
with open('ren.html', 'wb') as fp:                   # save the profile page
    fp.write(rep.read())
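The Cookie header above is nothing more than a semicolon-separated list of name=value pairs copied straight out of the browser's request. If you want to inspect or reuse individual cookies, you can split the copied string apart first. A minimal sketch, with a shortened, made-up cookie string standing in for the real one:

# A browser-copied Cookie header looks like 'name=value; name=value; ...'.
# The values below are shortened placeholders, not real session cookies.
cookie_header = 'anonymid=jxczgs3yw3oby9; _de=9718742970B17AD7; wp_fold=0'

cookies = dict(
    pair.strip().split('=', 1)            # split each pair on the first '=' only
    for pair in cookie_header.split(';')
    if '=' in pair
)
print(cookies)  # {'anonymid': 'jxczgs3yw3oby9', '_de': '9718742970B17AD7', 'wp_fold': '0'}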
2. Logging into Renren programmatically
import urllib.request
import urllib.parse
import http.cookiejar

'''Logging into Renren from Python:
1. Log in with a browser and capture the traffic
2. Grab the target URL and the POST form data
3. Send a request carrying that information'''

'''Build an opener like this; the cookies set during login are kept inside it'''
cj = http.cookiejar.CookieJar()                       # create a CookieJar object
handler = urllib.request.HTTPCookieProcessor(cj)      # create the cookie handler
opener = urllib.request.build_opener(handler)         # create the opener

post_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2019531649636'
form_data = {'email': '18404904721',
             'icode': '',
             'origURL': 'http://www.renren.com/home',
             'domain': 'renren.com',
             'key_id': '1',
             'captcha_type': 'web_login',
             'password': '641fd8bce69ff3a3acfb14fc094fefe9487f9b4f843d18063fcce22e0a468066',
             'rkey': '2c3ae276413c03a1eb5159d355895bd0',
             'f': 'http%3A%2F%2Fwww.renren.com%2F971302264%2Fprofile'}
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}

form_data = urllib.parse.urlencode(form_data).encode()        # encode the POST form
req = urllib.request.Request(url=post_url, headers=headers)   # build the request object
rep = opener.open(req, data=form_data)                        # send the POST request
# print(rep.read().decode())

'''Open the profile page to verify that the login succeeded'''
get_url = 'http://www.renren.com/971302264/profile'
req1 = urllib.request.Request(url=get_url, headers=headers)
rep1 = opener.open(req1)      # reuse the opener; it now carries the login cookies
with open('guanli.html', 'wb') as fp:
    fp.write(rep1.read())
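A CookieJar only lives for the duration of the process, so every run has to log in again. If you want the login cookies to survive between runs, http.cookiejar also provides MozillaCookieJar, which can save them to a text file and load them back. A minimal sketch, assuming the login POST above is performed through this opener; the filename cookies.txt is an arbitrary choice:

import http.cookiejar
import urllib.request

# MozillaCookieJar persists cookies in a Netscape-format text file.
cj = http.cookiejar.MozillaCookieJar('cookies.txt')
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))

# ... perform the login POST with this opener, exactly as in the code above ...

# Save whatever cookies the server set, including session cookies.
cj.save(ignore_discard=True, ignore_expires=True)

# A later run can load them back instead of logging in again.
cj.load('cookies.txt', ignore_discard=True, ignore_expires=True)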
3. Extracting content with regular expressions
import re

'''() sub-pattern (group) with back-references'''
# string = '<div><span>悟空</span></div>'
# '''Match the string above; the opening and closing tags must pair up'''
# pattern = re.compile(r'<(\w+)><(\w+)>\w+</\2></\1>')
# ret = pattern.search(string)
# print(ret)

'''Greedy vs. non-greedy'''
# string = '<div>八戒</div></div></div>'
# '''Match the string above; the opening and closing tags must pair up'''
# pattern1 = re.compile(r'<div>.*</div>')
# pattern2 = re.compile(r'<div>.*?</div>')
# ret1 = pattern1.search(string)
# ret2 = pattern2.search(string)
# print(ret1)
# print(ret2)

'''re.M: multi-line matching (^ and $ match at every line)'''
string = '''beautiful
beach'''
pattern = re.compile(r'^bea', re.M)
ret = pattern.findall(string)
print(ret)   # ['bea', 'bea']

'''re.S: dot also matches newlines (DOTALL)'''
# string = '<div>《沁園春-雪》' \
#          '北國風光,千里冰封,萬里雪飄。' \
#          '望長城內外,唯餘莽莽。' \
#          '大河上下,頓失滔滔。</div>'
# pattern = re.compile(r'.*', re.S)
# ret = pattern.search(string)
# print(ret)

'''re.I: ignore case'''
# string = 'Life Is Short You Must Be Sexy'
# pattern = re.compile(r'life is short you must be sexy', re.I)
# ret = pattern.search(string)
# print(ret)

'''Regex substitution'''
string = 'Life Is Short You Must Be Sexy'
pattern = re.compile(r'Sexy')
ret = re.sub(pattern, 'sao', string)      # module-level sub with a compiled pattern
ret2 = pattern.sub('lang', string)        # sub called on the pattern object
print(ret)
print(ret2)

'''Substitution with a function: the replacement is computed from each match'''
def func(a):
    ret = int(a.group())
    return str(ret - 3)

string = '最佳身高爲175cm'
pattern = re.compile(r'\d+')
ret2 = pattern.sub(func, string)
print(ret2)   # 最佳身高爲172cm
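The greedy versus non-greedy difference is easiest to see with both matches printed side by side; a short, self-contained illustration of the commented-out example above:

import re

string = '<div>八戒</div></div></div>'

greedy = re.search(r'<div>.*</div>', string)    # .* consumes as much as possible
lazy = re.search(r'<div>.*?</div>', string)     # .*? stops at the first </div>

print(greedy.group())   # <div>八戒</div></div></div>
print(lazy.group())     # <div>八戒</div>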
4. Regex example: scraping images from Qiushibaike
import urllib.request
import urllib.parse
import re
import os

def create_request(url, page):
    post_url = url + str(page) + '/'
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                            'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    req = urllib.request.Request(url=post_url, headers=header)
    return req

def download_image(content):
    pattern = re.compile(r'<div class="thumb">.*?<img src="(.*?)" .*?>.*?</div>', re.S)
    img_list = pattern.findall(content)
    # print(img_list)
    for img_src in img_list:
        img_url = 'https:' + img_src          # build the full image URL
        dirname = 'qiutu'
        if not os.path.exists(dirname):
            os.mkdir(dirname)
        img_name = img_url.split('/')[-1]
        filepath = dirname + '/' + img_name
        urllib.request.urlretrieve(img_url, filepath)

def main():
    url = 'https://www.qiushibaike.com/pic/page/'
    start_page = int(input('Start page: '))
    end_page = int(input('End page: '))
    for page in range(start_page, end_page + 1):   # include the end page
        print('Downloading page %s ...' % page)
        # build the request
        req = create_request(url, page)
        # send the request and get the page source
        rep = urllib.request.urlopen(req).read().decode()
        # parse the source and download the images
        download_image(rep)
        print('Finished page %s' % page)

if __name__ == '__main__':
    main()
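urlretrieve raises as soon as one image URL is broken, which abandons the rest of the page. A small defensive variant of the download loop, as a sketch (download_images is a hypothetical helper, not part of the original script), that skips failures and pauses briefly between downloads:

import os
import time
import urllib.request

def download_images(img_list, dirname='qiutu'):
    # Hypothetical helper: download each image, skipping failures instead of aborting.
    os.makedirs(dirname, exist_ok=True)
    for img_src in img_list:
        img_url = 'https:' + img_src
        filepath = os.path.join(dirname, img_url.split('/')[-1])
        try:
            urllib.request.urlretrieve(img_url, filepath)
        except Exception as exc:
            print('skipping %s: %s' % (img_url, exc))
        time.sleep(0.5)   # small delay so the server is not hammered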
5. Regex example: scraping quotes
import urllib.request
import urllib.parse
import re
import os

def create_request(url, page=None):
    if page is not None:
        url = url + str(page) + '.html'
    # print(url)
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                            'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    req = urllib.request.Request(url=url, headers=header)
    return req

def get_content(href):
    request = create_request(href)
    content_html = urllib.request.urlopen(request).read().decode()
    pattern = re.compile(r'<div class="neirong">(.*?)</div>', re.S)
    content_list = pattern.findall(content_html)
    # print(content_list)
    pat = re.compile(r'<img .*?>')
    text = pat.sub('', content_list[0])    # strip <img> tags from the quote body
    return text

def parse_html(content):
    # extract the detail-page links and titles with a regex
    pattern = re.compile(r'<h3><a href="/mingrenjingdianyulu/(\d+/\d+/\d+\.html)"><b>(.*?)</b></a></h3>', re.S)
    title_list = pattern.findall(content)
    # print(title_list)
    for i in title_list:
        href = 'http://www.yikexun.cn/mingrenjingdianyulu/' + i[0]   # build the detail-page URL
        title = i[1]
        # request the detail page and get the quote body
        content = get_content(href)
        # append it to the output file
        string = '<!DOCTYPE html>' \
                 '<html lang="en">' \
                 '<head>' \
                 '    <meta charset="UTF-8">' \
                 '    <title>Title</title>' \
                 '</head>' \
                 '<body>' \
                 '    <h1>%s</h1>%s' \
                 '</body>' % (title, content)
        with open('yulu.html', 'a', encoding='utf8') as fp:
            fp.write(string)

def main():
    url = 'http://www.yikexun.cn/mingrenjingdianyulu/list_10_'
    start_page = int(input('Start page: '))
    end_page = int(input('End page: '))
    for page in range(start_page, end_page + 1):
        print('Downloading page %s ...' % page)
        # build the request
        req = create_request(url, page)
        # send the request and get the page source
        rep = urllib.request.urlopen(req).read().decode()
        # parse the list page and save the quotes
        parse_html(rep)
        print('Finished page %s' % page)

if __name__ == '__main__':
    main()
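One thing worth noting about parse_html above: it appends a complete HTML document (doctype, head, body) to yulu.html for every single quote, so the saved file ends up containing many nested page skeletons. A cleaner pattern is to write the skeleton once and append only fragments; a sketch with made-up helper names (init_output, append_quote, close_output), not part of the original script:

def init_output(path='yulu.html'):
    # Write the HTML skeleton once, truncating any previous run.
    with open(path, 'w', encoding='utf8') as fp:
        fp.write('<!DOCTYPE html><html lang="en"><head>'
                 '<meta charset="UTF-8"><title>Quotes</title></head><body>')

def append_quote(title, content, path='yulu.html'):
    # Append one quote as a heading plus its body fragment.
    with open(path, 'a', encoding='utf8') as fp:
        fp.write('<h1>%s</h1>%s' % (title, content))

def close_output(path='yulu.html'):
    # Close the body and html tags after the last quote.
    with open(path, 'a', encoding='utf8') as fp:
        fp.write('</body></html>')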