爬蟲二 cookie&正則

一.cookie應用實例

import urllib.request
import urllib.parse

'''Visit the Renren profile page while carrying a saved cookie:
        1. Log in to Renren with a browser.
        2. Capture the next request and copy the cookie it carries.
        3. Write code that sends that cookie along.
        4. If that fails, replay the full set of captured headers (last resort).'''
url = 'http://www.renren.com/971302264/profile'

# Headers captured from a logged-in browser session: the Cookie value is what
# authenticates us; the User-Agent just makes the request look like a browser.
user_agent = ('Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
              'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36')
cookie = ('anonymid=jxczgs3yw3oby9; _de=9718742970B17AD7ABC87CAAA6A740CC;'
          ' p=176166a1bb4a1d1a163443225f52e24e4; first_login_flag=1; ln_uact=18404904721; '
          'ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; '
          't=21d77ab67402235d4282cf725f991aab4; societyguester=21d77ab67402235d4282cf725f991aab4; '
          'id=971302264; xnsid=6d1019cd; ver=7.0; loginfrom=null; JSESSIONID=abcOB4RHNlyeq8Dv_7sUw; '
          'jebe_key=2819f31f-79cc-428e-b61e-8b968e2beda4%7C920b82268747e02c45f3056eeda651c7%7C1561538325515%7C1%7C1561538325729; '
          'jebe_key=2819f31f-79cc-428e-b61e-8b968e2beda4%7C920b82268747e02c45f3056eeda651c7%7C1561538325515%7C1%7C1561538325732; wp_fold=0')

headers = {'User-Agent': user_agent, 'Cookie': cookie}

request = urllib.request.Request(url, headers=headers)
response = urllib.request.urlopen(request)

# Save the page so it can be opened locally to confirm the cookie worked.
with open('ren.html', 'wb') as fp:
    fp.write(response.read())

2、編程登陸人人網

import urllib.request
import urllib.parse
import http.cookiejar

'''Log in to Renren from Python:
        1. Log in with a browser and capture the login request.
        2. Take the target URL and the POST form fields from the capture.
        3. Replay the request with that information.'''

# Build an opener that keeps cookies: a successful login stores the session
# cookie in the jar, so later requests through the same opener stay logged in.
cj = http.cookiejar.CookieJar()                      # cookie storage
handler = urllib.request.HTTPCookieProcessor(cj)     # cookie-aware handler
opener = urllib.request.build_opener(handler)        # opener that uses the handler

# NOTE: the captured URL had a stray trailing space; it is removed here because
# a space is not a valid URL character and can make the request fail.
post_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2019531649636'

# Form fields captured from the browser's login request. The password is
# already hashed client-side; 'rkey' is a token from the capture session.
form_data = {'email':'18404904721',
            'icode':'',
            'origURL':'http://www.renren.com/home',
            'domain':'renren.com',
            'key_id':'1',
            'captcha_type':'web_login',
            'password':'641fd8bce69ff3a3acfb14fc094fefe9487f9b4f843d18063fcce22e0a468066',
            'rkey':'2c3ae276413c03a1eb5159d355895bd0',
            'f':'http%3A%2F%2Fwww.renren.com%2F971302264%2Fprofile'}

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',}

form_data = urllib.parse.urlencode(form_data).encode()       # encode the POST body

req = urllib.request.Request(url=post_url,headers=headers)     # build the request object

rep = opener.open(req,data=form_data)            # send the login POST

# print(rep.read().decode())

'''Fetch the profile page to verify the login succeeded; the opener
automatically attaches the cookies saved during login.'''
get_url = 'http://www.renren.com/971302264/profile'

req1 = urllib.request.Request(url=get_url,headers=headers)

rep1 = opener.open(req1)     # same opener, so the session cookie rides along

with open('guanli.html','wb') as fp:
    fp.write(rep1.read())

3、正則表達式提取內容

import re

'''()子模式'''
# Grouping / backreferences: \1 and \2 must repeat the exact text captured
# by the corresponding (\w+) groups, so the closing tags must mirror the
# opening ones.
# string = '<div><span>悟空</span></div>'
# '''匹配上面的字符串,標籤是對稱的'''
# pattern = re.compile(r'<(\w+)><(\w+)>\w+</\2></\1>')
# ret = pattern.search(string)
# print(ret)

'''貪婪與非貪婪'''
# Greedy vs. non-greedy: '.*' consumes up to the LAST '</div>', while
# '.*?' stops at the FIRST one.
# string = '<div>八戒</div></div></div>'
# '''匹配上面的字符串,標籤是對稱的'''
# pattern1 = re.compile(r'<div>.*</div>')
# pattern2 = re.compile(r'<div>.*?</div>')
# ret1 = pattern1.search(string)
# ret2 = pattern2.search(string)
# print(ret1)
# print(ret2)

'''re.M多行匹配'''
# re.M (multi-line): '^' matches at the start of every line, so both
# 'beautiful' and 'beach' are found.
string = '''beautiful'
beach'''
pattern = re.compile(r'^bea',re.M)
ret = pattern.findall(string)
print(ret)

'''re.S單行匹配'''
# re.S (dot-all): '.' also matches newlines, so '.*' spans the whole text.
# string = '<div>《沁園春-雪》' \
#          '北國風光,千里冰封,萬里雪飄。' \
#          '望長城內外,唯餘莽莽。' \
#          '大河上下,頓失滔滔。</div>'
# pattern = re.compile(r'.*',re.S)
# ret = pattern.search(string)
# print(ret)

'''re.I 單忽略大小寫'''
# re.I: case-insensitive matching.
# string = 'Life Is Short You Must Be Sexy'
# pattern = re.compile(r'life is short you must be sexy',re.I)
# ret = pattern.search(string)
# print(ret)

'''正則替換'''
# Substitution: both module-level re.sub and the compiled pattern's .sub
# replace every occurrence of 'Sexy' in the string.
string = 'Life Is Short You Must Be Sexy'
pattern = re.compile(r'Sexy')
ret = re.sub(pattern,'sao',string)
ret2 = pattern.sub('lang',string)
print(ret)
print(ret2)

def func(a):
    """Replacement callback: subtract 3 from the matched integer."""
    return str(int(a.group()) - 3)

# re.sub also accepts a callable: each match object is passed to func and
# the match is replaced by whatever string func returns.
string = '最佳身高爲175cm'
pattern = re.compile(r'\d+')
ret2 = pattern.sub(func, string)
print(ret2)

4、正則例子-爬取糗圖圖片

import urllib.request
import urllib.parse
import re
import  os

def create_request(url, page):
    """Build a Request for one listing page: '<url><page>/' with a browser UA."""
    page_url = f'{url}{page}/'
    ua = ('Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
          'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36')
    return urllib.request.Request(url=page_url, headers={'User-Agent': ua})

def download_image(content):
    """Extract image URLs from a listing page's HTML and download each into qiutu/.

    content: decoded HTML of one qiushibaike 'pic' listing page.
    Side effects only (creates the 'qiutu' directory and writes image files);
    returns None.
    """
    pattern = re.compile(r'<div class="thumb">.*?<img src="(.*?)" .*?>.*?</div>',re.S)
    img_list = pattern.findall(content)

    dirname = 'qiutu'
    # Create the target directory once, before the loop (the original
    # re-checked os.path.exists on every iteration).
    if img_list:
        os.makedirs(dirname, exist_ok=True)

    for img_src in img_list:
        img_url = 'https:' + img_src            # src is protocol-relative
        img_name = img_url.split('/')[-1]       # last path segment as filename
        filepath = dirname + '/' + img_name
        urllib.request.urlretrieve(img_url, filepath)

def main():
    """Prompt for a page range and download the images on every listing page."""
    url = 'https://www.qiushibaike.com/pic/page/'

    start_page = int(input('起始頁碼:'))
    end_page = int(input('結束頁碼:'))

    # end_page + 1 so the last page the user asked for is included — the
    # original range(start_page, end_page) stopped one page short (and
    # downloaded nothing at all for start == end).
    for page in range(start_page, end_page + 1):
        print('第%s頁開始下載...' %page)
        # build the request for this page
        req = create_request(url,page)

        # send it and decode the response body
        rep = urllib.request.urlopen(req).read().decode()

        # parse the HTML and download the images it references
        download_image(rep)
        print('第%s頁結束下載...' % page)

if __name__ == '__main__':
    main()

5、正則例子-爬取語錄

import urllib.request
import urllib.parse
import re
import  os

def create_request(url,page=None):
    """Build a Request carrying a browser User-Agent.

    If page is given, the final URL is '<url><page>.html' (listing pages);
    otherwise url is used unchanged (article detail pages).
    """
    if page is not None:    # idiomatic None test ('!= None' compares by equality)
        url = url + str(page) + '.html'
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}

    req = urllib.request.Request(url=url,headers=header)
    return req

def get_content(href):
    """Download an article page and return its body text as an HTML fragment.

    Fetches href, extracts the first <div class="neirong"> block, and strips
    any <img ...> tags from it. Returns '' when the page has no such div
    (the original raised IndexError on content_list[0] in that case).
    """
    request = create_request(href)
    content_html = urllib.request.urlopen(request).read().decode()
    pattern = re.compile(r'<div class="neirong">(.*?)</div>', re.S)
    content_list = pattern.findall(content_html)
    if not content_list:    # layout changed or empty article — nothing to return
        return ''
    pat = re.compile(r'<img .*?>')
    text = pat.sub('', content_list[0])
    return text

def parse_html(content):
    """Parse one listing page, fetch each linked article, append it to yulu.html.

    content: decoded HTML of a list page. For every article entry found,
    downloads the article body via get_content() and appends a small HTML
    document (title + body) to yulu.html. Returns None.
    """
    # Capture the relative article path and its title from each <h3> entry.
    pattern = re.compile(r'<h3><a href="/mingrenjingdianyulu/(\d+/\d+/\d+\.html)"><b>(.*?)</b></a></h3>', re.S)
    title_list = pattern.findall(content)

    for path, title in title_list:

        href = 'http://www.yikexun.cn/mingrenjingdianyulu/' + path     # absolute article URL

        # fetch the article body (renamed from 'content' to stop shadowing the parameter)
        body = get_content(href)

        # The original template never closed <html>; the closing tag is added here.
        string = '<!DOCTYPE html>' \
                 '<html lang="en">' \
                 '<head>' \
                 '  <meta charset="UTF-8">' \
                 '  <title>Title</title>' \
                 '</head>' \
                 '<body>' \
                 '  <h1>%s</h1>%s' \
                 '</body>' \
                 '</html>' %(title,body)

        with open('yulu.html','a',encoding='utf8') as fp:
            fp.write(string)

def main():
    """Prompt for a page range and scrape every listing page of quotes."""
    url = 'http://www.yikexun.cn/mingrenjingdianyulu/list_10_'

    start_page = int(input('起始頁碼:'))
    end_page = int(input('結束頁碼:'))

    page = start_page
    while page <= end_page:      # inclusive of end_page
        print('第%s頁開始下載...' %page)

        # build and send the request for this listing page
        request = create_request(url, page)
        listing_html = urllib.request.urlopen(request).read().decode()

        # extract the articles and append them to the output file
        parse_html(listing_html)
        print('第%s頁結束下載...' % page)
        page += 1

if __name__ == '__main__':
    main()
相關文章
相關標籤/搜索