Python crawlers

Recommended tutorial blog: https://cuiqingcai.com/1052.html

Example 1: a crawler that logs in to Broadview (broadview.com.cn)

# -*- coding:utf-8 -*-
import requests
import pyquery

"""
Crawler that logs in to Broadview (broadview.com.cn)
"""

s = requests.Session()

header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Host': 'member.broadview.com.cn',
        'Pragma': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/63.0.3239.132 Chrome/63.0.3239.132 Safari/537.36'
    }

mycookies = {}

def login():
    url = "http://member.broadview.com.cn/log-in"
    params = {
        'returnUrl': 'http://www.broadview.com.cn/'
    }

    login = s.get(url=url, headers=header, params=params)
    # print('----------------------------------------------------------')
    # print(login.cookies.get_dict())
    # print('----------------------------------------------------------')
    # print(login.headers)
    # print('----------------------------------------------------------')
    # with open("1.txt", "w") as file:
    #     file.write(login.text)

    pq = pyquery.PyQuery(login.text)('form')

    # The login form is the second <form> on the page
    inputs = pq.eq(1).find('input')
    # with open("2.txt", "a") as file:
    #     for a in inputs.items():
    #         file.write(a.outer_html()+'\n')

    # Collect every <input> of the form (name -> value), so hidden fields are kept
    datas = {}
    for a in inputs.items():
        datas[a.attr('name')] = a.attr('value')

    # Overwrite the placeholders with your own credentials
    datas['Email'] = 'email'
    datas['Password'] = 'password'

    # print(datas)

    url2 = "http://member.broadview.com.cn/log-in"
    params2 = {
        'ReturnUrl': 'http://www.broadview.com.cn/'
    }

    login2 = s.post(url=url2, headers=header, data=datas, params=params2, allow_redirects=False)
    # print('----------------------------------------------------------')
    # print(login2.headers)
    # print('----------------------------------------------------------')
    # print(login2.cookies)
    # print('----------------------------------------------------------')

    return login2.cookies

def mycount():
    # The Host header changes when moving from the member subdomain to www
    header['Host'] = "www.broadview.com.cn"
    login3 = s.get(url='http://www.broadview.com.cn/', headers=header, cookies=mycookies)
    # with open("login.html", "w") as file:
    #     file.write(login3.text)

    login4 = s.get(url='http://www.broadview.com.cn/user/notification', headers=header, cookies=mycookies)
    # with open("login2.html", "w") as file:
    #     file.write(login4.text)

if __name__ == '__main__':
    # Keep the cookies returned by login(); the Session carries them as well
    mycookies = login()
    mycount()
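
The key trick in this first script is re-submitting the login form's own <input> fields (hidden fields included) together with the credentials. A minimal, reusable sketch of that step, with the helper name form_fields chosen here only for illustration:

# -*- coding:utf-8 -*-
import pyquery

def form_fields(html, form_index=0):
    """Return {name: value} for every named <input> inside the chosen <form>.

    Mirrors the scraping loop used above; the Broadview login page happens
    to use the second form on the page (form_index=1).
    """
    form = pyquery.PyQuery(html)('form').eq(form_index)
    fields = {}
    for item in form.find('input').items():
        name = item.attr('name')
        if name:  # skip inputs without a name attribute
            fields[name] = item.attr('value')
    return fields

In the script above this amounts to datas = form_fields(login.text, 1), after which only 'Email' and 'Password' need to be overwritten.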

 

Example 2: an OSChina crawler that fetches my own blog page

# -*- coding:utf-8 -*-
"""

oschina 爬蟲  爬本身的博客信息
"""
import requests
import pyquery
import hashlib
import time

sessionr = requests.Session()

mycookies = {}

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.5',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Host': 'www.oschina.net',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:58.0) Gecko/20100101 Firefox/58.0'
}

def login():
    """
    Logging in repeatedly with a wrong password triggers a captcha,
    so make sure your account and password are correct.
    """
    # Replace the placeholders with your own credentials; the password is
    # sent to the hash_login endpoint as a SHA-1 hex digest
    datas = {
        'email': 'email',
        'pwd': hashlib.sha1("pwd".encode("utf-8")).hexdigest(),
        'verifyCode': '',
        'save_login': '0',
    }

    url = 'https://www.oschina.net/action/user/hash_login?from='

    logina = sessionr.post(url=url, headers=headers, data=datas)
    # print("------------------------------------")
    # print(logina.text)
    # print("------------------------------------")
    # print(logina.headers)
    # print("------------------------------------")
    # print(logina.cookies)
    # print("------------------------------------")

    return logina.cookies

def mainPage():
    loginb = sessionr.post(url='https://www.oschina.net/?nocache='+str(time.time()).split(".")[0], headers=headers, cookies=mycookies)
    # with open("loginb", "w") as file:
    #     file.write(loginb.text)
    return loginb.text

def myBlogHtml(mainPageText):
    # with open("main.html", "w") as file:
    #     file.write(mainPageText)

    # Get the element with class 'blog', i.e. the entry for my blog
    myblog = pyquery.PyQuery(mainPageText)(".blog")
    # print("---------------------------------------")
    # print(myblog.outer_html)
    # print("---------------------------------------")
    # Get the href that points to my blog
    # print(myblog.find('a').attr("href"))
    # print("---------------------------------------")

    bloghtml = sessionr.get(myblog.find('a').attr("href"), headers=headers, cookies=mycookies, allow_redirects=False)

    # print("------------------------------------")
    # print(bloghtml.headers['Location'])
    # print("------------------------------------")

    # The Host header changes for the blog subdomain
    headers['Host'] = 'my.oschina.net'
    # The real blog URL comes from the redirect's Location header
    urlblog = bloghtml.headers['Location']
    # Fetch my blog's HTML; parsing it is left out here
    bloghtml = sessionr.get(url=urlblog, headers=headers, cookies=mycookies)
    with open("blog.html", "w") as file:
        file.write(bloghtml.text)

if __name__ == "__main__":
    # Log in and keep the returned cookies
    mycookies = login()
    # Fetch the homepage HTML
    mainPageText = mainPage()
    # Fetch my blog's HTML
    myBlogHtml(mainPageText)
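
The script stops at saving blog.html without parsing it. As a follow-up, the saved page can be fed back into pyquery; the ".blog-item a" selector below is only a guess at the blog list markup rather than something taken from the real OSChina page, so inspect the actual HTML and adjust it:

# -*- coding:utf-8 -*-
import pyquery

def list_post_links(path="blog.html"):
    # Parse the saved blog page and print the text and href of every link
    # matched by the (assumed) ".blog-item a" selector.
    with open(path) as file:
        doc = pyquery.PyQuery(file.read())
    for link in doc(".blog-item a").items():
        print(link.text(), link.attr("href"))

if __name__ == "__main__":
    list_post_links()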