零python基礎--爬蟲實踐總結

時間 2020-06-29

標籤 python 基礎爬蟲實踐總結欄目 Python 简体版

原文鏈接

網絡爬蟲，是一種按照一定的規則，自動地抓取萬維網信息的程序或者腳本。

爬蟲主要應對的問題：1.http請求 2.解析html源碼 3.應對反爬機制。

覺得爬蟲挺有意思的，剛好看到知乎有人分享的一個爬蟲小教程：https://zhuanlan.zhihu.com/p/20410446 立馬學起！

主要步驟：

一、按照教程下載python、配置環境變量，學習使用pip命令、安裝開發ide：pycharm

二、學習使用python發送請求獲取頁面

三、使用chrome開發者工具觀察頁面結構特徵，使用beautifulsoup解析頁面dom

四、保存頁面到本地文件

遇到的主要問題：

1.python基本語法：變量、函數、循環、異常、條件語句、建立目錄、寫文件。可以參考《Python基礎教程》

2.python縮進很重要，縮進決定語句分組和層次，在循環的時候尤其要看清楚。

3.編碼格式：從代碼編輯、到網頁內容、中文文件名，無處不有編碼格式的問題。可以參考《Python編碼問題整理》

4.beautifulsoup使用。可以參考《Python爬蟲利器二之Beautiful Soup的用法》

5.抓取規則失效，重新分析失效頁面，重新選擇頁面特徵。

實踐，用爬蟲獲取網頁上的試題（自動抓取下一頁）代碼：

# encoding=utf8 
#設置編輯源py文件的編碼格式爲utf8
import requests, sys, chardet, os, time, random, time
from bs4 import BeautifulSoup

reload(sys)  #必需要從新加載
sys.setdefaultencoding("utf8")

print sys.getdefaultencoding(), sys.getfilesystemencoding()  # utf8 mbcs:MBCS(Multi-ByteChactacterSystem,即多字節字符系統)它是編碼的一種類型,而不是某個特定編碼的名稱
path = os.getcwd() #獲取當前文件所在目錄
newPath = os.path.join(path, "Computer")
if not os.path.isdir(newPath):
    os.mkdir(newPath) #新建文件夾
destFile = unicode(newPath + "/題目.docx","utf-8) #存爲word也能夠，不事後續用office編輯後，保存的時候總須要另存爲；用unicode()後，文件名取中文名不會變成亂碼

#最多見的模擬瀏覽器，假裝headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'
}


def downLoadHtml(url):
    """Fetch *url* and return its body re-encoded to the default encoding.

    The raw response bytes are decoded with the charset guessed by chardet
    and re-encoded with ``sys.getdefaultencoding()``. If that transcoding
    fails, the raw response bytes are returned unchanged.
    """
    response = requests.get(url, headers=headers)
    content = response.content

    # chardet reports {"encoding": None} when it cannot guess, so a default
    # passed to .get() would never be used; fall back explicitly instead.
    detected = chardet.detect(content).get("encoding") or "utf-8"

    # Pages detected as GB2312 frequently contain characters that only exist
    # in its supersets (GBK / GB18030); decoding them strictly as GB2312
    # fails. This is a likely cause of the few pages that still came out
    # wrong, so decode with the superset instead.
    if detected.lower() == "gb2312":
        detected = "gb18030"

    try:
        return content.decode(detected).encode(sys.getdefaultencoding())
    except (UnicodeError, LookupError):
        # Undecodable bytes or an unknown codec name: hand back the raw body.
        return content


def parseHtml(url):
    # print url, "now"
    content = downLoadHtml(url)
    contentEn = chardet.detect(content).get("encoding", "utf-8")
    soup = BeautifulSoup(content, "html.parser")  # soup.name  [document] BeautifulSoup 對象表示的是一個文檔的所有內容
    # 查找下一頁url
    theUL = soup.find("ul", {"class": "con_updown"})
    theLi = theUL.find("li")
    href = theLi.find("a").get("href")
    preUrl = None
    if href:
        print href, "next"
        preUrl = href

    # 查找所需內容
    topics = []
    try:
        divCon = soup.find("div", attrs={"class": "con_nr"})
        if divCon:
            subjects = divCon.find_all("p")  # __len__屬性不是整數，而是：method-wrapper '__len__' of ResultSet object
            index = 0 #藉助index標識查找第幾個，還有別的方式？
            for res in subjects:
                #跳過不想要的導讀行內容
                if index == 0 and res.string == "【導讀】":
                    index = 1  # 跳出循環也要加1
                    continue  # 跳過 導讀
                topic = res.string  # res有子標籤及文本，就會返回None
                if topic:
                    #按須要，只留下純文本，保存到文件
                    try:
                        parsed = topic.decode(contentEn).encode("utf8")
                    except Exception:
                        topics.append("本頁面解碼有誤，請自行查看: " + url + "\n")  # '%d' %index str(index) 數字轉字符串
                        break
                    else:
                        topics.append(parsed + "\n")
                index = index + 1
            topics.append("\n")
        else:
            topics.append("本頁面查找試題有誤，請自行查看: " + url + "\n")
    except Exception:
        topics.append("本頁面解析有誤，請自行查看: " + url + "\n")

    fp = open(destFile, 'a')  # a追加寫
    fp.writelines(topics)
    fp.close()
    return preUrl


#執行.py文件的入口
if __name__ == '__main__':
    i = 0 #記錄處理了多少頁面
    next = "http://xxxxx/1.html" #起始頁面
    print "start time:", time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) #打印時間，看跑了多久
    print next, "start"
    while next and i < 1000:
        next = parseHtml(next)
        i = i + 1
        #sTime = random.randint(3, 8) #隨機整數 [3,8)
        #time.sleep(sTime)  # 休息：防反爬
    print "end time:", time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    print "i =", i, "url:", next
    fp = open(destFile, 'a')  # a追加寫
    fp.writelines(["lastPage：" + str(next) + "\n", "total:" + str(i) + "\n"])  # None及數字：沒法和字符串用 + 拼接
    fp.close()

抓取博客內容，未完待續……

#encoding=utf8
import sys,requests,chardet
from bs4 import BeautifulSoup
reload(sys)
sys.setdefaultencoding("utf8")

url = "http://www.cnblogs.com/"
agent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"
headers={'User-Agent': agent}
data={'user': '', 'pass': ''}
syscode = sys.getdefaultencoding()
print syscode
titles = []
def getHtml(url):
    """POST to *url* and return the body re-encoded to ``syscode``.

    Returns None when *url* is empty or the response status is not 200.
    If the body cannot be transcoded, the raw response bytes are returned.
    """
    if not url:
        return None

    response = requests.post(url, headers=headers, data=data)
    if response.status_code != 200:
        return None

    content = response.content
    # chardet reports {"encoding": None} when it cannot guess, so a default
    # passed to .get() would never be used; fall back explicitly instead.
    contentEn = chardet.detect(content).get("encoding") or "utf-8"
    try:
        return content.decode(contentEn).encode(syscode)
    except (UnicodeError, LookupError):
        # Undecodable bytes or an unknown codec name: hand back the raw body.
        return content


def parseHtml(html):
    """Append the href of every link inside ``div.catListTag`` to ``titles``.

    Does nothing when *html* is empty or the page has no such div.
    """
    if not html:
        return

    soup = BeautifulSoup(html, "html.parser")
    container = soup.find("div", attrs={"class": "catListTag"})
    if container is None:
        return

    for tag in container.find_all("a"):
        href = tag.get("href")
        if href:
            # titles is a list, so append() is required (it has no add()).
            titles.append(href)

def getWords():
    """Print the top 100 jieba keywords of the collected ``titles``.

    Each output line is the keyword, four spaces, and the keyword's weight
    multiplied by 1000 and truncated to an integer.
    """
    # Imported here because the script never imports jieba at module level.
    import jieba.analyse

    if not titles:
        return

    strs = "".join(titles)
    tags = jieba.analyse.extract_tags(strs, topK=100, withWeight=True)
    for item in tags:
        print(item[0] + "    " + str(int(item[1] * 1000)))


if __name__ == '__main__':
    # Fetch the page, collect the link targets, then print their keywords.
    html = getHtml(url)
    parseHtml(html)
    getWords()  # must be called; the bare name alone does nothing

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。