Python 簡單的爬蟲

時間 2019-12-10

標籤：python、簡單爬蟲　欄目：Python　简体版

原文：原文鏈接

import urllib.request
import re
import ssl  # 處理https請求
import time
import os  # 建立目錄用


def get_html(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    The response is opened in a ``with`` block so that it is closed as soon
    as the body has been read, even if reading raises.

    Args:
        url: Address to fetch; passed unchanged to ``urllib.request.urlopen``.

    Returns:
        The response body as ``str``.

    Raises:
        UnicodeDecodeError: If the body is not valid UTF-8.
        Any exception raised by ``urllib.request.urlopen`` is propagated.
    """
    with urllib.request.urlopen(url) as page:
        html = page.read()  # raw body, still <class 'bytes'>
    return html.decode('utf-8')  # bytes -> <class 'str'>


# Pattern that captures the shortest text ending in ".jpg" found between
# 'src="' and '" width'. Written as a raw string so the "\." escape reaches
# the regex engine without Python warning about an invalid string escape.
reg = r'src="(.+?\.jpg)" width'
reg_img = re.compile(reg)  # compile once up front
# NOTE(review): this replaces urllib's default HTTPS context factory with the
# unverified one, i.e. certificate checks are turned off for later urllib
# HTTPS requests in this process. The page URL below is plain http; confirm
# whether the override is still required for the image URLs.
ssl._create_default_https_context = ssl._create_unverified_context
imglist = reg_img.findall(get_html('http://tieba.baidu.com/p/1753935195'))  # every captured image URL


def mkdir(path):
    """Create the directory *path*, including missing parents, if absent.

    Surrounding whitespace and trailing backslashes are removed from *path*
    before use. A status message is printed in either case.

    Args:
        path: Directory path to create.

    Returns:
        True if the directory was created by this call, False if the path
        already existed.
    """
    # Normalise: drop surrounding whitespace, then any trailing "\" characters.
    path = path.strip().rstrip("\\")

    # Attempt the creation directly instead of testing os.path.exists first:
    # a separate check leaves a window in which another process can create
    # the path and make os.makedirs raise FileExistsError.
    try:
        os.makedirs(path)
    except FileExistsError:
        # Already present: do not create, just report it.
        print(path + ' 目錄已存在')
        return False
    print(path + ' 建立成功')
    return True


# Directory that receives the downloaded images.
mkpath = "picture"
# Make sure it exists; mkdir returns True when it had to create the directory
# and False when the path was already there.
picture = mkdir(mkpath)

# Fetch every matched image, naming each file after the current timestamp.
x = 0  # running count of images fetched
for img in imglist:
    urllib.request.urlretrieve(img, '%s/%s.jpg' % (mkpath, time.time()))
    x = x + 1

print("圖片下載完成")

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。