Python 爬蟲簡單案例

時間 2019-12-08

原文鏈接

import urllib.request;
import re;

def getHtmlCode(url):
    """Fetch the page at ``url`` and return its content as text.

    The response is closed as soon as the body has been read. The body is
    decoded with the charset declared in the response's Content-Type
    header; when no charset is declared, or the declared name is not a
    codec Python knows, UTF-8 is used instead.

    Args:
        url: Address of the page to fetch (any scheme ``urllib`` can open).

    Returns:
        The decoded page content as a ``str``.

    Raises:
        urllib.error.URLError: If the page cannot be opened.
        UnicodeDecodeError: If the body is not valid in the chosen charset.
    """
    # The context manager guarantees the connection is released even when
    # read() raises part-way through.
    with urllib.request.urlopen(url) as page:
        raw = page.read()
        charset = page.headers.get_content_charset() or 'utf-8'
    try:
        return raw.decode(charset)
    except LookupError:
        # The server declared a charset name that is not a known codec.
        return raw.decode('utf-8')

# Example: save the fetched page to a local file.
# getHtmlCode already returns decoded text, so no further .decode() is needed.
# htmlCon = getHtmlCode("https://tieba.baidu.com/p/1753935195")
# pageFile = open("xh.txt", 'w')
# pageFile.write(htmlCon)
# pageFile.close()

def getImg(htmlCode):
    """Download every ``.jpg`` image referenced in ``htmlCode``.

    An image is any ``src="....jpg"`` attribute immediately followed by a
    ``width`` attribute. Matches are saved in the current working
    directory as ``0.jpg``, ``1.jpg``, ... in the order they appear.

    Args:
        htmlCode: The page's HTML source as text.
    """
    pattern = re.compile(r'src="(.+?\.jpg)" width')
    for index, imgUrl in enumerate(pattern.findall(htmlCode)):
        # The running index becomes the local file name.
        urllib.request.urlretrieve(imgUrl, '%s.jpg' % index)

# Example: fetch the default page directly (the result is already decoded text).
# htmlCode = getHtmlCode("https://tieba.baidu.com/p/1753935195")

# Interactive entry point: ask for a page URL, fetch the page, then download
# every image found on it into the current directory.
print('---------網頁圖片抓取------------')
print('請輸入url:')
url = input()
if not url:
    # Empty input: tell the user and fall back to the built-in sample thread.
    print('---------沒有輸入地址，使用默認地址。--------')
    url = "https://tieba.baidu.com/p/1753935195"

print('-------正在抓取網頁----------')
htmlCode = getHtmlCode(url)

print('-------正在下載圖片---------')
getImg(htmlCode)

print('-------下載圖片完成-------')
# Keep the console window open until the user acknowledges.
input('Press Enter to exit')
print('hello world')

學習來源：https://www.cnblogs.com/Axi8/p/5757270.html 貼吧圖片爬取html

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。