"""Download every ``.jpg`` image referenced by a web page.

Run as a script, it prompts for a page URL (falling back to a default
thread when nothing is entered), fetches the page, and saves each matched
image as ``0.jpg``, ``1.jpg``, ... in the current working directory.
"""
import os
import re
import urllib.parse
import urllib.request

# Page scraped when the user just presses Enter at the prompt.
DEFAULT_URL = "https://tieba.baidu.com/p/1753935195"

# Encoding assumed when the response does not declare a charset.
DEFAULT_ENCODING = 'utf-8'

# Captures the address of a .jpg whose src attribute is immediately
# followed by a width attribute. Compiled once at import time.
IMG_PATTERN = re.compile(r'src="(.+?\.jpg)" width')


def getHtmlCode(url):
    """Fetch the page at *url* and return its body as text.

    Args:
        url: Address accepted by ``urllib.request.urlopen``.

    Returns:
        The response body decoded to ``str``. The charset declared in the
        response's Content-Type header is used when present; otherwise
        (or when that charset name is unknown to Python) UTF-8 is used.

    Raises:
        urllib.error.URLError: If the page cannot be opened.
        UnicodeDecodeError: If the body is not valid in the chosen charset.
    """
    # The context manager closes the response even if read() raises.
    with urllib.request.urlopen(url) as page:
        rawBody = page.read()
        charset = page.headers.get_content_charset() or DEFAULT_ENCODING
    try:
        return rawBody.decode(charset)
    except LookupError:
        # The server declared a charset name Python does not recognise.
        return rawBody.decode(DEFAULT_ENCODING)


def getImg(htmlCode, saveDir='', baseUrl=None):
    """Download every image matched by ``IMG_PATTERN`` in *htmlCode*.

    Images are saved as ``0.jpg``, ``1.jpg``, ... in match order.

    Args:
        htmlCode: HTML text to scan.
        saveDir: Directory to write into; it is created if missing. The
            default empty string means the current working directory.
        baseUrl: Optional URL of the page the HTML came from. When given,
            each matched address is resolved against it, so relative and
            protocol-relative ``src`` values can be downloaded. Absolute
            addresses are left unchanged.

    Returns:
        List of the file paths written, in download order.

    Raises:
        urllib.error.URLError, ValueError: If an image cannot be fetched.
            The first failure aborts the remaining downloads.
    """
    if saveDir:
        os.makedirs(saveDir, exist_ok=True)

    savedPaths = []
    for index, img in enumerate(IMG_PATTERN.findall(htmlCode)):
        if baseUrl:
            img = urllib.parse.urljoin(baseUrl, img)
        # With an empty saveDir, os.path.join yields the bare file name.
        target = os.path.join(saveDir, '%s.jpg' % index)
        urllib.request.urlretrieve(img, target)
        savedPaths.append(target)
    return savedPaths


def main():
    """Prompt for a URL, fetch the page, and download its images."""
    print('---------網頁圖片抓取------------')
    print('請輸入url:')
    url = input()
    if not url:
        print('---------沒有輸入地址,使用默認地址。--------')
        url = DEFAULT_URL

    print('-------正在抓取網頁----------')
    htmlCode = getHtmlCode(url)

    print('-------正在下載圖片---------')
    getImg(htmlCode, baseUrl=url)

    print('-------下載圖片完成-------')
    # Keeps the console window open when launched by double-click.
    input('Press Enter to exit')
    print('hello world')


if __name__ == '__main__':
    main()
# 學習來源:https://www.cnblogs.com/Axi8/p/5757270.html 貼吧圖片爬取html