目標:豆瓣讀書首頁(HTML 頁面)。
用 Python 下載頁面上的書籍圖片。
import re  # regular expressions
import urllib.request

# One <img> tag whose src is an http(s) URL ending in ".jpg" and which also
# carries a non-empty alt text.  [^"] / [^>] keep the match inside a single
# attribute / a single tag, so an image without alt can never borrow the alt
# text of a later image, and the escaped dot rejects names like "xjpg".
_IMG_PATTERN = r'(img src="http[^"]+?\.jpg")([^>]*?\s)(alt="[^"]+")'

# Characters that are not allowed in a Windows file name (the save path used
# by downLoad is a Windows path).
_BAD_NAME_CHARS = r'[\\/:*?"<>|]'


def getJpg(date):
    """Find the images of a page that have both a .jpg src and an alt text.

    Args:
        date: HTML text of the page.

    Returns:
        A list with one 3-tuple of strings per matching <img> tag: the
        ``img src="..."`` text, the text between the src and alt attributes,
        and the ``alt="..."`` text.
    """
    return re.findall(_IMG_PATTERN, date)


def _parseImage(jpginfo):
    """Return ``(url, title)`` taken from one tuple produced by getJpg."""
    srcAttr, _gap, altAttr = jpginfo
    # Both attribute texts have a fixed prefix and end with the closing quote,
    # so plain slicing is enough (no second regex pass over repr() output).
    return srcAttr[len('img src="'):-1], altAttr[len('alt="'):-1]


def downLoad(jpgUrl, sTitle, n):
    """Download one image and save it as ``<sTitle>.jpg``.

    This is best effort: a failed download is printed and does not stop the
    run.  The completion message is printed in either case.

    Args:
        jpgUrl: URL of the image.
        sTitle: Book title, used as the file name.
        n: Running number of the image; only used in the progress message.
    """
    # A title may contain characters that are invalid in a file name.
    fileName = re.sub(_BAD_NAME_CHARS, '_', sTitle)
    try:
        urllib.request.urlretrieve(
            jpgUrl,
            'C:\\Users\\74172\\source\\repos\\Python\\spidertest1\\images\\book.douban\\%s.jpg' % fileName)
    except Exception as e:
        # Deliberately broad: one bad image must not abort the whole crawl.
        print(e)
    finally:
        print('圖片%s下載操做完成' % n)


def getTitle(date):
    """Return every match of ``title="`` + one character + ``">`` in the page.

    NOTE(review): the pattern only matches one-character titles and the
    script itself does not use the result; the titles that are actually used
    come from the alt text found by getJpg.  Kept unchanged for compatibility.
    """
    titleList = re.findall(r'title=".">', date)
    return titleList


if __name__ == '__main__':
    url = 'https://book.douban.com/'
    # The context manager closes the HTTP response once the page is read.
    with urllib.request.urlopen(url) as res:
        date = res.read().decode('utf-8')
    for n, jpginfo in enumerate(getJpg(date), start=1):
        jpgUrl, sTitle = _parseImage(jpginfo)
        print(n, '--- url -->', jpgUrl)
        downLoad(jpgUrl, sTitle, n)
又做了點修改:仍然用正則表達式解析頁面,並將書名寫入 txt 文件中。
import re  # regular expressions
import urllib.request

PAGE_URL = 'https://book.douban.com/'

# Title list file: PAGE_URL without its leading "https://" (8 characters) and
# its trailing ".com/" (5 characters), i.e. "book.douban.txt".
TITLE_FILE = PAGE_URL[8:-5] + '.txt'

# One <img> tag whose src is an http(s) URL ending in ".jpg" and which also
# carries a non-empty alt text.  Group 1 is the URL, group 2 the alt text.
# [^"] / [^>] keep the match inside one attribute / one tag, so an image
# without alt can never borrow the alt text of a later image.
_IMG_RE = re.compile(r'img src="(http[^"]+?\.jpg)"[^>]*?\salt="([^"]+)"')

# Characters that are not allowed in a Windows file name (the save path used
# by downLoad is a Windows path).
_BAD_NAME_CHARS = r'[\\/:*?"<>|]'


def _findImages(html):
    """Return a list of ``(url, title)`` pairs, one per matching <img> tag."""
    return _IMG_RE.findall(html)


def getJpg(html):
    """Return the .jpg URLs of all images in the page that have an alt text.

    The list is index-aligned with the list returned by getTitle.

    Args:
        html: HTML text of the page.
    """
    return [jpgUrl for jpgUrl, _title in _findImages(html)]


def downLoad(jpgUrl, sTitle, n):
    """Download one image and save it as ``<sTitle>.jpg``.

    The completion message is printed whether or not the download succeeded;
    a download error is not caught here and propagates to the caller.

    Args:
        jpgUrl: URL of the image.
        sTitle: Book title, used as the file name and in the message.
        n: Running number of the image (not used by this version).
    """
    # A title may contain characters that are invalid in a file name.
    fileName = re.sub(_BAD_NAME_CHARS, '_', sTitle)
    try:
        urllib.request.urlretrieve(
            jpgUrl,
            'C:/Users/74172/source/repos/Python/spidertest1/images/book.douban/%s.jpg' % fileName)
    finally:
        print('圖片---%s----下載操做完成' % sTitle)


def getTitle(html):
    """Return the alt texts of the images found by getJpg, in the same order.

    Each title is wrapped in double quotes (``'"Title"'``) because existing
    callers strip them with ``[1:-1]``.

    Args:
        html: HTML text of the page.
    """
    return ['"%s"' % title for _jpgUrl, title in _findImages(html)]


def writeTxt(imageTitle, fileName=TITLE_FILE):
    """Append one title, followed by a newline, to a UTF-8 text file.

    Args:
        imageTitle: The title to write.
        fileName: Path of the text file; it is created if it does not exist.
    """
    # "with" closes the file even when the write fails, and does not touch an
    # unbound variable when open() itself fails.
    with open(fileName, "a", encoding="utf-8") as f:
        f.write(imageTitle + '\n')


if __name__ == '__main__':
    # The context manager closes the HTTP response once the page is read.
    with urllib.request.urlopen(PAGE_URL) as res:
        html = res.read().decode('utf-8')
    # URL and title come from the same match, so they cannot get out of step.
    for n, (urlJpg, sTitle) in enumerate(_findImages(html)):
        print(n, '--- url -->', urlJpg)
        downLoad(urlJpg, sTitle, n)
        writeTxt(sTitle)