python爬蟲之淘寶寶貝圖片抓取

  寫在前面的話:家裏有人開淘寶店,作爲一個小的淘寶店主,經常要做的就是從別人的店鋪(當然是批發商)把圖片一張一張存下來,然後自己再用 PS 做得好看一點,再上架。這樣存圖挺煩人的,剛好最近在學習 Python,發現用它來做這件事真的非常方便。

  總的來說,其實也並沒有什麼技術含量,只是熟悉一下 Python 語言和正則表達式的使用。

  主要步驟:

  一、當然是抓取頁面的 html 代碼

 1 import urllib
 2 import urllib2
 3 
 4 #獲取html代碼
 5 def getHtml(url):
 6     request = urllib2.Request(url , headers = headers)
 7     try:
 8         response = urllib2.urlopen(request)
 9         html = response.read()
10         return html
11     except urllib2.URLError,e:
12         print e.reason

 

  二、分析頁面中的詳情圖片部分和主圖部分

   淘寶的 html 頁面相當整齊,可讀性不錯。很快就可以找到他們的描述頁位置:descUrl  .. location.protocol = 'http:.......'

可以寫一個正則表達式,把它提取出來

1 import re
2 
def descUrl(html):
    """Return the description-page addresses found in *html*.

    Each match is the text captured after ``'//`` in the page's ``descUrl``
    expression, i.e. the address without a scheme. Matching is
    case-insensitive. An empty list is returned when nothing matches.
    """
    pattern = re.compile(
        r"descUrl.*?location.protocol==='http:' \? '//(.*?)'.?:", re.I)
    return pattern.findall(html)

    再獲取這個詳情頁地址,就可以提取出所有的圖片地址了。

def getImglist(html):
    """Return the value of every ``src="..."`` attribute in *html*.

    Matching is case-insensitive, so ``SRC="..."`` is found as well.
    Values are returned in document order.
    """
    pattern = re.compile(r'src=\"(.*?)\"', re.I)
    return pattern.findall(html)

 

  三、下載圖片

      獲取到了圖片的 url 後,當然就是把圖片下載下來,這裏做一個保存到指定路徑的方法。

所以再加一個創建路徑的方法:

def createDir(path):
    """Make sure directory *path* exists, creating parents as needed.

    Does nothing when the directory is already there.

    Raises:
        OSError: if *path* cannot be created, e.g. because a regular file
            with that name already exists.
    """
    # Try first and inspect the failure afterwards: this avoids the race
    # between an exists() check and the actual creation.
    try:
        os.makedirs(path)
    except OSError:
        if not os.path.isdir(path):
            raise

  保存圖片

 1 #保存全部圖片
 2 def saveImgTo(imglist , path):
 3     createDir(path)
 4     imgIndex = 1
 5     for imgurl in imglist:
 6         splist = imgurl.split('.')
 7         filetype = splist[len(splist)-1]
 8         print "saving " + imgurl
 9         try:
10             urllib.urlretrieve(imgurl , path + "/"+ str(imgIndex) + '.' + filetype )
11             imgIndex += 1
12             print "==> ok!"
13         except:
14             print "==> err!!!!!!"

 

  以下爲一份完整代碼:傳入存儲路徑,即可保存 url.txt 中所有 url 對應的淘寶或其他網頁的圖片。新手上路,寫得不好的地方請輕拍:

  1 #coding=utf-8
  2 
  3 import re
  4 import urllib
  5 import urllib2
  6 import cookielib
  7 import StringIO, gzip
  8 import os
  9 import sys
 10 
 11 headers = {
 12         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36'
 13 }
 14 
 15 
 16 #解壓gzip  
 17 def gzdecode(data) :  
 18     compressedstream = StringIO.StringIO(data)  
 19     gziper = gzip.GzipFile(fileobj=compressedstream)    
 20     data2 = gziper.read()   # 讀取解壓縮後數據   
 21     return data2 
 22     
 23 #獲取html代碼
 24 def getHtml(url):
 25     request = urllib2.Request(url , headers = headers)
 26     try:
 27         response = urllib2.urlopen(request)
 28         html = response.read()
 29         return html
 30     except urllib2.URLError,e:
 31         print e.reason
 32 
 33 #目錄是否存在,不存在則建立
 34 def createDir(path):
 35     if not os.path.exists(path):
 36         os.makedirs(path)
 37     else:
 38         if os.path.isfile(path):
 39             os.mkdir(path)
 40 
 41 #提取描述url
 42 def descUrl(html):
 43     reg = r"descUrl.*?location.protocol==='http:' \? '//(.*?)'.?:"
 44     desurlre = re.compile(reg,re.I)
 45     desurl = re.findall(desurlre , html)
 46     return desurl
 47 
 48 #提取全部圖片
 49 def getImglist(html):
 50     reg = r'src=\"(.*?)\"'
 51     imgre = re.compile(reg,re.I)
 52     imglist = re.findall(imgre , html)
 53     return imglist
 54 #提取主圖
 55 def getTitleImg(html, path):
 56     createDir(path)
 57     reg = r'auctionImages.*?\[(.*?)\]'
 58     imgre = re.compile(reg,re.I)
 59     titleImg = re.findall(imgre , html)
 60     titleImg = titleImg[0]
 61     imglist = titleImg.split(',')
 62     titleIndex = 1
 63     for imgurl in imglist:
 64         print "img ==== >  " + imgurl
 65         imgurl = imgurl.strip('"')
 66         imgurl = 'http:' + imgurl
 67         print imgurl
 68         splist = imgurl.split('.')
 69         filetype = splist[len(splist)-1]
 70         try:
 71                 urllib.urlretrieve(imgurl , path + "/title"+ str(titleIndex) + '.' + filetype )
 72                 titleIndex += 1
 73                 print "==> ok!"
 74         except:
 75                print "==> err!!!!!!"
 76 
 77 #保存全部圖片
 78 def saveImgTo(imglist , path):
 79     createDir(path)
 80     imgIndex = 1
 81     for imgurl in imglist:
 82         splist = imgurl.split('.')
 83         filetype = splist[len(splist)-1]
 84         print "saving " + imgurl
 85         try:
 86             urllib.urlretrieve(imgurl , path + "/"+ str(imgIndex) + '.' + filetype )
 87             imgIndex += 1
 88             print "==> ok!"
 89         except:
 90             print "==> err!!!!!!"
 91 
 92 #從一個淘寶頁面,獲得詳情圖片
 93 def getTaoBaoImg(url ,savePath):
 94     html = getHtml(url)
 95     getTitleImg(html , savePath)
 96     desurl = descUrl(html)
 97     desurl = "http://" + desurl[0]
 98     print "desurl = " +  desurl
 99     print "----------------------------------------------------------"
100     #獲得淘貝詳情html
101     desHtml = getHtml(desurl)
102     imglist = getImglist(desHtml)
103     saveImgTo(imglist , savePath)
104 #-------------------------------------我是華麗的分界線 begin Other-----------------------------------------
def getOtherImgurllist(html):
    """Return the value of every ``src="..."`` attribute in *html*.

    Matching is case-sensitive, and an attribute value may span several
    lines because the pattern is compiled with ``re.S``.
    """
    pattern = re.compile(r'src="(.*?)"', re.S)
    return pattern.findall(html)
111     
112 
def getOtherImg(url, savePath):
    """Save every image referenced by a non-Taobao page.

    Fetches *url*, collects all ``src`` attribute values and downloads them
    into *savePath*. Returns without raising when the page cannot be
    fetched.
    """
    html = getHtml(url)
    if html is None:
        # getHtml has already printed the failure reason; passing None on
        # to the regex search would raise TypeError.
        return
    saveImgTo(getOtherImgurllist(html), savePath)
118 
def getOthertitleImg(html, savePath):
    """Placeholder for title-image extraction on non-Taobao pages.

    Not implemented: it only prints a marker and ignores both arguments.
    """
    print("todo:")
122 
123 #-------------------------------------我是華麗的分界線 end Other-----------------------------------------
124     
def saveUrl(url, savePath):
    """Write ``url.htm`` into *savePath*: a page that redirects to *url*.

    The file is a minimal HTML document whose meta-refresh tag points to
    the original address, so the source page can be reopened from the
    download folder. *savePath* must already exist.
    """
    page = """<html>
<head>
<meta http-equiv="Content-Language" content="zh-CN">
<meta HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=gb2312">
<meta http-equiv="refresh" content="0.1;url=""" + url + """\">
<title></title>
</head>
<body>
</body>
</html>"""
    # The with-block closes the file even if the write fails.
    with open(savePath + "/url.htm", "w") as output:
        output.write(page)
139 
140     
# Default output directory; replaced by the first command-line argument.
savepath = "img"

# Read the page addresses, one per line. splitlines() copes with both
# "\n" and "\r\n" endings, and the with-block closes the file.
with open('url.txt', 'r') as url_file:
    urls = url_file.read().splitlines()
print(urls)

if len(sys.argv) > 1 and sys.argv[1]:
    savepath = sys.argv[1]

print(savepath)

# Each accepted address gets its own numbered sub-directory of savepath.
urlIndex = 1
for url in urls:
    url = url.strip()
    # Skip blank lines and anything too short to be an address.
    if len(url) < 10:
        continue
    urlSavePath = savepath + '/' + str(urlIndex)
    createDir(urlSavePath)
    saveUrl(url, urlSavePath)
    print('*' * 50)
    print(url)
    if url.find('taobao') != -1:
        getTaoBaoImg(url, urlSavePath)
    else:
        getOtherImg(url, urlSavePath)
    urlIndex += 1

print("success!")
相關文章
相關標籤/搜索