Getting Started with Python - Writing a Crawler to Scrape Images from a Website - Regular Expressions

// Life is too short, so I use Python!

// Python has really been an eye-opener for this country kid who has only ever used C++!

 

This is just a test; it successfully scraped more than 1,000 images from a certain website.

Next step: make some big news, a bigger project.

 

 

 

#! /usr/bin/env python
# -*- coding: utf-8 -*-

import urllib
import urllib2
import re
import os

CNT = 0  # running total of downloaded images


def getHtml(url):
    # Full browser-style headers (adapted from pythontab.com); kept for reference,
    # but req_header_2 below is what actually gets sent.
    req_header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Accept': 'text/html;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding': 'gzip',
        'Connection': 'close',
        'Referer': None  # if the site still refuses the request, set this to the target site's host
    }
    req_header_2 = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:28.0) Gecko/20100101 Firefox/28.0'
    }

    req_timeout = 5
    #status = urllib.urlopen(url).code
    #print status
    #if status != 200:
    #    print 'Http Error %s' % status
    #    return False

    req = urllib2.Request(url, None, req_header_2)
    resp = urllib2.urlopen(req, None, req_timeout)
    html = resp.read()
    return html


def getAllUrl(html):
    # Collect the href of every <a ... target=...> link on an index page
    reg = r'<a href="(.+)" target='
    theurl = re.compile(reg)
    url = re.findall(theurl, html)
    return url


def getNext(html):
    # Extract the id of the next page from a link of the form <a href='...pai/<id>.html'>
    reg = r"<a href='.+pai/(.+).html'"
    nxtre = re.compile(reg)
    nxt = re.findall(nxtre, html)
    return nxt[0]


def getName(html):
    # Use the page <title> as the name of the download folder
    reg = r'<title>(.+)</title>'
    nare = re.compile(reg)
    name = re.findall(nare, html)
    return name[0]


def getImg(name, html):
    global CNT

    # The image URLs appear in several slightly different markups,
    # so try each pattern and merge the matches.
    reg = r'<img src="(.{0,80}\.jpg)" border="0"'
    imgre = re.compile(reg)
    imglist = re.findall(imgre, html)

    reg = r'src="(.{0,80}\.jpeg)" border'
    imgre = re.compile(reg)
    imglist.extend(re.findall(imgre, html))

    reg = r"<img src='(.{0,80}\.jpg)'"
    imgre = re.compile(reg)
    imglist.extend(re.findall(imgre, html))

    reg = r"<img src='(.{0,80}\.jpeg)'"
    imgre = re.compile(reg)
    imglist.extend(re.findall(imgre, html))

    # Save into a folder named "<title>-[<count>p]" under the current directory
    local = '.\\%s-[%sp]' % (name, len(imglist))
    if os.path.exists(unicode(local, 'utf-8')):
        return unicode(local, 'utf-8') + u' already exists'

    os.mkdir(unicode(local, 'utf-8'))

    x = 0
    for imgurl in imglist:
        print imgurl
        urllib.urlretrieve(imgurl, unicode(local + '\\%s.jpg' % x, 'utf-8'))
        x += 1
        CNT += 1

    return unicode('%s: got %s photo(s)' % (name, x), 'utf-8')


def getAll(num):
    # Walk backwards from a known page id, downloading images from each page
    global CNT
    nxt = 164680
    while num > 0:
        url = '---%s.html' % nxt  # page URL prefix redacted in the original post
        print nxt
        html = getHtml(url)
        nxt -= 1
        num -= 1
        if html == False:
            print 'Error'
            continue

        print getImg(getName(html), html)

    return 'done! %s photos!' % str(CNT)


def getAll_update(index):
    # Download images from every page linked on an index page
    global CNT
    num = CNT
    urls = getAllUrl(getHtml(index))

    for url in urls:
        html = getHtml('---' + url)  # URL prefix redacted
        print getImg(getName(html), html)
    return 'done! %s photos!' % str(CNT - num)


#print getAll(10)
#html = getHtml('---')
#print getNext(html)

x = 3
while x < 50:
    print getAll_update('---' % x)  # index URL redacted; presumably it contains a format placeholder for x
    x += 1

#print getAll_update('---')
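As an aside, the four jpg/jpeg patterns in getImg could probably be folded into one expression that accepts either quote style and either extension. A rough, untested consolidation (my own sketch, not what the script above actually runs) might look like:

import re

# Match <img src="...jpg">, <img src='...jpeg'>, etc., capturing only the URL
IMG_RE = re.compile(r'''<img\s+src=["'](.{0,80}?\.(?:jpg|jpeg))["']''', re.IGNORECASE)

def get_img_urls(html):
    return IMG_RE.findall(html)

It trades a little precision (the originals also anchor on the border attribute) for having a single pattern to maintain.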

 

The header is there to disguise the crawler as a browser.
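For reference, the core of that trick is just passing a headers dict when building the request; a minimal standalone sketch (the URL below is a placeholder) looks like this:

import urllib2

def fetch(url, timeout=5):
    # Pretend to be Firefox so the server serves the normal page instead of blocking the script
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:28.0) Gecko/20100101 Firefox/28.0'}
    req = urllib2.Request(url, None, headers)
    return urllib2.urlopen(req, timeout=timeout).read()

#print fetch('http://example.com')[:200]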

Regular expressions: http://www.cnblogs.com/huxi/archive/2010/07/04/1771073.html    // I'm only just learning these myself
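One pitfall that tutorial covers and that is easy to hit here: (.+) is greedy, so if a single line of HTML contains two links, the pattern used in getAllUrl swallows everything up to the last " target=, while the non-greedy (.+?) stops at the first. A quick demonstration with made-up HTML:

import re

html = '<a href="a.html" target="_blank">A</a> <a href="b.html" target="_blank">B</a>'
print re.findall(r'<a href="(.+)" target=', html)   # greedy: one long bogus match
print re.findall(r'<a href="(.+?)" target=', html)  # non-greedy: ['a.html', 'b.html']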

Basically I wrote the whole thing by searching Baidu as I went.
