# -*- coding: UTF-8 -*-
# First crawler: download the images posted on one Baidu Tieba thread page.
import os
import re
import urllib
from contextlib import closing

try:
    import urllib2
except ImportError:
    # Python 3 merged urllib2 and urllib.urlretrieve into urllib.request.
    import urllib.request as urllib2
    _urlretrieve = urllib2.urlretrieve
else:
    _urlretrieve = urllib.urlretrieve

# Thread page scraped when this file is run as a script.
PAGE_URL = "http://tieba.baidu.com/p/4304902334"
# Directory the numbered JPEG files are written to.
IMAGE_DIR = './img'
# Captures the src URL of every <img class="BDE_Image" ...> tag ending in .jpg.
_IMAGE_SRC_RE = re.compile(r'class="BDE_Image" src="(.+?\.jpg)" size')


def _as_text(raw):
    """Return *raw* as the native ``str`` type so the ``str`` pattern can search it.

    On Python 2 the body returned by ``read()`` already is ``str`` and is
    passed through untouched; on Python 3 it is ``bytes`` and gets decoded.
    """
    if isinstance(raw, bytes) and not isinstance(raw, str):
        # Assumes the page is UTF-8 -- TODO confirm. Undecodable bytes are
        # replaced instead of raising so a stray byte cannot abort the run.
        return raw.decode("utf-8", "replace")
    return raw


def download_page_images(page_url=PAGE_URL, image_dir=IMAGE_DIR):
    """Download every ``BDE_Image`` JPEG referenced by *page_url*.

    The matched URLs are printed, then saved as ``<image_dir>/0.jpg``,
    ``<image_dir>/1.jpg``, ... in the order they appear in the page.

    Args:
        page_url: Address of the page to scan.
        image_dir: Directory to save into; created when it does not exist.

    Returns:
        The list of file paths that were written, in download order.

    Raises:
        Whatever ``urlopen`` / ``urlretrieve`` raise when a request fails.
    """
    # closing() releases the connection even if read() fails.
    with closing(urllib2.urlopen(page_url)) as response:
        html = _as_text(response.read())

    image_urls = _IMAGE_SRC_RE.findall(html)
    print(image_urls)

    # urlretrieve cannot create missing directories on its own.
    if image_urls and not os.path.isdir(image_dir):
        os.makedirs(image_dir)

    saved_paths = []
    for index, image_url in enumerate(image_urls):
        target = '%s/%s.jpg' % (image_dir, index)
        _urlretrieve(image_url, target)
        saved_paths.append(target)
    return saved_paths


if __name__ == "__main__":
    download_page_images()
# coding:utf-8
import os
import random
import re
import string
import urllib
from contextlib import closing

try:
    import urllib2
except ImportError:
    # Python 3 merged urllib2 and urllib.urlretrieve into urllib.request.
    import urllib.request as urllib2
    _urlretrieve = urllib2.urlretrieve
else:
    _urlretrieve = urllib.urlretrieve

# Captures the src URL of every <img class="BDE_Image" ...> tag ending in .jpg.
_TIEBA_IMAGE_RE = re.compile(r'class="BDE_Image" src="(.+?\.jpg)" size')


def _as_text(raw):
    """Return *raw* as the native ``str`` type so the ``str`` pattern can search it.

    On Python 2 the body returned by ``read()`` already is ``str`` and is
    passed through untouched; on Python 3 it is ``bytes`` and gets decoded.
    """
    if isinstance(raw, bytes) and not isinstance(raw, str):
        # Assumes the page is UTF-8 -- TODO confirm. Undecodable bytes are
        # replaced instead of raising so a stray byte cannot abort the run.
        return raw.decode("utf-8", "replace")
    return raw


def baidu_tieba(url, begin_page, end_page, image_dir='./img'):
    """Download the ``BDE_Image`` JPEGs from a range of Tieba thread pages.

    Each page is fetched from ``url + str(page)``; every matching image is
    saved as ``<image_dir>/<page>_<index>.jpg``. The page/index pair makes
    each file name unique, so images on the same page no longer overwrite
    one another (previously one random name was reused for a whole page).

    Args:
        url: Thread address ending in ``?pn=``; the page number is appended.
        begin_page: First page number to fetch.
        end_page: Page number to stop before. The range is half-open, like
            ``range(begin_page, end_page)``, so ``end_page`` itself is not
            fetched. NOTE(review): the interactive prompt asks for the
            "end page", so users may expect it to be included -- confirm.
        image_dir: Directory to save into; created when it does not exist.

    Raises:
        Whatever ``urlopen`` / ``urlretrieve`` raise when a request fails.
    """
    for page in range(begin_page, end_page):
        # closing() releases the connection even if read() fails.
        with closing(urllib2.urlopen(url + str(page))) as response:
            html = _as_text(response.read())

        image_urls = _TIEBA_IMAGE_RE.findall(html)

        # urlretrieve cannot create missing directories on its own.
        if image_urls and not os.path.isdir(image_dir):
            os.makedirs(image_dir)

        for index, image_url in enumerate(image_urls):
            _urlretrieve(image_url, '%s/%s_%s.jpg' % (image_dir, page, index))


# ======= Enter the parameters here =======
# Example: the address of one Baidu Tieba thread ("?pn=" is appended by hand):
#   bdurl = 'http://tieba.baidu.com/p/2857700864?pn='
#   begin_page = 1
#   end_page = 10
if __name__ == "__main__":
    try:
        _read_line = raw_input  # Python 2
    except NameError:
        _read_line = input  # Python 3 renamed raw_input() to input()

    # NOTE(review): on Python 2 a non-ASCII unicode prompt may fail to encode
    # on some terminals -- confirm on the target console.
    bdurl = str(_read_line(u'請輸入貼吧地址,後面加?去掉pn=的數字:\n'))
    begin_page = int(_read_line(u'請輸入開始的頁數:\n'))
    end_page = int(_read_line(u'請輸入終點的頁數:\n'))
    # =========================================
    # Run the download.
    baidu_tieba(bdurl, begin_page, end_page)