python小白學習之路啊

時間 2019-12-19

標籤 python 學習之路欄目 Python 简体版

原文鏈接

#第一個爬蟲 某貼吧頁面的圖片
#-*- coding: UTF-8 -*-
import urllib2,urllib,re
import os  # needed only to make sure the download directory exists

# Download every BDE_Image JPEG referenced by one hard-coded Tieba thread page.
url = urllib2.urlopen("http://tieba.baidu.com/p/4304902334")
try:
	cons = url.read()
finally:
	# Release the HTTP connection even when reading the body fails.
	url.close()

# Each match is the src value (ending in .jpg) that follows class="BDE_Image".
lists = re.findall(r'class="BDE_Image" src="(.+?\.jpg)" size', cons)
print(lists)

# urlretrieve writes straight to the target path, so the directory must exist.
if not os.path.isdir('./img'):
	os.makedirs('./img')

# Files are named 0.jpg, 1.jpg, ... in the order the images appear on the page.
for index, image_url in enumerate(lists):
	urllib.urlretrieve(image_url, './img/%s.jpg' % index)

# coding:utf-8
import urllib2,urllib,re,string,random
#help(string.zfill)

def baidu_tieba(url, begin_page, end_page):
    """Download every ``BDE_Image`` JPEG from a range of thread pages.

    For each page number in ``range(begin_page, end_page)`` the page at
    ``url + str(page)`` is fetched, the image addresses are extracted with a
    regular expression, and each image is saved as
    ``./img/<page>_<index>.jpg`` (``index`` counts from 0 within a page).

    Args:
        url: Thread address ending right before the page number, so that
            appending the number yields the page address (e.g. ``...?pn=``).
        begin_page: First page number to fetch (inclusive).
        end_page: Page number to stop at (exclusive, as with ``range``).

    Returns:
        None. The function is used for its side effect of writing files.

    NOTE(review): ``end_page`` itself is never downloaded; confirm whether
    callers expect an inclusive end page before changing the range.
    """
    import os  # local import keeps this block self-contained

    # Compiled once: the pattern is identical for every page.
    image_pattern = re.compile(r'class="BDE_Image" src="(.+?\.jpg)" size')

    # urlretrieve writes straight to the target path, so the directory must
    # exist before the first download.
    output_dir = './img'
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    for page in range(begin_page, end_page):
        response = urllib2.urlopen(url + str(page))
        try:
            html = response.read()
        finally:
            # Release the HTTP connection even when reading the body fails.
            response.close()

        for index, image_url in enumerate(image_pattern.findall(html)):
            # The page number plus the position on the page gives every image
            # its own file. A single random name per page made each download
            # overwrite the previous one.
            target = '%s/%s_%s.jpg' % (output_dir, page, index)
            urllib.urlretrieve(image_url, target)

	

# ======= Enter the parameters here =======
# bdurl is the address of one Baidu Tieba thread. Disabled hard-coded
# alternative to the prompts below (never executed):
#   bdurl = 'http://tieba.baidu.com/p/2857700864?pn='  # "?pn=" is appended by hand
#   iPostBegin = 1
#   iPostEnd = 10
bdurl = str(raw_input(u'請輸入貼吧地址，後面加？去掉pn=的數字:\n'))
# Tuple elements are evaluated left to right, so the begin-page prompt is
# answered and converted before the end-page prompt is shown.
begin_page, end_page = (
	int(raw_input(u'請輸入開始的頁數:\n')),
	int(raw_input(u'請輸入終點的頁數:\n')),
)
# ===================================
# Run the downloader with the values entered above.
baidu_tieba(url=bdurl, begin_page=begin_page, end_page=end_page)
# Disabled single-page variant (never executed); p is the page number:
#   #url = urllib2.urlopen("http://tieba.baidu.com/p/4304902334?pn="+str(p) )
#   #cons = url.read()
#   # print cons
#   lists = re.findall(re.compile(r'class="BDE_Image" src="(.+?\.jpg)" size'),cons)
#   print lists
#   j = 0+10*p
#   for i in lists:
#       urllib.urlretrieve(i,'./img/%s.jpg'%j)
#       j+=1

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。