Python 爬蟲(web spider):爬取網站獲取網頁數據,並進行分析提取。
基本模塊使用的是 urllib、urllib2、re 等模塊。
(一)基本用法,例子
(1)進行基本 GET 請求,獲取網頁 HTML
# -*- coding: utf-8 -*-
"""Fetch a page with a plain GET request and read its body and headers."""
import urllib
import urllib2

url = 'http://www.baidu.com/'

# Build the request object for the target URL.
request = urllib2.Request(url)

# Stay None when the request fails, so later code can test for success
# instead of hitting a NameError on an unbound `response`.
html = None
headers = None

try:
    # Send the request and obtain the response object.
    response = urllib2.urlopen(request)
except urllib2.URLError as e:
    # URLError is the base class of HTTPError, so this handles both
    # connection-level failures and HTTP error statuses.
    if hasattr(e, 'reason'):
        print(e.reason)
else:
    # Read the response body.
    html = response.read()
    # Read the response headers.
    headers = response.info()
(2)表單提交
# -*- coding: utf-8 -*-
"""Submit a form with urllib2 while sending a browser-like User-Agent."""
import urllib2
import urllib

# Target of the form submission; an empty placeholder to be filled in.
post_url = ''

# Form fields, URL-encoded into the request body.
form_fields = {
    'username': 'username',
    'password': 'password',
}
post_data = urllib.urlencode(form_fields)

# Request headers carrying a desktop Firefox User-Agent string.
post_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:31.0) Gecko/20100101 Firefox/31.0',
}

# Request(url, data, headers): same arguments as before, passed positionally.
request = urllib2.Request(post_url, post_data, post_headers)
response = urllib2.urlopen(request)
html = response.read()
(3)抓取百度貼吧帖子內容並寫入文件
# -*- coding: utf-8 -*-
"""Scrape the post bodies of one Baidu Tieba thread page into baidu.txt."""
import re
import sys
import urllib2

# Python 2 only: make implicit unicode -> byte-string conversions use UTF-8.
# Writing the decoded (unicode) post text to the plain file object below
# relies on this; without it the implicit ASCII encode fails on Chinese text.
reload(sys)
sys.setdefaultencoding('utf-8')

page_num = 1
url = 'http://tieba.baidu.com/p/3238280985?see_lz=1&pn=' + str(page_num)

# The page is decoded as GBK before matching.
myPage = urllib2.urlopen(url).read().decode('gbk')

# Capture the inner HTML of every post-content <div>; DOTALL lets a single
# post span multiple lines.
myRe = re.compile(r'class="d_post_content j_d_post_content ">(.*?)</div>', re.DOTALL)
items = myRe.findall(myPage)

# `with` closes the file even if printing or writing raises part-way through.
with open('baidu.txt', 'a+') as f:
    for i, item in enumerate(items, 1):
        print(i)
        text = item.replace('<br>', '')
        # str.replace returns a new string; the result must be assigned back,
        # otherwise the newline/space stripping and the trailing '\n' are lost.
        text = text.replace('\n', '').replace(' ', '') + '\n'
        print(text)
        f.write(text)
(4)模擬登錄163郵箱並下載郵件內容
# -*- coding: utf-8 -*-
"""Simulate logging in to a 163.com mailbox and download the mail contents."""
import urllib
import urllib2
import cookielib
import re
import time
import json


class Email163:
    """Minimal 163 webmail client built on urllib2 with a shared cookie jar."""

    # Browser-like User-Agent sent with the login request.
    header = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    # Account name passed to the most recent login() call.
    user = ''
    # CookieJar holding the cookies for this client.
    cookie = None
    # Session id parsed from the login response; None until login succeeds.
    sid = None
    mailBaseUrl = 'http://twebmail.mail.163.com'

    def __init__(self):
        """Create a cookie jar and install a cookie-aware global opener."""
        self.cookie = cookielib.CookieJar()
        cookiePro = urllib2.HTTPCookieProcessor(self.cookie)
        # install_opener is process-wide: every later urllib2.urlopen call
        # goes through this opener and therefore shares the cookie jar.
        urllib2.install_opener(urllib2.build_opener(cookiePro))

    def login(self, user, pwd):
        """Log in and remember the session id.

        Posts the credentials to the login URL and looks for a `sid=...`
        token in the response. Sets self.sid on success and resets it to
        None on failure. Prints the outcome; returns nothing.
        """
        postdata = urllib.urlencode({
            'username': user,
            'password': pwd,
            'type': 1,
        })
        # Note: the login URL differs between webmail versions.
        req = urllib2.Request(
            url='https://ssl.mail.163.com/entry/coremail/fcg/ntesdoor2'
                '?funcid=loginone&language=-1&passtype=1&iframe=1'
                '&product=mail163&from=web&df=email163&race=-2_45_-2_hz'
                '&module=&uid=' + user + '&style=10&net=t&skinid=null',
            data=postdata,
            headers=self.header,
        )
        res = str(urllib2.urlopen(req).read())
        match = re.search(r'sid=([^"]+)', res, re.I)
        uname = user.split('@')[0]
        self.user = user
        if match:
            self.sid = match.group(1).strip()
            print('%s Login Successful.....' % (uname))
        else:
            # Drop any sid left over from an earlier successful login so a
            # failed login cannot silently reuse a stale session.
            self.sid = None
            print('%s Login failed....' % (uname))

    def getInBox(self):
        """Return the inbox listing as a list of dicts.

        Each dict has the keys 'from', 'url' and 'subject'. Returns an empty
        list when there is no session id (login not done or failed) or when
        no mail rows match.
        """
        print('\nGet mail lists.....\n')
        sid = self.sid
        if not sid:
            # Without a session id the URL cannot be built (None + str would
            # raise TypeError), so report it and return no mails.
            print('Not logged in, cannot fetch the mail list.')
            return []
        url = self.mailBaseUrl + '/jy3/list/list.do?sid=' + sid + '&fid=1&fr=folder'
        res = urllib2.urlopen(url).read()
        # One match per mail row: (relative link, sender, subject).
        patt = re.compile(
            r'<div\s+class="tdLike Ibx_Td_From"[^>]+>.*?href="([^"]+)"[^>]+>(.*?)<\/a>.*?<div\s+class="tdLike Ibx_Td_Subject"[^>]+>.*?href="[^>]+>(.*?)<\/a>',
            re.I | re.S,
        )
        # findall always returns a list (possibly empty), never None.
        mailList = []
        for link, sender, subject in patt.findall(res):
            mailList.append({
                'from': sender.decode('utf8'),
                'url': self.mailBaseUrl + link,
                'subject': subject.decode('utf8'),
            })
        return mailList

    def getMailMsg(self, url):
        """Download the content of the mail at `url`.

        Fetches the mail page, extracts its `contentURL`, fetches that URL
        and returns the 'resultVar' entry of the decoded JSON. Returns ''
        when the page has no contentURL or the JSON has no 'resultVar'.
        """
        content = ''
        print('\n Download.....%s\n' % (url))
        res = urllib2.urlopen(url).read()
        match = re.search(r'contentURL:"([^"]+)"', res, re.I)
        if match is None:
            return content
        url = '%s%s' % (self.mailBaseUrl, match.group(1))
        # Pause between requests.
        time.sleep(1)
        res = urllib2.urlopen(url).read()
        # Decode the JSON body once and reuse the result.
        jsonRes = json.loads(res)
        if 'resultVar' in jsonRes:
            content = jsonRes['resultVar']
        time.sleep(3)
        return content


# Demo

# Initialise the client.
mail163 = Email163()
# Log in. Put your own account and password here; never commit real
# credentials to source control.
mail163.login('XXXXX@163.com', 'XXXXX')
time.sleep(2)

# Fetch the inbox listing.
elist = mail163.getInBox()

# Download and print every mail.
for mail in elist:
    print('主題:%s 來自:%s 內容:\n%s' % (
        mail['subject'].encode('utf8'),
        mail['from'].encode('utf8'),
        mail163.getMailMsg(mail['url']).encode('utf8'),
    ))
(5)需要登錄的狀況
# -*- coding: utf-8 -*-
# Techniques for sites that require a login. The snippets are templates:
# every 'XXXX' value is a placeholder to replace with real data.
import urllib
import urllib2
import cookielib

# 1. Cookie handling: install a global opener that stores and resends cookies.
cookie_support = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
urllib2.install_opener(opener)
content = urllib2.urlopen('http://XXXX').read()

# 2. Proxy together with cookies.
# proxy_support was used without being defined; a ProxyHandler with a
# placeholder address is assumed here -- replace it with your proxy.
proxy_support = urllib2.ProxyHandler({'http': 'http://XX.XX.XX.XX:XXXX'})
opener = urllib2.build_opener(proxy_support, cookie_support, urllib2.HTTPHandler)

# 3. Form handling.
# fk was used without being defined; presumably a hidden form field value
# taken from the sign-in page -- TODO confirm and fill in the real value.
fk = 'XXXXX'
postdata = urllib.urlencode({
    'username': 'XXXXX',
    'password': 'XXXXX',
    'continueURI': 'http://www.verycd.com/',
    'fk': fk,
    'login_submit': '登陸',
})
req = urllib2.Request(
    url='http://secure.verycd.com/signin/*/http://www.verycd.com/',
    data=postdata,
)
result = urllib2.urlopen(req).read()

# 4. Pretend to be a browser by sending a browser User-Agent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
}
req = urllib2.Request(
    url='http://secure.verycd.com/signin/*/http://www.verycd.com/',
    data=postdata,
    headers=headers,
)

# 5. Defeat anti-hotlinking checks by sending a Referer header.
headers = {
    'Referer': 'http://www.cnbeta.com/articles',
}
(6)多線程
import traceback
from threading import Thread
from time import sleep

try:
    from Queue import Queue  # Python 2 module name
except ImportError:
    from queue import Queue  # the module is named `queue` on Python 3

# q is the task queue
# NUM is the total number of concurrent worker threads
# JOBS is how many tasks there are
q = Queue()
NUM = 2
JOBS = 10


def do_somthing_using(arguments):
    """Handle one task; this example simply prints the task value."""
    print(arguments)


def working():
    """Worker loop: keep taking tasks from the queue and processing them."""
    while True:
        arguments = q.get()
        try:
            do_somthing_using(arguments)
            sleep(1)
        except Exception:
            # Report the failure but keep this worker alive, so the remaining
            # tasks are still processed.
            traceback.print_exc()
        finally:
            # Always mark the task as done; a skipped task_done() would make
            # q.join() below wait forever.
            q.task_done()


# Start NUM worker threads waiting on the queue.
for i in range(NUM):
    t = Thread(target=working)
    # Daemon threads do not keep the process alive once the main thread ends.
    t.daemon = True
    t.start()

# Put the JOBS tasks on the queue.
for i in range(JOBS):
    q.put(i)

# Wait until every queued task has been marked done.
q.join()