What the code below implements: find the movies that are tagged as latest releases ([最新電影]) and have a rating above 8.0.
Step 1: code implementation
# _*_ coding:utf-8 _*_
# _author:khal_Cgg
import requests
from bs4 import BeautifulSoup
import re

req = {}  # the dict of qualifying films returned at the end

def re_func(text):  # every response is analysed here to check whether it meets the criteria
    global req
    soup = BeautifulSoup(text, "lxml")  # build the BeautifulSoup object
    table_list = soup.find_all(name="table", attrs={"class": "tbspan"})  # each film has its own table
    for one_table in table_list:  # loop over every table
        soup2 = BeautifulSoup(str(one_table), "lxml")
        first_a = soup2.find(name="a")  # the first <a> tag is the film category
        if first_a.string == "[最新電影]":  # only latest films are checked further
            film_name = soup2.find_all(name="a")[1]
            td = soup2.find_all(name="td")[5]
            re_aim = td.string
            gaga = re.findall(r"豆瓣評分 (\d{1}.\d{1})|IMDb評分 (\d{1}.\d{1})", re_aim)  # extract the rating
            if gaga:  # check whether the rating meets the threshold
                douban = gaga[0][0]
                if douban:
                    douban = float(douban)
                    if douban >= 8.0:
                        req[film_name.string] = {}
                        req[film_name.string]["豆瓣得分"] = douban  # record the film if it qualifies
                else:
                    imdb = gaga[0][1]
                    if imdb:
                        imdb = float(imdb)
                        if imdb >= 8.0:
                            req[film_name.string] = {}
                            req[film_name.string]["IMDb得分"] = imdb
    return req

base_url = "http://www.ygdy8.net/html/gndy/china/list_4_%s.html"
for i in range(1, 20):  # generate the page URLs and request them one by one
    print('++++++++++', i)
    user = base_url % (i)
    print(user)
    r1 = requests.get(user)
    r1.encoding = "gbk"
    re_func(r1.text)
print(req)
The code above is the simplest way to crawl the site, using nothing but the requests module.
Drawback: every page is requested one after another, so the crawler blocks on each response and the whole run is slow.
Step 2: asynchronous optimization with gevent
# _*_ coding:utf-8 _*_
import requests
from bs4 import BeautifulSoup
import re, gevent

req = {}

class Singleton(object):
    def __new__(cls, *args, **kwargs):
        if not hasattr(cls, '_instance'):
            orig = super(Singleton, cls)
            cls._instance = orig.__new__(cls)
        return cls._instance

class re_func(Singleton):
    def __init__(self, text):
        global req
        soup = BeautifulSoup(text, "lxml")
        table_list = soup.find_all(name="table", attrs={"class": "tbspan"})
        for one_table in table_list:
            soup2 = BeautifulSoup(str(one_table), "lxml")
            first_a = soup2.find(name="a")
            if first_a.string == "[最新電影]":
                film_name = soup2.find_all(name="a")[1]
                td = soup2.find_all(name="td")[5]
                re_aim = td.string
                gaga = re.findall(r"豆瓣評分 (\d{1}.\d{1})|IMDb評分 (\d{1}.\d{1})", re_aim)
                if gaga:
                    douban = gaga[0][0]
                    if douban:
                        douban = float(douban)
                        if douban >= 8.0:
                            req[film_name.string] = {}
                            req[film_name.string]["豆瓣得分"] = douban
                    else:
                        imdb = gaga[0][1]
                        if imdb:
                            imdb = float(imdb)
                            if imdb >= 8.0:
                                req[film_name.string] = {}
                                req[film_name.string]["IMDb得分"] = imdb

base_url = "http://www.ygdy8.net/html/gndy/china/list_4_%s.html"

def start(i):
    print('++++++++++', i)
    user = base_url % (i)
    print(user)
    r1 = requests.get(user)
    r1.encoding = "gbk"
    re_func(r1.text)

threads = [gevent.spawn(start, i) for i in range(1, 88)]
gevent.joinall(threads)
print(req)
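One caveat worth noting: gevent can only make requests' blocking socket calls cooperative if the standard library is monkey-patched first; without that, the spawned greenlets still end up fetching pages essentially one at a time. A minimal sketch of how the patch is usually applied (this is an addition, not part of the original script):

# minimal sketch (not from the original post): enable gevent's monkey patching
# before requests is imported, so its socket calls become cooperative
from gevent import monkey
monkey.patch_all()

import gevent
import requests

base_url = "http://www.ygdy8.net/html/gndy/china/list_4_%s.html"

def fetch(i):
    # same per-page fetch as in the script above; the greenlet now yields
    # to the others while it waits on the network
    r = requests.get(base_url % i)
    r.encoding = "gbk"
    return r.status_code

jobs = [gevent.spawn(fetch, i) for i in range(1, 6)]
gevent.joinall(jobs)
print([j.value for j in jobs])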
Advantage: the page requests run concurrently through gevent instead of one at a time.
Result: it shows that Chinese films are still just as bad as ever.
Step 3: Scrapy. After downloading and installing it, choose a working directory and run the following on the command line:
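The exact commands are not reproduced in the original text; the usual Scrapy workflow looks roughly like this (the project name dytt_project is only illustrative, while the spider name and domain match the code further below):

scrapy startproject dytt_project      # create a new Scrapy project (name is illustrative)
cd dytt_project
scrapy genspider dytt ygdy8.net       # generate the "dytt" spider edited below
scrapy crawl dytt                     # run the spider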
settings parameters:
ROBOTSTXT_OBEY = False — set this to False when the crawl fails because Scrapy refuses pages disallowed by robots.txt.
DEPTH_LIMIT = 2 — allow a crawl depth of 2, i.e. follow links only one level beyond the start page (both options are shown together in the snippet below).
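In the generated project these two options live in settings.py; a minimal sketch of the relevant lines (everything else in the file is left at its defaults):

# settings.py -- only the two options discussed above
ROBOTSTXT_OBEY = False   # do not let robots.txt stop the crawl
DEPTH_LIMIT = 2          # follow request chains at most two levels deep from the start URLs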
Code:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
import re
from bs4 import BeautifulSoup


class DyttSpider(scrapy.Spider):
    name = "dytt"
    allowed_domains = ["ygdy8.net"]
    start_urls = ['http://www.ygdy8.net/html/gndy/china/index.html']
    visited_set = set()
    req = {}
    encoding = 'gbk'

    def take_page(self, page_list):
        req = []
        for biaoqian in page_list:
            href = biaoqian.attrs["href"]
            # build an absolute URL by swapping the file name of the start URL for the pagination href
            new_rul = self.start_urls[0].replace("index.html", href, 1)
            req.append(new_rul)
        print(req)
        return req

    def re_func(self, text):  # every response is analysed here to check whether it meets the criteria
        soup = BeautifulSoup(text, "lxml")  # build the BeautifulSoup object
        # print(text)
        div_x = soup.find_all(name="div", attrs={"class": "x"})[1]
        soupX = BeautifulSoup(str(div_x), "lxml")
        page_list = soupX.find_all(name="a", limit=6)
        url_loop = self.take_page(page_list)
        table_list = soup.find_all(name="table", attrs={"class": "tbspan"})  # each film has its own table
        for one_table in table_list:  # loop over every table
            soup2 = BeautifulSoup(str(one_table), "lxml")
            first_a = soup2.find(name="a")  # the first <a> tag is the film category
            if first_a.string == "[最新電影]":  # only latest films are checked further
                film_name = soup2.find_all(name="a")[1]
                td = soup2.find_all(name="td")[5]
                re_aim = td.string
                gaga = re.findall(r"豆瓣評分 (\d{1}.\d{1})|IMDb評分 (\d{1}.\d{1})", re_aim)  # extract the rating
                if gaga:  # check whether the rating meets the threshold
                    douban = gaga[0][0]
                    if douban:
                        douban = float(douban)
                        if douban >= 8.0:
                            self.req[film_name.string] = {}
                            self.req[film_name.string]["豆瓣得分"] = douban  # record the film if it qualifies
                            print(self.req)
                    else:
                        imdb = gaga[0][1]
                        if imdb:
                            imdb = float(imdb)
                            if imdb >= 8.0:
                                self.req[film_name.string] = {}
                                self.req[film_name.string]["IMDb得分"] = imdb
                                print(self.req)
        return url_loop

    def parse(self, response):
        print(response.url)
        self.visited_set.add(response.url)
        text = response.body.decode('gbk')  # the pages are gbk-encoded
        print(text)
        page_list = self.re_func(text)
        for url in page_list:
            if url in self.visited_set:
                pass
            else:
                obj = Request(url=url, method='GET', callback=self.parse, encoding='gbk')
                yield obj
The encoding problem in this code has not been solved.
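One common way to make the decoding step more robust (a sketch, not the author's fix): decode the raw body with gb18030, which is a superset of gbk, and replace any bytes that still fail before handing the text to re_func.

# a sketch, not the author's fix: tolerate malformed bytes on the gbk-encoded pages
def decode_body(response):
    # gb18030 is a superset of gbk; errors='replace' keeps a single bad byte
    # from raising UnicodeDecodeError and aborting the whole callback
    return response.body.decode('gb18030', errors='replace')

In parse() the call would then become text = decode_body(response) followed by self.re_func(text).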
Example:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request

count = 0

class XiaohuarSpider(scrapy.Spider):
    name = "xiaohuar"
    allowed_domains = ["xiaohuar.com"]
    start_urls = ['http://www.xiaohuar.com/list-1-0.html']
    visited_set = set()

    def parse(self, response):
        self.visited_set.add(response.url)
        # 1. scrape every entry on the current page:
        #    grab the divs whose class is "item masonry_brick"
        hxs = HtmlXPathSelector(response)
        item_list = hxs.select('//div[@class="item masonry_brick"]')
        for item in item_list:
            v = item.select('.//span[@class="price"]/text()').extract_first()
            print(v)
            global count
            count += 1
            print(count)
        # 2. collect the pagination links http://www.xiaohuar.com/list-1-\d+.html on the current page
        # page_list = hxs.select('//a[@href="http://www.xiaohuar.com/list-1-1.html"]')
        page_list = hxs.select(r'//a[re:test(@href,"http://www.xiaohuar.com/list-1-\d+.html")]/@href').extract()
        for url in page_list:
            if url in self.visited_set:
                pass
            else:
                obj = Request(url=url, method='GET', callback=self.parse)
                print(self.parse)
                return obj
                # return ends the function, so the next page means calling parse again
                # (this loop only ever runs once), which is why DEPTH_LIMIT affects it.
                # yield, by contrast, only pauses the function here: the next request resumes
                # the for loop of this same call, so all page numbers 1-16 on the first page
                # are scheduled before the first call finishes. That is why a limit of 3 is
                # exactly enough to loop through all 42 pages.
This example is a way to understand the difference between return and yield, and what DEPTH_LIMIT does:
From experiment: with yield, a DEPTH_LIMIT of 3 or greater is enough to finish all 42 pages; with return, only as many pages as the DEPTH_LIMIT value ever get crawled.
Conclusion: DEPTH_LIMIT limits the depth of the request chain, i.e. how many times in a row one response has to spawn another request (and another call to parse). Because return ends the function, reaching the next page always means scheduling exactly one new request from the current response, so every page sits one level deeper than the previous one.
That is why return can only reach as many pages as the DEPTH_LIMIT value allows. Without a limit it would eventually crawl every page as well, but because of the many repeated single-request calls it is much slower than the yield version.
yield behaves differently: after yielding, the first call to parse has not finished. The next request is not produced by a fresh call; the generator simply resumes at the yield, so the for loop keeps going until every pagination link (1-16) found on the first page has been scheduled, and only then does that first call end. All of those requests therefore share the same depth, which is why a DEPTH_LIMIT of 3 or more is enough to cover all 42 pages.
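To make that control flow concrete, here is a minimal, hypothetical side-by-side sketch (not from the original post; it uses the modern response.xpath API): the return version schedules at most one follow-up request per response, so each new page is one depth level deeper, while the yield version schedules every pagination link it finds at the same depth.

import scrapy
from scrapy.http import Request


class ReturnPagerSpider(scrapy.Spider):
    """Hypothetical spider: one follow-up request per response, so page N sits at depth N."""
    name = "demo_return"
    start_urls = ['http://www.xiaohuar.com/list-1-0.html']

    def parse(self, response):
        for url in response.xpath('//a/@href').getall():
            # the loop body runs once and the whole call ends here,
            # so DEPTH_LIMIT directly caps how many pages are ever reached
            return Request(url=url, callback=self.parse)


class YieldPagerSpider(scrapy.Spider):
    """Hypothetical spider: every link found on a page is scheduled at the same depth."""
    name = "demo_yield"
    start_urls = ['http://www.xiaohuar.com/list-1-0.html']

    def parse(self, response):
        for url in response.xpath('//a/@href').getall():
            # the generator pauses here and resumes for the next link,
            # so all links from this response share depth = current depth + 1
            yield Request(url=url, callback=self.parse)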