Getting started: usage examples
```bash
# Show help information
scrapy --help

# Show the Scrapy version and the versions of its components
scrapy version -v

# Create a project
scrapy startproject <project_name>

# Create a spider (you can create several, but their names must be unique)
scrapy genspider <name> <domain_to_crawl>
scrapy genspider aaa aaa.com
scrapy genspider bbb bbb.com

# List all spiders in the project
scrapy list

# Open a URL in the browser as Scrapy sees it (the browser opens once the command finishes)
scrapy view http://www.baidu.com

# Inside a project, parse a given URL with the parse method, usually for testing
scrapy parse http://www.baidu.com

# shell can be run outside of a project
scrapy shell

# runspider runs a single spider file on its own
scrapy runspider aaaaa.py

# bench runs a benchmark test
scrapy bench
```
```
# Attributes
name: the spider's name; it must be unique
allowed_domains: the domains the spider is allowed to crawl
start_urls: the initial URLs
custom_settings: per-spider settings that override the global settings
crawler: the crawler this spider is bound to
settings: the settings instance, containing all of the project's settings
logger: the logger instance

# Methods
from_crawler(crawler, *args, **kwargs): class method used to create spiders
start_requests(): generates the initial requests
make_requests_from_url(url): builds a request from a URL
parse(response): parses the page content
log(message[, level, component]): writes a log entry; the logger attribute can be used instead, e.g. self.logger.info("visited success")
closed(reason): called when the spider is closed
```
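To see how these attributes and methods fit together, here is a minimal sketch of a custom spider; the spider name, domain, URL, and setting value are made up for illustration.

```python
import scrapy

class ExampleSpider(scrapy.Spider):
    name = "example"                          # unique spider name
    allowed_domains = ["example.com"]         # allowed domains
    start_urls = ["http://example.com"]       # initial URLs
    custom_settings = {"DOWNLOAD_DELAY": 1}   # overrides the project-wide settings

    def start_requests(self):
        # generate the initial requests (the default implementation also iterates start_urls)
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        # parse the page content and log through the logger attribute
        self.logger.info("visited success: %s", response.url)

    def closed(self, reason):
        # called when the spider is closed
        self.logger.info("spider closed: %s", reason)
```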
```
CrawlSpider
# The most commonly used spider, for crawling ordinary web pages
# It adds two members:
## rules: defines the crawling rules - how links are followed and which parse function handles each link
## parse_start_url(response): parses the responses of the start URLs

XMLFeedSpider
CSVFeedSpider
SitemapSpider
```
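Since CrawlSpider is the one used most often, a minimal sketch may help; the domain and the link patterns below are assumptions made up for illustration, only the rules attribute and the Rule/LinkExtractor API come from Scrapy itself.

```python
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class ExampleCrawlSpider(CrawlSpider):
    name = "example_crawl"
    allowed_domains = ["example.com"]
    start_urls = ["http://example.com"]

    rules = (
        # follow pagination links without parsing them (pattern is hypothetical)
        Rule(LinkExtractor(allow=r"/page/\d+"), follow=True),
        # parse detail pages with parse_item (pattern is hypothetical)
        Rule(LinkExtractor(allow=r"/detail/\d+"), callback="parse_item"),
    )

    def parse_item(self, response):
        # extract something simple from each followed page
        yield {"title": response.xpath("//title/text()").extract_first()}
```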
Selector: there are many libraries for parsing web pages, such as BeautifulSoup and lxml, but Scrapy uses Selector by default, and it is quite convenient to work with.

```python
# Usage
from scrapy.selector import Selector
from scrapy.http import HtmlResponse

# Instantiate from text
body = '<html><body><span>good</span></body></html>'
Selector(text=body).xpath('//span/text()').extract()

# Instantiate from a response
response = HtmlResponse(url='http://example.com', body=body)
Selector(response=response).xpath('//span/text()').extract()
```
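As a small follow-up, the same selector also accepts CSS expressions, and extract_first() returns a single value instead of a list; the sample HTML is the one used above.

```python
from scrapy.selector import Selector

body = '<html><body><span>good</span></body></html>'
sel = Selector(text=body)

sel.css('span::text').extract()              # ['good']
sel.css('span::text').extract_first()        # 'good'
sel.xpath('//span/text()').extract_first()   # 'good'
```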
Items

```bash
# Create the project
scrapy startproject tutorial

# Create the spider
scrapy genspider pm25 pm25.in
```

```python
# Write the Items
import scrapy

class Pm25CityItem(scrapy.Item):
    city_name = scrapy.Field()    # name of the city
    home_link = scrapy.Field()    # link to the city's data page
    city_pinyin = scrapy.Field()  # pinyin of the city name
```

```python
# Flesh out the Spider
import scrapy
from tutorial.items import Pm25CityItem

class Pm25Spider(scrapy.Spider):
    name = "pm25"
    allowed_domains = ["pm25.in"]
    start_urls = [
        'http://www.pm25.in',
    ]

    def parse(self, response):
        sel = scrapy.Selector(response)
        citys = sel.xpath("//div[@class='all']/div[@class='bottom']/ul[@class='unstyled']/div[2]/li")
        city_items = []
        for city in citys:
            city_item = Pm25CityItem()
            href = ''.join(city.xpath('a/@href').extract()).strip()
            city_item['city_name'] = ''.join(city.xpath('a/text()').extract()).strip().encode("UTF-8")
            city_item['home_link'] = 'http://www.pm25.in' + href
            city_item['city_pinyin'] = href.split('/')[1]
            city_items.append(city_item)
        return city_items
```
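A quick usage sketch: a scrapy.Item behaves like a dict, so its fields can be read and written with ordinary mapping syntax; the values below are made up for illustration.

```python
from tutorial.items import Pm25CityItem

item = Pm25CityItem()
item['city_pinyin'] = 'beijing'
item['city_name'] = 'beijing'
item['home_link'] = 'http://www.pm25.in/beijing'

print(dict(item))         # convert to a plain dict for inspection
print(item['city_name'])  # read a single field
```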
```python
# MySQL connection settings (settings.py)
MYSQL_HOST = '127.0.0.1'
MYSQL_DBNAME = 'test'     # database name
MYSQL_USER = 'root'       # database user
MYSQL_PASSWD = '123456'   # database password
MYSQL_PORT = 3306         # database port

# Enable the MySQL storage pipeline
ITEM_PIPELINES = {
    'tutorial.pipelines.MySQLStoreDataPipeline': 300,  # store items in the database
}
```

```python
# Storing the data (pipelines.py)
from scrapy import log
from twisted.enterprise import adbapi
import datetime, uuid
import MySQLdb
import MySQLdb.cursors

class MySQLStoreDataPipeline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbargs = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)
        return cls(dbpool)

    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self.save_city, item)
        query.addErrback(self.handle_error)
        return item

    # Insert the city data into tbl_all_city
    def save_city(self, conn, item):
        conn.execute("""
            select 1 from tbl_all_city where city_pinyin = %s
        """, (item['city_pinyin'],))
        ret0 = conn.fetchone()
        if not ret0:
            ret1 = conn.execute("""
                insert into tbl_all_city(city_pinyin, city_name, home_link)
                values(%s, %s, %s)
            """, (item['city_pinyin'], item['city_name'], item['home_link'],))
            log.msg('save to tbl_all_city: %s' % ret1, level=log.INFO)

    # Error handling
    def handle_error(self, e):
        log.err(e)
```

```bash
# Run the crawler
scrapy crawl pm25
```
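The pipeline above assumes that the tbl_all_city table already exists. A minimal sketch that creates it with MySQLdb follows; only the column names come from the insert statement above, while the id column, types, and sizes are assumptions.

```python
import MySQLdb

# connection parameters match the settings above
conn = MySQLdb.connect(host='127.0.0.1', port=3306, user='root',
                       passwd='123456', db='test', charset='utf8')
cur = conn.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS tbl_all_city (
        id INT AUTO_INCREMENT PRIMARY KEY,   -- assumed surrogate key
        city_pinyin VARCHAR(64),             -- assumed length
        city_name   VARCHAR(64),             -- assumed length
        home_link   VARCHAR(255)             -- assumed length
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
conn.close()
```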