scrapy startproject daili_ips
......
cd daili_ips/
# spider name and allowed domain
scrapy genspider xici xicidaili.com
In [1]: import requests

In [2]: r = requests.get('http://www.xicidaili.com/nn/1')

In [3]: r.status_code
Out[3]: 500

In [4]:
The request returns 500; my guess was that the missing User-Agent header caused it.
In [4]: headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}

In [5]: r = requests.get('http://www.xicidaili.com/nn/1', headers=headers)

In [6]: r.status_code
Out[6]: 200

In [7]:
With the User-Agent header the request returns 200 as expected. So in settings.py, un-comment the USER_AGENT line and set it to a browser User-Agent:

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
The item defines which fields will be stored (items.py):
import scrapy


class DailiIpsItem(scrapy.Item):
    ip = scrapy.Field()
    port = scrapy.Field()
    position = scrapy.Field()
    type = scrapy.Field()
    speed = scrapy.Field()
    last_check_time = scrapy.Field()
# -*- coding: utf-8 -*-
import scrapy
from daili_ips.items import DailiIpsItem


class XiciSpider(scrapy.Spider):
    name = "xici"
    allowed_domains = ["xicidaili.com"]
    start_urls = (
        'http://www.xicidaili.com/',
    )

    def start_requests(self):
        res = []
        for i in range(1, 2):
            url = 'http://www.xicidaili.com/nn/%d' % i
            req = scrapy.Request(url)
            # collect one request per listing page
            res.append(req)
        return res

    def parse(self, response):
        table = response.xpath('//table[@id="ip_list"]')[0]
        # use a relative XPath (.//tr) so only rows inside this table are matched
        trs = table.xpath('.//tr')[1:]  # skip the header row
        items = []
        for tr in trs:
            pre_item = DailiIpsItem()
            pre_item['ip'] = tr.xpath('td[2]/text()').extract()[0]
            pre_item['port'] = tr.xpath('td[3]/text()').extract()[0]
            pre_item['position'] = tr.xpath('string(td[4])').extract()[0].strip()
            pre_item['type'] = tr.xpath('td[6]/text()').extract()[0]
            pre_item['speed'] = tr.xpath('td[7]/div/@title').re(r'\d+\.\d*')[0]
            pre_item['last_check_time'] = tr.xpath('td[10]/text()').extract()[0]
            items.append(pre_item)
        return items
While writing the spider, you can use the command-line tool scrapy shell <url> to test the XPath expressions for the data you want to extract, which is much more efficient.
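A quick sketch of such a session (the XPath mirrors the spider's parse method; the values returned depend on the live page, so no output is shown here):

scrapy shell 'http://www.xicidaili.com/nn/1'
...
>>> table = response.xpath('//table[@id="ip_list"]')[0]
>>> rows = table.xpath('.//tr')[1:]                    # skip the header row
>>> rows[0].xpath('td[2]/text()').extract()            # first proxy IP on the page
>>> rows[0].xpath('td[7]/div/@title').re(r'\d+\.\d*')  # speed value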
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import MySQLdb


class DailiIpsPipeline(object):
    # this method must return a dict or item carrying the data
    def process_item(self, item, spider):
        DBS = spider.settings.get('DBS')
        con = MySQLdb.connect(**DBS)
        # make the connection use the utf8 character set
        con.set_character_set('utf8')
        cur = con.cursor()
        insert_sql = (
            "insert into proxy (ip, port, position, type, speed, last_check_time) "
            "values (%s,%s,%s,%s,%s,%s);"
        )
        values = (item['ip'], item['port'], item['position'], item['type'],
                  item['speed'], item['last_check_time'])
        # insert into the database
        try:
            cur.execute(insert_sql, values)
        except Exception, e:
            print "insert failed: ", e
            con.rollback()
        else:
            con.commit()
        cur.close()
        con.close()
        return item
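The pipeline pulls its connection parameters from a DBS dict in settings.py, which is not shown in the post. A minimal sketch of what it could look like, with placeholder host, user, password and database name (not values from the source):

# settings.py (sketch): parameters consumed by MySQLdb.connect(**DBS)
DBS = {
    'host': '127.0.0.1',        # placeholder
    'user': 'root',             # placeholder
    'passwd': 'your_password',  # placeholder
    'db': 'daili_ips',          # placeholder database name
}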
Note: when I first wrote this I left out the con.set_character_set('utf8') line and got the following error:
UnicodeEncodeError: 'latin-1' codec can't encode character
But I had already set the character set to utf8 when creating the table. After looking into it: by default MySQLdb tries to handle everything as the latin1 character set, so the fix is to set the charset of the connection and the cursor to the encoding you want:
con = MySQLdb.connect(...)
# set the connection encoding
con.set_character_set('utf8')
cur = con.cursor()
# set the cursor encoding
cur.execute('SET NAMES utf8;')
cur.execute('SET CHARACTER SET utf8;')
cur.execute('SET CHARACTER_SET_CONNECTION=utf8;')
In my tests, setting the encoding only on the connection (con) was already enough to avoid the error, so the pipeline above does not set the cursor encoding.
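As an alternative, MySQLdb also accepts a charset argument on connect() itself, which sets the connection character set in one step. A minimal sketch with placeholder credentials:

import MySQLdb

# charset='utf8' makes the connection use utf8 from the start
con = MySQLdb.connect(host='127.0.0.1', user='root',
                      passwd='your_password', db='daili_ips',
                      charset='utf8')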
mysql> create table proxy(
    -> id int primary key auto_increment,
    -> ip varchar(20),
    -> port varchar(20),
    -> position varchar(20),
    -> type varchar(20),
    -> speed varchar(20),
    -> last_check_time varchar(20)
    -> )charset=utf8;
Query OK, 0 rows affected (0.01 sec)

mysql>
Then edit settings.py and un-comment the item pipeline section:
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'daili_ips.pipelines.SomePipeline': 300,
#}
改成dom
ITEM_PIPELINES = {
    'daili_ips.pipelines.DailiIpsPipeline': 300,
}
The number after each pipeline is normally in the 0-1000 range; when several pipelines are enabled it determines the execution order, with smaller numbers running first, as in the sketch below.
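For example, if a second, purely hypothetical pipeline were registered, the entry with the smaller number would run first:

ITEM_PIPELINES = {
    'daili_ips.pipelines.DailiIpsPipeline': 300,   # runs first (smaller number)
    'daili_ips.pipelines.SomeOtherPipeline': 500,  # hypothetical pipeline, runs second
}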
Finally, run the spider:

scrapy crawl xici