學習網站:麥子scrapy第九集python
import scrapy


class XiciItem(scrapy.Item):
    """Container for one scraped proxy entry.

    Every attribute below is declared as a ``scrapy.Field`` so it can be
    assigned by key on an item instance (e.g. ``item['IP'] = ...``).
    """

    # Proxy address and port.
    IP = scrapy.Field()
    PORT = scrapy.Field()

    # Descriptive attributes of the proxy entry.
    POSITION = scrapy.Field()
    TYPE = scrapy.Field()

    # Measured speed and the time of the last check.
    SPEED = scrapy.Field()
    LAST_CHECK_TIME = scrapy.Field()
# -*- coding: utf-8 -*-
import scrapy

from XiCi.items import XiciItem


class XiciSpider(scrapy.Spider):
    """Spider that scrapes proxy entries from the xicidaili.com "nn" listing.

    One request is issued per listing page; each response is handled by
    :meth:`parse`, which turns the rows of the ``ip_list`` table into
    ``XiciItem`` objects.
    """

    name = "xici"
    allowed_domains = ["xicidaili.com"]
    # Kept for compatibility; start_requests() below builds its own
    # requests and does not read this attribute.
    start_urls = (
        'http://www.xicidaili.com',
    )

    # URL template of a listing page; ``%s`` is replaced by the page number.
    page_url = "http://www.xicidaili.com/nn/%s"
    # Inclusive range of page numbers to request (pages 1 through 205).
    first_page = 1
    last_page = 205

    def start_requests(self):
        """Build the initial requests, one per listing page.

        Returns:
            list: ``scrapy.Request`` objects for pages ``first_page`` through
            ``last_page`` (inclusive). No callback is set on the requests.
        """
        return [
            scrapy.Request(self.page_url % page)
            for page in range(self.first_page, self.last_page + 1)
        ]

    def parse(self, response):
        """Extract proxy items from one listing page.

        Args:
            response: the downloaded listing page.

        Returns:
            list: one ``XiciItem`` per usable table row. The list is empty
            when the page contains no ``ip_list`` table.
        """
        tables = response.xpath('//table[@id="ip_list"]')
        if not tables:
            # A page without the table would otherwise raise IndexError on
            # tables[0]; report it and move on instead.
            self.logger.warning("No ip_list table found in %s", response.url)
            return []

        rows = tables[0].xpath('tr')
        items = []
        # The first row is skipped (presumably the table header).
        for row in rows[1:]:
            item = self._parse_row(row)
            if item is not None:
                items.append(item)
        return items

    def _parse_row(self, row):
        """Convert one table row into a ``XiciItem``.

        Returns ``None`` when the row has no IP or PORT cell text, so a single
        malformed row does not discard the rest of the page. Other cells that
        are missing are stored as ``None``.
        """
        ip = row.xpath('td[3]/text()').extract_first()
        port = row.xpath('td[4]/text()').extract_first()
        if ip is None or port is None:
            return None

        item = XiciItem()
        item['IP'] = ip
        item['PORT'] = port
        # string() flattens nested markup inside the cell into plain text.
        position = row.xpath('string(td[5])').extract_first()
        item['POSITION'] = (position or '').strip()
        item['TYPE'] = row.xpath('td[7]/text()').extract_first()
        # The speed is read from the bar's title attribute; the regex keeps
        # only the decimal number from it.
        item['SPEED'] = row.xpath(
            'td[8]/div[@class="bar"]/@title').re_first(r'\d{0,2}\.\d{0,}')
        item['LAST_CHECK_TIME'] = row.xpath('td[10]/text()').extract_first()
        return item
scrapy crawl xici -o xici.csv