items.py
class IachinaItem(scrapy.Item):
    """Container for one scraped record; one Field per attribute collected."""

    # Field names are upper-case keys filled in by the spider.
    COMPANY = scrapy.Field()
    TYPE = scrapy.Field()
    PRODUCT = scrapy.Field()
    CLAUSE = scrapy.Field()
    CLAUSE_URL = scrapy.Field()
iachina.py
# -*- coding: utf-8 -*-
import scrapy

from IAChina.items import IachinaItem


class IachinaSpider(scrapy.Spider):
    """Crawl old.iachina.cn through four levels of listing pages
    (company -> type -> product -> clause), yielding one ``IachinaItem``
    per clause link found on the final page.
    """

    name = 'iachina'
    allowed_domains = ['old.iachina.cn']
    # Company listing is paginated; pages 1..3 are seeded up front.
    start_urls = [
        'http://old.iachina.cn/product.php?action=company&ttype=2&page={}'.format(i)
        for i in range(1, 4)
    ]

    # NOTE(review): the original guarded each callback with ``if not response:``
    # and logged an error, but Scrapy Response objects are always truthy, so
    # those branches were unreachable; they have been removed. Download errors
    # should be handled via Request ``errback`` if needed.

    def parse(self, response):
        """Parse one company listing page; follow each company link."""
        for sel in response.xpath('//div[@class="prolist"]/ul/li/a'):
            item = IachinaItem()
            item['COMPANY'] = sel.xpath('text()').extract()
            company_url = response.urljoin(sel.xpath('@href').extract_first())
            yield scrapy.Request(url=company_url, meta={'item': item},
                                 callback=self.parse_type)

    def parse_type(self, response):
        """Parse the type listing for a company; follow each type link."""
        for sel in response.xpath('//div[@class="prolist"]/ul/li/a'):
            # Copy the inherited item: the original reused the single object
            # from response.meta across every link on the page (and across
            # concurrent requests), so later assignments clobbered earlier ones.
            item = IachinaItem(response.meta['item'])
            item['TYPE'] = sel.xpath('text()').extract()
            type_url = response.urljoin(sel.xpath('@href').extract_first())
            yield scrapy.Request(url=type_url, meta={'item': item},
                                 callback=self.parse_product)

    def parse_product(self, response):
        """Parse the product listing for a type; follow each product link."""
        for sel in response.xpath('//div[@class="prolist"]/ul/li/a'):
            # Per-link copy — see parse_type for why sharing one item is a bug.
            item = IachinaItem(response.meta['item'])
            item['PRODUCT'] = sel.xpath('text()').extract()
            product_url = response.urljoin(sel.xpath('@href').extract_first())
            yield scrapy.Request(url=product_url, meta={'item': item},
                                 callback=self.parse_clause)

    def parse_clause(self, response):
        """Parse the clause table for a product; yield one item per clause."""
        for sel in response.xpath('//div[@class="prolist"]/table/tr[2]/td/a'):
            # Per-link copy — see parse_type for why sharing one item is a bug.
            item = IachinaItem(response.meta['item'])
            item['CLAUSE'] = sel.xpath('text()').extract()
            item['CLAUSE_URL'] = response.urljoin(
                sel.xpath('@href').extract_first())
            yield item