scrapy簡單爬蟲

# -*- coding: utf-8 -*-
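# This is only the spider file of the project. To run it, open a terminal (e.g. the one in PyCharm) and launch it by the spider's name: scrapy crawl insists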

import scrapy
from insist.items import InsistItem


class InsistsSpider(scrapy.Spider):
    """Spider that scrapes the page listed in ``start_urls`` into ``InsistItem`` objects.

    Every ``<div class="li_txt">`` node on the page becomes one item whose
    ``name``, ``title`` and ``info`` fields are taken from the node's
    ``<h3>``, ``<h4>`` and ``<p>`` text respectively.
    """

    name = 'insists'
    allowed_domains = ['itcast.cn']
    start_urls = ['http://www.itcast.cn/channel/teacher.shtml']

    def parse(self, response):
        """Extract one ``InsistItem`` per ``div.li_txt`` node of ``response``.

        Args:
            response: The downloaded page; only its ``xpath`` method is used.

        Returns:
            A list of populated ``InsistItem`` objects, in document order.
            A field whose element has no text node is stored as ``''``
            instead of raising ``IndexError`` and discarding the whole page.
        """
        items = []
        for node in response.xpath("//div[@class='li_txt']"):
            # Item object (declared in the project's items module) that
            # holds the scraped fields.
            item = InsistItem()
            # extract_first() returns the first matched text as a string;
            # the default keeps a partially filled node from aborting the
            # loop, which indexing extract()[0] would do on an empty match.
            item['name'] = node.xpath("./h3/text()").extract_first(default='')
            item['title'] = node.xpath("./h4/text()").extract_first(default='')
            item['info'] = node.xpath("./p/text()").extract_first(default='')
            items.append(item)
        return items
相關文章
相關標籤/搜索