scrapy簡單爬蟲

時間 2019-11-09

標籤 scrapy 簡單爬蟲欄目 Python 简体版

原文鏈接

# -*- coding: utf-8 -*-
# This is only the spider module. Run it from a terminal (for example the one
# inside PyCharm) using the spider's name: scrapy crawl insists

import scrapy
from insist.items import InsistItem

class InsistsSpider(scrapy.Spider):
    """Spider that collects name/title/info entries from the itcast.cn teacher page.

    It is addressed by its ``name`` attribute (``insists``) when started
    from the command line.
    """

    name = 'insists'
    allowed_domains = ['itcast.cn']
    start_urls = ['http://www.itcast.cn/channel/teacher.shtml']

    def parse(self, response):
        """Build one ``InsistItem`` per ``div.li_txt`` node in *response*.

        Args:
            response: The downloaded page; only its ``xpath`` method is used.

        Returns:
            A list of ``InsistItem`` objects with the ``name``, ``title`` and
            ``info`` fields filled from the first ``<h3>``, ``<h4>`` and
            ``<p>`` text node of each entry. A field whose element is missing
            is set to an empty string instead of aborting the whole page.
        """
        items = []
        for node in response.xpath("//div[@class='li_txt']"):
            # Item class declared in the project's items module.
            item = InsistItem()
            # extract_first() yields the first matched text as a string; the
            # default keeps an entry with a missing tag from raising
            # IndexError and discarding every item gathered so far.
            item['name'] = node.xpath("./h3/text()").extract_first(default='')
            item['title'] = node.xpath("./h4/text()").extract_first(default='')
            item['info'] = node.xpath("./p/text()").extract_first(default='')
            items.append(item)
        return items

相關標籤/搜索