On Windows:
scrapy startproject myproject   # myproject is your project name
cd myproject                    # change into the project directory (use your own project name)
scrapy genspider myspider <domain to crawl>   # myspider is the spider name, followed by the domain to crawl
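After these two commands, Scrapy generates its standard project scaffold, roughly like this (the layout below is the usual Scrapy template; middlewares.py may be missing on very old versions):

myproject/
    scrapy.cfg            # deploy configuration
    myproject/
        __init__.py
        items.py          # item definitions (edited below)
        middlewares.py
        pipelines.py
        settings.py       # project settings
        spiders/
            __init__.py
            myspider.py   # the spider created by genspider (edited below)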
Start the spider:
scrapy crawl <spider name>
Configure the project in settings.py.
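The post does not say which settings it changes. A minimal setup commonly used for a small demo like this is sketched below; these are standard Scrapy settings, but the exact values are assumptions rather than taken from the original project:

ROBOTSTXT_OBEY = False   # do not block requests based on robots.txt for this demo
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'   # identify as a normal browser
DOWNLOAD_DELAY = 1       # be polite: wait 1 second between requests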
Write the spider in your myspider.py file:
import scrapy
from ..items import PerItem


class LishiSpider(scrapy.Spider):
    name = 'myspider'  # spider name
    # allowed_domains = ['http://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=2&start=1']
    start_urls = ['http://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=2&start=1']  # starting URL to crawl

    def parse(self, response):
        # title
        title = response.xpath('/html/body/li[@class="categoryem"]/div[@class="vervideo-bd"]/a//div[@class="vervideo-title"]/text()').extract()
        # link
        t_url = response.xpath('/html/body/li[@class="categoryem"]/div[@class="vervideo-bd"]/a/@href').extract()
        # duration
        data = response.xpath('/html/body/li[@class="categoryem"]/div[@class="vervideo-bd"]/a//div[@class="cm-duration"]/text()').extract()
        # the scraped title, link and duration are passed on through items.py
        for i in range(len(title)):
            item = PerItem()
            item['title'] = title[i]
            item['t_url'] = 'http://www.pearvideo.com/' + t_url[i]
            item['data'] = data[i]
            # yield item
            print(item)
Note: the scraped fields must match the ones defined in items.py.
import scrapy


class PerItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    t_url = scrapy.Field()
    data = scrapy.Field()
    shi = scrapy.Field()
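The spider above prints each item instead of yielding it. If you re-enable the commented-out yield item, the items are sent to pipelines.py; the sketch below is only an illustration of what such a pipeline could look like (the class name PerPipeline and the output file videos.jl are made up, not from the original post), and it would need to be enabled through the standard ITEM_PIPELINES setting:

import json


class PerPipeline:
    def open_spider(self, spider):
        # open one output file per crawl
        self.outfile = open('videos.jl', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # append each scraped item as one JSON line
        self.outfile.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.outfile.close()

# in settings.py:
# ITEM_PIPELINES = {'myproject.pipelines.PerPipeline': 300}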
Finally, start the spider:
scrapy crawl myspider
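If yield item is re-enabled, the results can also be exported directly with Scrapy's built-in feed export, for example scrapy crawl myspider -o videos.json (the filename here is just an example).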