pip install -i https://pypi.douban.com/simple/ scrapy
scrapy startproject ArticleSpider
main.py 是後面建立、用來運行 scrapy 的入口文件
cd ArticleSpider
scrapy genspider jobbole blog.jobbole.com   # jobbole 是 spider 名,blog.jobbole.com 是網站域名
在 ArticleSpider 目錄下建立 main.py,能夠通過此文件運行 scrapy
"""Programmatic entry point for the crawler.

Running this file is equivalent to typing `scrapy crawl jobbole`
in a shell from the project directory.
"""
from scrapy.cmdline import execute

import sys
import os

# Ensure the project root (the directory containing this file) is on
# sys.path so scrapy can locate the ArticleSpider package when run
# from an IDE or another working directory.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

# Launch the spider named "jobbole".
execute(["scrapy", "crawl", "jobbole"])
以上運行在 Windows 下可能會報錯
settings.py
在 jobbole.py 下:
# -*- coding: utf-8 -*-
"""Spider for blog.jobbole.com: walks the paginated article list and
scrapes title, date, vote count, body text, categories, bookmark count
and comment count from every article detail page."""
import scrapy
from scrapy.http import Request
import re
from urllib import parse


class JobboleSpider(scrapy.Spider):
    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/all-posts/']

    # Non-greedy `.*?` so group(1) captures the WHOLE number.
    # (Greedy `.*(\d+)` would swallow all but the last digit.)
    _NUM_RE = re.compile(r'.*?(\d+).*')

    def parse(self, response):
        """List page: yield one request per article item, then follow
        the "next page" link back into this same method."""
        post_urls = response.css('#archive .floated-thumb .post-thumb a::attr(href)').extract()
        for post_url in post_urls:
            print(post_url)
            # Hand each article URL to the detail-page parser.
            yield Request(url=parse.urljoin(response.url, post_url),
                          callback=self.parse_info)

        next_url = response.css('.next.page-numbers::attr(href)').extract_first()
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url),
                          callback=self.parse)

    def parse_info(self, response):
        """Detail page: extract the article's metadata fields.

        All extract_first() results are guarded with `or ''` so a page
        that is missing an element does not raise AttributeError.
        """
        res_title = response.xpath('//div[@class="entry-header"]/h1/text()').extract_first()

        # Date text looks like "2018/01/01 ·  ..." — drop the separator dot.
        res_date = (response.xpath('//p[@class="entry-meta-hide-on-mobile"]/text()')
                    .extract_first() or '').strip().replace('·', '').strip()

        res_zhan = response.xpath('//span[contains(@class, "vote-post-up")]/h10/text()').extract_first()
        res_content = response.xpath('//div[@class="entry"]/p/text()').extract_first()

        # BUG FIX: extract() returns the list of <a> texts; the original
        # extract_first() returned one string, which the comprehension
        # below then iterated CHARACTER by character.
        res_cate_a = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/a/text()').extract()
        res_cate_b = [i.strip() for i in res_cate_a if not i.strip().endswith('評論')]
        res_cate_c = ','.join(res_cate_b)

        res_shoucang = self._extract_count(
            response.xpath('//div[@class="post-adds"]/span[2]/text()').extract_first())
        res_comment = self._extract_count(
            response.xpath('//div[@class="post-adds"]/a/span/text()').extract_first())

    @classmethod
    def _extract_count(cls, text):
        """Return the first integer in *text* as a string, or 0 when
        *text* is None/empty or contains no digits (matches the
        original fallback value)."""
        match_obj = cls._NUM_RE.match((text or '').strip())
        if match_obj:
            return match_obj.group(1)
        return 0