Requirement: crawl each complaint post's name, the post's URL, the post's title, and the post's content.
1. Rule-based crawler (CrawlSpider) -- `scrapy genspider -t crawl Question wz.sun0769.com`
**Question.py**
```python
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from Dongguan.items import DongguanItem


class QuestionSpider(CrawlSpider):
    name = 'Question'
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=0']

    rules = (
        # A Rule without a callback defaults to follow=True (deep crawl),
        # so this rule just keeps following the paginated list pages.
        Rule(LinkExtractor(allow=r'type=4'), follow=True),
        # Match the individual post pages.
        Rule(LinkExtractor(allow=r'question/\d+/\d+.shtml'),
             process_links="handle_links", callback='parse_item', follow=True),
    )

    # process_links hook: broken links can be repaired here before being requested
    def handle_links(self, links):
        for link in links:
            print("link====", link)
        return links

    # Detailed information of a post
    def parse_item(self, response):
        item = DongguanItem()
        # Post URL
        url = response.url
        title_number = response.xpath(
            '//div[@class="pagecenter p3"]/div/div/div/strong/text()').extract()
        if len(title_number) > 0:
            title_number = title_number[0]
            # title_number looks like "...\xa0\xa0編號:191166"
            # Post number
            number = title_number.split("\xa0\xa0")[1]
            number = number.split(":")[1]
            # Post title
            title = title_number.split("\xa0\xa0")[0]
            title = title.split(":")[1]
            item["title"] = title
            item["number"] = number
        content = response.xpath(
            '//div[@class="c1 text14_2"]/text()|//div[@class="contentext"]/text()').extract()
        # Join the list of text fragments into a single string
        content = "".join(content).strip()
        item["url"] = url
        item["content"] = content
        yield item
```
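The two LinkExtractor regexes do the routing here: `type=4` keeps the crawler walking the paginated list, while `question/\d+/\d+.shtml` picks out the post pages. A quick way to sanity-check such patterns before a full crawl is to run a standalone LinkExtractor over a response. The sketch below uses a made-up HTML snippet, so the anchor URLs are illustrative only:

```python
# Minimal sketch for sanity-checking the Rule regexes outside a crawl.
# The HTML body is a made-up stand-in for a real list page.
from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor

html = b'''
<a href="http://wz.sun0769.com/index.php/question/questionType?type=4&page=30">next page</a>
<a href="http://wz.sun0769.com/html/question/201807/123456.shtml">a post</a>
'''
response = HtmlResponse(
    url="http://wz.sun0769.com/index.php/question/questionType?type=4&page=0",
    body=html, encoding="utf-8")

list_pages = LinkExtractor(allow=r'type=4')                  # first Rule
post_pages = LinkExtractor(allow=r'question/\d+/\d+.shtml')  # second Rule

print([l.url for l in list_pages.extract_links(response)])
print([l.url for l in post_pages.extract_links(response)])
```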
2. Plain Spider version -- `scrapy genspider Question2 wz.sun0769.com`
**Question2.py**
```python
import scrapy
from Dongguan.items import DongguanItem


class Question2Spider(scrapy.Spider):
    name = 'Question2'
    allowed_domains = ['wz.sun0769.com']
    # Page offset
    offset = 0
    url = "http://wz.sun0769.com/index.php/question/questionType?type=4&page="
    start_urls = [url + str(offset)]

    # Parses the detailed content of a post
    def process_item(self, response):
        item = DongguanItem()
        # Post URL
        url = response.url
        title_number = response.xpath(
            '//div[@class="pagecenter p3"]/div/div/div/strong/text()').extract()
        if len(title_number) > 0:
            title_number = title_number[0]
            # title_number looks like "...\xa0\xa0編號:191166"
            # Post number
            number = title_number.split("\xa0\xa0")[1]
            number = number.split(":")[1]
            # Post title
            title = title_number.split("\xa0\xa0")[0]
            title = title.split(":")[1]
            item["title"] = title
            item["number"] = number
        content = response.xpath(
            '//div[@class="c1 text14_2"]/text()|//div[@class="contentext"]/text()').extract()
        # Join the list of text fragments into a single string
        content = "".join(content).strip()
        item["url"] = url
        item["content"] = content
        yield item

    def parse(self, response):
        # Collect the links of all posts on the current list page
        current_page_link = response.xpath('//a[@class="news14"]/@href').extract()
        print(current_page_link)
        for link in current_page_link:
            # Request each post's detail page
            yield scrapy.Request(link, callback=self.process_item)
        # Build the next list page (offsets step by 30 up to the last page)
        if self.offset < 93630:
            self.offset += 30
            # URL of the next page
            new_url = self.url + str(self.offset)
            yield scrapy.Request(new_url, callback=self.parse)
```
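The hard-coded 93630 cap mirrors the last page offset at the time of writing, which is brittle. A hedged alternative is to follow the site's own next-page link instead of counting offsets; this is a drop-in replacement for the `parse` method above, and the `>` anchor XPath is an assumption about the pagination markup, not taken from the original spider:

```python
# Hedged alternative to the fixed 93630 cap: follow the site's own
# "next page" anchor. The XPath for the anchor is an assumption about
# the pagination markup, not taken from the original spider.
def parse(self, response):
    for link in response.xpath('//a[@class="news14"]/@href').extract():
        yield scrapy.Request(link, callback=self.process_item)
    next_page = response.xpath('//a[text()=">"]/@href').extract_first()
    if next_page:
        yield scrapy.Request(next_page, callback=self.parse)
```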
3. Shared by the CrawlSpider (rule-based) and the plain Spider crawlers ---- pipelines.py
```python
import json


class DongguanPipeline(object):
    def open_spider(self, spider):
        # Create the output file, named after the spider
        self.file = open(spider.name + ".json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        # Convert the item to a plain Python dict
        python_dict = dict(item)
        # Serialize to a JSON string, keeping non-ASCII characters readable
        python_str = json.dumps(python_dict, ensure_ascii=False) + "\n"
        self.file.write(python_str)
        return item

    def close_spider(self, spider):
        self.file.close()
```
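Because `process_item` writes one JSON object per line, the output file is JSON Lines rather than a single JSON array. A minimal sketch of reading it back (the filename follows `spider.name`, so `Question.json` here):

```python
import json

# Read the JSON Lines output back into a list of dicts.
items = []
with open("Question.json", encoding="utf-8") as f:
    for line in f:
        items.append(json.loads(line))
print(len(items), "items loaded")
```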
4. Shared by the CrawlSpider (rule-based) and the plain Spider crawlers ---- items.py
```python
import scrapy


class DongguanItem(scrapy.Item):
    # define the fields for your item here like:
    # Title of each post
    title = scrapy.Field()
    # Number of each post
    number = scrapy.Field()
    # Content of each post
    content = scrapy.Field()
    # URL of each post
    url = scrapy.Field()
```
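A `scrapy.Item` behaves like a dict whose keys are restricted to the declared fields, so a typo in a field name fails fast instead of silently writing bad data. A small illustration:

```python
from Dongguan.items import DongguanItem

item = DongguanItem()
item["title"] = "some complaint title"
item["number"] = "191166"
# item["author"] = "x"  # would raise KeyError: 'author' is not a declared Field
print(dict(item))       # {'title': 'some complaint title', 'number': '191166'}
```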
5. Shared by the CrawlSpider (rule-based) and the plain Spider crawlers ---- settings.py
```python
# robots.txt protocol: disabled for this crawl
ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
    'Dongguan.pipelines.DongguanPipeline': 300,
}

# Logging
LOG_FILE = "dongguan.log"
LOG_LEVEL = "DEBUG"

# User agent
USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
```
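With these settings in place, either spider can be started from the project root with `scrapy crawl Question` or `scrapy crawl Question2`; the crawl log goes to dongguan.log and the scraped items to Question.json or Question2.json respectively.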