# -*- coding: utf-8 -*-
import scrapy
from qiubai.items import QiubaiItem


class QiushibaiSpider(scrapy.Spider):
    name = 'qiushibai'
    # allowed_domains = ['www.qiushibaike.com/text/']
    start_urls = ['http://www.qiushibaike.com/text/']

    def parse(self, response):
        # Parsing the target content with XPath is recommended
        # (the framework ships with an XPath parsing interface)
        # Extract each post's content and author
        div_list = response.xpath('//div[@id="content-left"]/div')
        # data_list = []
        for div in div_list:
            # The content matched by XPath is stored in Selector objects
            # extract() pulls the stored data value out of the Selector object
            # author = div.xpath("./div/a[2]/h2/text()").extract()[0]
            # extract_first() == extract()[0]
            author = div.xpath("./div/a[2]/h2/text()").extract_first()
            content = div.xpath('.//div[@class="content"]/span/text()').extract_first()
            # print(author, '---------------')
            # print(content)
            # data = {
            #     "author": author,
            #     "content": content
            # }
            # Store the parsed values in an item object
            item = QiubaiItem()
            item["author"] = author
            item["content"] = content
            # Hand the item object over to the pipeline
            yield item
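
# Minimal standalone sketch of extract() vs extract_first(), which the comments
# above rely on. The HTML fragment below is invented for illustration only; it
# is not the live qiushibaike markup.
from scrapy.selector import Selector

html = '<div id="content-left"><div><span class="author">some author</span></div></div>'
sel = Selector(text=html)

# extract() returns a list of matched strings (possibly empty) ...
print(sel.xpath('//span[@class="author"]/text()').extract())        # ['some author']
# ... while extract_first() returns the first match, or None when nothing
# matched, so it never raises IndexError the way extract()[0] can.
print(sel.xpath('//span[@class="author"]/text()').extract_first())  # 'some author'
print(sel.xpath('//span[@class="missing"]/text()').extract_first()) # None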
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class QiubaiItem(scrapy.Item):
    # define the fields for your item here like:
    author = scrapy.Field()
    content = scrapy.Field()
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class QiubaiPipeline(object):
    fp = None

    # Called exactly once, when the spider starts running
    def open_spider(self, spider):
        print("Spider started")
        self.fp = open("./data_record.txt", "w", encoding="utf-8")

    # Receives the item objects submitted by the spider file and persists the
    # page data stored in them.
    # The parameter `item` is the received item object.
    # This method runs once for every item the spider submits to the pipeline.
    def process_item(self, item, spider):
        # Pull the stored data out of the item
        author = item["author"]
        content = item["content"]
        # Persist it to the file
        self.fp.write(author + ":" + content + "\n\n\n")
        return item

    # Called exactly once, when the spider finishes
    def close_spider(self, spider):
        print("Spider finished")
        self.fp.close()
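
# As the header comment says, the pipeline only runs if it is registered in
# ITEM_PIPELINES. A minimal sketch of the corresponding settings.py entry; the
# dotted path 'qiubai.pipelines.QiubaiPipeline' assumes the default layout
# produced by `scrapy startproject qiubai`.
ITEM_PIPELINES = {
    # lower numbers run earlier; 300 is the conventional default priority
    'qiubai.pipelines.QiubaiPipeline': 300,
}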
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import pymysql


class QiubaiPipeline(object):
    conn = None
    cursor = None

    # Called exactly once, when the spider starts running
    def open_spider(self, spider):
        print("Spider started")
        # self.fp = open("./data_record.txt", "w", encoding="utf-8")
        self.conn = pymysql.Connect(host="127.0.0.1", port=3306, user="root",
                                    password="1228", db="qiubai")

    # Receives the item objects submitted by the spider file and persists the
    # page data stored in them.
    # The parameter `item` is the received item object.
    # This method runs once for every item the spider submits to the pipeline.
    def process_item(self, item, spider):
        # Pull the stored data out of the item
        author = item["author"]
        content = item["content"]
        # Persist it to MySQL
        sql = "insert into data values ('%s', '%s')" % (author, content)
        print(sql)
        # self.fp.write(author + ":" + content + "\n\n\n")
        self.cursor = self.conn.cursor()
        try:
            self.cursor.execute(sql)
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    # Called exactly once, when the spider finishes
    def close_spider(self, spider):
        print("Spider finished")
        # self.fp.close()
        self.conn.close()
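
# The INSERT above assumes a database `qiubai` with a two-column table named
# `data`, neither of which the pipeline creates. A one-off setup sketch of the
# schema it implies, reusing the same local credentials; the column names and
# types are assumptions, only the (author, content) order is dictated by the
# INSERT statement.
import pymysql

conn = pymysql.Connect(host="127.0.0.1", port=3306, user="root", password="1228")
cursor = conn.cursor()
cursor.execute("CREATE DATABASE IF NOT EXISTS qiubai CHARACTER SET utf8mb4")
cursor.execute("USE qiubai")
cursor.execute(
    "CREATE TABLE IF NOT EXISTS data ("
    "  author  VARCHAR(255),"
    "  content TEXT"
    ")"
)
conn.commit()
conn.close()

# Note that the pipeline builds its SQL with string formatting, which breaks on
# quotes in the scraped text; passing parameters separately, e.g.
# cursor.execute("insert into data values (%s, %s)", (author, content)),
# is the safer pymysql form.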