Requirement: crawl the Sina news navigation page (http://news.sina.com.cn/guide/) for every major category, every subcategory, the child links under each subcategory, and the news content on each child-link page.
Preparation:
a. Install Redis (Windows or Linux)
b. Install Redis Desktop Manager
c. Install scrapy and scrapy-redis
d. Install MongoDB
e. Install MySQL
Create the project: scrapy startproject mysina
Enter the mysina directory: cd mysina
Create the spider: scrapy genspider sina sina.com
Run the project: scrapy crawl sina
1.items.py
import scrapy


class SinaItem(scrapy.Item):
    # major category title
    parent_title = scrapy.Field()
    # link of the major category
    parent_url = scrapy.Field()
    # subcategory title
    sub_title = scrapy.Field()
    # link of the subcategory
    sub_url = scrapy.Field()
    # directory for this major/subcategory pair
    sub_file_name = scrapy.Field()
    # link of the news article
    son_url = scrapy.Field()
    # article title
    head = scrapy.Field()
    # article content
    content = scrapy.Field()
    # path where the article text is finally stored
    son_path = scrapy.Field()
    spider = scrapy.Field()
    url = scrapy.Field()
    crawled = scrapy.Field()
2.spiders/sina_info.py
import os

import scrapy
from scrapy_redis.spiders import RedisSpider

from Sina.items import SinaItem


class SinaInfoSpider(RedisSpider):
    name = 'sinainfospider_redis'
    allowed_domains = ['sina.com.cn']
    # seed the crawl with: lpush sinainfospider:start_urls <start url>
    redis_key = 'sinainfospider:start_urls'
    # start_urls = ['http://news.sina.com.cn/guide/']

    def parse_detail(self, response):
        """Parse the data of a single article page."""
        item = response.meta["item"]
        # link of the article
        item["son_url"] = response.url
        print("response.url===", response.url)
        heads = response.xpath(
            '//h1[@class="main-title"]/text()'
            '|//div[@class="blkContainerSblk"]/h1[@id="artibodyTitle"]/text()').extract()
        # join the nodes into a single unicode string
        head = "".join(heads)
        contents = response.xpath(
            '//div[@class="article"]/p/text()|//div[@id="artibody"]/p/text()').extract()
        content = "".join(contents)
        item["content"] = content
        item["head"] = head
        # print("item=====", item)
        yield item

    # parse the second level (subcategory pages)
    def parse_second(self, response):
        # collect the article links
        # print("parse_second--response.url====", response.url)
        son_urls = response.xpath('//a/@href').extract()
        item = response.meta["item"]
        parent_url = item["parent_url"]
        # print("item====", item)
        for url in son_urls:
            # keep only links that belong to the current category
            if url.startswith(parent_url) and url.endswith(".shtml"):
                yield scrapy.Request(url, callback=self.parse_detail, meta={"item": item})

    def parse(self, response):
        # print("response.url====", response.url)
        # all major category titles
        parent_titles = response.xpath('//h3[@class="tit02"]/a/text()').extract()
        # all major category links
        parent_urls = response.xpath('//h3[@class="tit02"]/a/@href').extract()
        # all subcategory titles
        sub_titles = response.xpath('//ul[@class="list01"]/li/a/text()').extract()
        # all subcategory links
        sub_urls = response.xpath('//ul[@class="list01"]/li/a/@href').extract()
        items = []
        for i in range(len(parent_titles)):
            # e.g. http://news.sina.com.cn/  新聞
            parent_url = parent_urls[i]
            parent_title = parent_titles[i]
            for j in range(len(sub_urls)):
                # e.g. http://news.sina.com.cn/world/  國際
                sub_url = sub_urls[j]
                sub_title = sub_titles[j]
                # a subcategory belongs to a major category if their URL prefixes match
                if sub_url.startswith(parent_url):
                    # create the directory for this category pair
                    sub_file_name = "./Data/" + parent_title + "/" + sub_title
                    if not os.path.exists(sub_file_name):
                        os.makedirs(sub_file_name)
                    item = SinaItem()
                    item["parent_url"] = parent_url
                    item["parent_title"] = parent_title
                    item["sub_url"] = sub_url
                    item["sub_title"] = sub_title
                    item["sub_file_name"] = sub_file_name
                    items.append(item)
        # request each subcategory page
        for item in items:
            sub_url = item["sub_url"]
            # meta={"item": item} passes the SinaItem instance along with the request
            yield scrapy.Request(sub_url, callback=self.parse_second, meta={"item": item})
3.pipelines.py
import json
from datetime import datetime


class ExamplePipeline(object):
    def process_item(self, item, spider):
        # time at which the item was crawled
        item["crawled"] = datetime.utcnow()
        # name of the spider (with the author's suffix appended)
        item["spider"] = spider.name + "_嘮叨"
        return item


class SinaPipeline(object):
    def open_spider(self, spider):
        self.file = open(spider.name + ".json", "w", encoding="utf-8")

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        print("item====", item)
        sub_file_name = item["sub_file_name"]
        print("sub_file_name==", sub_file_name)
        content = item["content"]
        if len(content) > 0:
            file_name = item["son_url"]
            # strip the "http://" prefix, drop the extension, replace "/" with "_"
            file_name = file_name[7:file_name.rfind(".")].replace("/", "_")
            # e.g. './Data/新聞/國內/lslsllll.txt'
            file_path = sub_file_name + "/" + file_name + ".txt"
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(content)
            item["son_path"] = file_path
        return item
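For reference, a minimal standalone sketch of how SinaPipeline derives the output path; the URL and directory below are made-up examples, not real crawled data:

# Hypothetical values, for illustration only.
son_url = "http://news.sina.com.cn/world/2018-08-01/doc-example.shtml"
sub_file_name = "./Data/新聞/國際"

# strip "http://" (7 chars), cut at the last ".", flatten the path
file_name = son_url[7:son_url.rfind(".")].replace("/", "_")
# -> "news.sina.com.cn_world_2018-08-01_doc-example"
file_path = sub_file_name + "/" + file_name + ".txt"
# -> "./Data/新聞/國際/news.sina.com.cn_world_2018-08-01_doc-example.txt"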
4.settings.py
BOT_NAME = 'Sina'

SPIDER_MODULES = ['Sina.spiders']
NEWSPIDER_MODULE = 'Sina.spiders'

# pretend to be a regular browser
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'

# use scrapy_redis's own duplicate filter
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# use scrapy_redis's own scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# persist the queue so the crawl can be paused and resumed from where it left off
SCHEDULER_PERSIST = True

# default queue is SpiderPriorityQueue
# priority queue
SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
# FIFO queue
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
# LIFO stack
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# download delay of 1 second
DOWNLOAD_DELAY = 1

ITEM_PIPELINES = {
    # the project's own pipelines
    'Sina.pipelines.ExamplePipeline': 300,
    'Sina.pipelines.SinaPipeline': 301,
    # also push every item into the redis database
    'scrapy_redis.pipelines.RedisPipeline': 400,
}

# log level
LOG_LEVEL = 'DEBUG'

# redis connection info
REDIS_HOST = "127.0.0.1"
REDIS_PORT = 6379
5.start.py
from scrapy import cmdline

cmdline.execute("scrapy runspider sina_info.py".split())
6. Run start.py; the spider starts up and waits for instructions.
7. In Redis Desktop Manager (or redis-cli), push the start URL onto the key defined by redis_key in the spider:

lpush sinainfospider:start_urls http://news.sina.com.cn/guide/
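The same seeding step can also be done from Python with the redis package; a small sketch, using the same host, port, and key as elsewhere in this post:

import redis

# connect to the redis instance configured in settings.py
client = redis.StrictRedis(host="127.0.0.1", port=6379, db=0)
# push the start URL onto the key declared as redis_key in the spider
client.lpush("sinainfospider:start_urls", "http://news.sina.com.cn/guide/")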
At this point the spider starts crawling data.
8. Saving the data to MongoDB
import json

import pymongo
import redis


def main():
    # redis connection info
    rediscli = redis.StrictRedis(host='127.0.0.1', port=6379, db=0)
    # MongoDB connection info
    mongocli = pymongo.MongoClient(host='localhost', port=27017)
    # database name
    db = mongocli['sina']
    # collection name
    sheet = db['sina_items']

    offset = 0
    while True:
        # blpop pops items in FIFO order, brpop in LIFO order; both block until data arrives
        source, data = rediscli.blpop(["sinainfospider_redis:items"])
        item = json.loads(data.decode("utf-8"))
        sheet.insert_one(item)
        offset += 1
        print(offset)
        try:
            print("Processing: %s " % item)
        except KeyError:
            print("Error processing: %s" % item)


if __name__ == '__main__':
    main()
9. Saving the data to MySQL
import json
import time

import redis
from pymysql import connect

# redis connection
redis_client = redis.StrictRedis(host="127.0.0.1", port=6379, db=0)
# mysql connection
mysql_client = connect(host="127.0.0.1", user="root", password="mysql",
                       database="sina", port=3306, charset='utf8')
cursor = mysql_client.cursor()

i = 1
while True:
    print(i)
    time.sleep(1)
    # block until scrapy_redis pushes the next item
    source, data = redis_client.blpop(["sinainfospider_redis:items"])
    item = json.loads(data.decode())
    print("source===========", source)
    print("item===========", item)
    sql = ("insert into sina_items(parent_url, parent_title, sub_title, sub_url, sub_file_name, "
           "son_url, head, content, crawled, spider) "
           "values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
    params = [item["parent_url"], item["parent_title"], item["sub_title"], item["sub_url"],
              item["sub_file_name"], item["son_url"], item["head"], item["content"],
              item["crawled"], item["spider"]]
    cursor.execute(sql, params)
    mysql_client.commit()
    i += 1
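The script above assumes a sina_items table already exists in the sina database. A minimal sketch of a matching table definition; only the column names come from the insert statement, the types and lengths are assumptions:

from pymysql import connect

# Hypothetical schema sketch; adjust types/lengths to your own data.
create_sql = """
create table if not exists sina_items (
    id int primary key auto_increment,
    parent_url varchar(255),
    parent_title varchar(255),
    sub_title varchar(255),
    sub_url varchar(255),
    sub_file_name varchar(255),
    son_url varchar(255),
    head varchar(255),
    content longtext,
    crawled varchar(64),
    spider varchar(64)
) default charset=utf8
"""

client = connect(host="127.0.0.1", user="root", password="mysql",
                 database="sina", port=3306, charset="utf8")
with client.cursor() as cursor:
    cursor.execute(create_sql)
client.commit()
client.close()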