Scrapy爬蟲框架教程(一)-- Scrapy入門
scrapy startproject dbmovie
cd dbmovie
scrapy genspider dbmovie_spider movie.douban.com/top250
注意,爬蟲名不能和項目名同樣
打開settings.py文件,將ROBOTSTXT_OBEY修改成False。
ROBOTSTXT_OBEY = False
修改User-Agent
DEFAULT_REQUEST_HEADERS = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en', 'Accept-Encoding' : 'gzip, deflate, br', 'Cache-Control' : 'max-age=0', 'Connection' : 'keep-alive', 'Host' : 'movie.douban.com', 'Upgrade-Insecure-Requests' : '1', 'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36', }
scrapy crawl dbmovie_spider
根據前面的分析,咱們須要抓取一共十個字段的信息,如今在items.py文件中定義item
import scrapy


class DoubanItem(scrapy.Item):
    """One movie record scraped from the Douban Top-250 chart."""

    # Rank on the chart
    ranking = scrapy.Field()
    # Movie title
    title = scrapy.Field()
    # Director
    director = scrapy.Field()
    # One-line tagline (may be missing for some movies)
    movie_desc = scrapy.Field()
    # Score
    rating_num = scrapy.Field()
    # Number of ratings
    people_count = scrapy.Field()
    # Release date
    online_date = scrapy.Field()
    # Country of production
    country = scrapy.Field()
    # Genre
    category = scrapy.Field()
這裏須要用到xpath相關知識,偷了個懶,直接用chrome插件獲取
Chrome瀏覽器獲取XPATH的方法----經過開發者工具獲取
def parse(self, response):
    """Parse one Top-250 listing page.

    Yields one ``DoubanItem`` per movie on the page, then follows the
    "next page" link (if any) back into this same callback.
    """
    movies = response.xpath('//div[@class="item"]')
    for movie in movies:
        # BUG FIX: create a fresh item per movie. The original reused one
        # DoubanItem instance across the whole loop; since Scrapy pipelines
        # are asynchronous, every yielded reference pointed at the same
        # object and earlier items were overwritten.
        item = DoubanItem()

        # Rank
        item['ranking'] = movie.xpath('div[@class="pic"]/em/text()').extract()[0]
        # Title (only the first <span> — the primary title)
        item['title'] = movie.xpath('div[@class="info"]/div[1]/a/span/text()').extract()[0]

        # First text node of <p>: "director ... / actors ..." — keep the part
        # before the first non-breaking space, with whitespace stripped.
        item['director'] = (
            movie.xpath('div[2]/div[2]/p[1]/text()[1]').extract()[0]
            .replace("\n", "").replace(" ", "").split('\xa0')[0]
        )

        # Second text node: "date / country / category". Extract it ONCE
        # (the original ran the identical xpath three times) and split.
        info_line = movie.xpath('div[2]/div[2]/p[1]/text()[2]').extract()[0].replace("\n", "")
        parts = info_line.split("/")
        item['online_date'] = parts[0].replace('\xa0', '').replace(" ", "")
        item['country'] = parts[1].replace('\xa0', '')
        item['category'] = parts[2].replace('\xa0', '').replace(" ", "")

        # Tagline is absent for some movies; guard before indexing.
        # BUG FIX: store the string itself (desc[0]), not the list — the
        # original stored a Python list when present but a string when
        # absent, which broke the MySQL insert downstream.
        desc = movie.xpath(
            'div[@class="info"]/div[@class="bd"]/p[@class="quote"]/span/text()'
        ).extract()
        item['movie_desc'] = desc[0] if desc else ' '

        item['rating_num'] = movie.xpath(
            'div[@class="info"]/div[@class="bd"]/div[@class="star"]'
            '/span[@class="rating_num"]/text()'
        ).extract()[0]
        item['people_count'] = movie.xpath(
            'div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[4]/text()'
        ).extract()[0]
        yield item

    # Follow pagination. response.urljoin resolves the relative href
    # (e.g. "?start=25&filter=") against the current page URL robustly,
    # instead of hand-concatenating strings.
    next_url = response.xpath('//span[@class="next"]/a/@href').extract()
    if next_url:
        yield scrapy.Request(response.urljoin(next_url[0]),
                             callback=self.parse, dont_filter=True)
注意1064錯誤,表中字段包含mysql關鍵字致使
Scrapy入門教程之寫入數據庫
import pymysql


def dbHandle():
    """Open and return a new connection to the local ``dbmovie`` MySQL database."""
    conn = pymysql.connect(
        host='localhost',
        user='root',
        passwd='pwd',
        db="dbmovie",
        charset='utf8',
        # NOTE(review): use_unicode=False alongside charset='utf8' is unusual
        # (values come back as bytes) — confirm this is intentional.
        use_unicode=False,
    )
    return conn


class DoubanPipeline(object):
    """Persist each scraped movie item into the ``db_info`` table."""

    def process_item(self, item, spider):
        """Insert one item; commit on success, roll back on failure.

        Returns the item unchanged so later pipelines still receive it.
        """
        conn = dbHandle()
        cursor = conn.cursor()
        # Parameterized query — values are escaped by the driver, never
        # interpolated into the SQL string.
        sql = "insert into db_info(ranking,title,director,movie_desc,rating_num,people_count,online_date,country,category) values(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        try:
            cursor.execute(sql, (item['ranking'], item['title'], item['director'],
                                 item['movie_desc'], item['rating_num'],
                                 item['people_count'], item['online_date'],
                                 item['country'], item['category']))
            conn.commit()
        except Exception as e:
            print(e)
            conn.rollback()
        finally:
            # BUG FIX: the original opened a new connection per item and
            # never closed it — a connection leak. Always release resources.
            cursor.close()
            conn.close()
        return item