items.py
import scrapy


class InfoItem(scrapy.Item):
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    small_image = scrapy.Field()
    small_image_path = scrapy.Field()
    big_image = scrapy.Field()
    big_image_path = scrapy.Field()
    code = scrapy.Field()
    date = scrapy.Field()
    lengths = scrapy.Field()
    author = scrapy.Field()
    cate = scrapy.Field()
    av_artor = scrapy.Field()
spider/jxxx.py
# -*- coding: utf-8 -*-
import scrapy
from urllib import parse
from scrapy.http import Request
from JaSpider.items import InfoItem
from JaSpider.utils.common import get_md5


class JxxxSpider(scrapy.Spider):
    name = 'jxxx'
    allowed_domains = ['www.jxxx.com']
    start_urls = ['http://www.jxxx.com/cn/vl_update.php']

    def parse(self, response):
        for i in response.css('.video'):
            small_image = i.css('img::attr(src)').extract_first()  # small cover image; passed on to parse_info via meta
            link = i.css('a::attr(href)').extract_first()  # relative URL of the detail page
            real_url = parse.urljoin(response.url, link)  # full URL of the detail page
            yield Request(url=real_url, meta={'small_image': small_image}, callback=self.parse_info)
        # crawl and request the next page
        next_url = response.css('.page_selector .page.next::attr(href)').extract_first()
        perfect_next_url = parse.urljoin(response.url, next_url)
        if next_url:
            yield Request(url=perfect_next_url, callback=self.parse)

    def parse_info(self, response):
        small_image = "http:" + response.meta['small_image']
        big_image = "http:" + response.xpath('//div[@id="video_jacket"]/img/@src').extract_first()
        code = response.css('#video_id .text::text').extract_first()
        date = response.css('#video_date .text::text').extract_first()
        lengths = response.css('#video_length .text::text').extract_first()
        # fall back to "不明" ("unknown") when no director is listed
        author = response.css('#video_director .director a::text').extract_first() if response.css('#video_director .director a::text').extract_first() else "不明"
        cate = ','.join([i.css('a::text').extract_first() for i in response.css('#video_genres .genre') if i.css('a::text').extract_first()])
        av_artor = ','.join([i.css('a::text').extract_first() for i in response.css('.star') if i.css('a::text').extract_first()])
        # print("http:" + small_image)
        info_item = InfoItem()
        info_item['url'] = response.url
        info_item['url_object_id'] = get_md5(response.url)
        info_item['small_image'] = small_image
        info_item['big_image'] = [big_image]
        info_item['code'] = code
        info_item['date'] = date
        info_item['lengths'] = lengths
        info_item['author'] = author
        info_item['cate'] = cate
        info_item['av_artor'] = av_artor
        yield info_item
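With the item and spider in place, a quick way to check the parsing is to run the spider by its name (assuming a standard Scrapy project layout):

scrapy crawl jxxx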
Enable the pipeline in settings.py
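To have Scrapy download the big cover images, the built-in ImagesPipeline can be switched on here. A minimal sketch, assuming the images should land in an images/ folder inside the project (the folder name and priority value are assumptions):

import os

ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,  # built-in image downloader
}
IMAGES_URLS_FIELD = 'big_image'  # item field holding the list of image URLs
project_dir = os.path.abspath(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(project_dir, 'images')  # where downloaded images are saved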
Note:
spider/jxxx.py
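What matters here is that the ImagesPipeline iterates over the URL field, so it must hold a list rather than a plain string; this is why parse_info assigns the big cover like this (taken from the spider above):

info_item['big_image'] = [big_image]  # a list, even for a single URL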
To customize the behavior further:
settings.py
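If a project-specific pipeline is used instead of the built-in one, settings.py only needs to point ITEM_PIPELINES at it. A sketch, where JxxxImagePipeline is a hypothetical class name used for illustration:

ITEM_PIPELINES = {
    'JaSpider.pipelines.JxxxImagePipeline': 1,  # custom subclass, see pipeline.py below
}
IMAGES_URLS_FIELD = 'big_image'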
pipeline.py
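A minimal sketch of such a subclass, assuming the goal is to write the stored file path back into the item's big_image_path field (the class name and this exact behavior are assumptions, not necessarily the original code):

from scrapy.pipelines.images import ImagesPipeline


class JxxxImagePipeline(ImagesPipeline):  # hypothetical custom pipeline
    def item_completed(self, results, item, info):
        # results is a list of (success, file_info) tuples built by ImagesPipeline;
        # file_info['path'] is the saved path relative to IMAGES_STORE
        for ok, value in results:
            if ok:
                item['big_image_path'] = value['path']
        return item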
Supplement
Create a new utils/common.py
import hashlib


def get_md5(url):
    # hash the URL to a fixed-length id; md5 needs bytes, so encode str first
    if isinstance(url, str):
        url = url.encode("utf-8")
    m = hashlib.md5()
    m.update(url)
    return m.hexdigest()


if __name__ == "__main__":
    a = get_md5('http://www.haddu.com')
    print(a)