A quick summary of the Scrapy crawler workflow:
1. Create the project: `scrapy startproject book` (book is the project name)
2. Enter the project directory: `cd book`
3. Generate the spider file: `scrapy genspider jd jd.com` (jd is the spider name, jd.com the allowed domain)
4. Write the spider code
5. Run the spider: `scrapy crawl jd` (jd is the spider name)
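
For reference, step 3 drops a spider skeleton into book/spiders/jd.py, roughly like this (the exact template varies a little between Scrapy versions):

```python
# book/spiders/jd.py -- skeleton produced by `scrapy genspider jd jd.com` (approximate)
import scrapy


class JdSpider(scrapy.Spider):
    name = 'jd'
    allowed_domains = ['jd.com']
    start_urls = ['http://jd.com/']

    def parse(self, response):
        pass
```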
jd.py

```python
# -*- coding: utf-8 -*-
import scrapy
import copy, json


class JdSpider(scrapy.Spider):
    name = 'jd'
    allowed_domains = ['jd.com', 'p.3.cn']
    start_urls = ['https://book.jd.com/booksort.html']

    def parse(self, response):
        # Top-level category list
        dt_list = response.xpath('//div[@class="mc"]/dl/dt')
        for dt in dt_list:
            item = {}
            item["b_cate"] = dt.xpath("./a/text()").extract_first()
            # Sub-category list
            em_list = dt.xpath("./following-sibling::dd[1]/em")
            for em in em_list:
                item["s_href"] = "https:" + em.xpath("./a/@href").extract_first()
                item["s_cate"] = em.xpath("./a/text()").extract_first()
                yield scrapy.Request(
                    item["s_href"],
                    callback=self.parse_book_list,
                    # deepcopy: the same dict is mutated on every loop iteration and
                    # requests are handled asynchronously, so each one needs its own snapshot
                    meta={"item": copy.deepcopy(item)}
                )

    def parse_book_list(self, response):
        item = response.meta["item"]
        book_list = response.xpath("//div[@id='plist']//li")
        for book in book_list:
            # Cover image: lazy-loaded images keep the URL in data-lazy-img instead of src
            img_url = book.xpath(
                ".//div[@class='p-img']/a/img/@src").extract_first()
            if img_url is None:
                img_url = book.xpath(
                    ".//div[@class='p-img']/a/img/@data-lazy-img").extract_first()
            item["book_img"] = "https:" + img_url
            item["book_title"] = book.xpath(
                ".//div[@class='p-name']/a/em/text()").extract_first().strip()
            item["book_man"] = book.xpath(
                ".//div[@class='p-bookdetails']//span[@class='author_type_1']/a/text()").extract()
            item["book_publish_place"] = book.xpath(
                ".//div[@class='p-bookdetails']//span[@class='p-bi-store']/a/@title").extract_first()
            skuId = book.xpath(
                ".//div[@class='p-operate']/a/@data-sku").extract_first()
            # The price is not in the page HTML; fetch it from JD's price API by SKU id
            yield scrapy.Request(
                'https://p.3.cn/prices/mgets?skuIds=J_{}'.format(skuId),
                callback=self.parse_book_price_json,
                meta={"item": copy.deepcopy(item)}
            )
        # Pagination: follow the next-page link
        next_url = response.xpath("//a[@class='pn-next']/@href").extract_first()
        if next_url is not None:
            yield scrapy.Request(
                "https:" + next_url,
                callback=self.parse_book_list,
                meta={"item": copy.deepcopy(item)}
            )

    def parse_book_price_json(self, response):
        item = response.meta["item"]
        item["book_price"] = json.loads(response.body.decode())[0]["p"]
        item.pop("s_href")
        yield item
```
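
parse_book_price_json assumes the price endpoint answers with a one-element JSON array whose "p" field holds the price. A standalone sketch of that parsing step, with a made-up sample payload (only the "p" field matters to the spider; the real response may carry extra fields):

```python
import json

# Hypothetical sample of what https://p.3.cn/prices/mgets?skuIds=J_<sku> returns.
sample_body = b'[{"id": "J_12345678", "p": "25.00", "m": "32.00"}]'

price = json.loads(sample_body.decode())[0]["p"]
print(price)  # -> "25.00" (a string; convert before doing arithmetic)
```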
pipelines.py

```python
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql


class BookPipeline(object):
    def open_spider(self, spider):
        # One MySQL connection per spider run
        self.client = pymysql.Connect(
            host='localhost',
            port=3306,
            user='root',
            passwd='6028',
            db='jd_book',
            charset='utf8'
        )

    def process_item(self, item, spider):
        print(item)
        cursor = self.client.cursor()
        cursor.execute(
            "insert into book (b_cate, s_cate, book_img, book_title, book_man, "
            "book_publish_place, book_price) values (%s, %s, %s, %s, %s, %s, %s)",
            (item["b_cate"], item["s_cate"], item["book_img"], item["book_title"],
             item["book_man"][0] if item["book_man"] else None,
             item["book_publish_place"], item["book_price"]))
        self.client.commit()
        cursor.close()
        return item

    def close_spider(self, spider):
        self.client.close()
```
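
The pipeline expects a jd_book database with a book table to exist already. A one-off setup sketch with pymysql; the column types and lengths are my own assumptions, tune them to your data:

```python
import pymysql

# Create the jd_book.book table the pipeline writes to.
# Column names match the INSERT above; types/lengths are assumptions.
client = pymysql.Connect(host='localhost', port=3306, user='root',
                         passwd='6028', charset='utf8')
cursor = client.cursor()
cursor.execute("CREATE DATABASE IF NOT EXISTS jd_book DEFAULT CHARACTER SET utf8")
cursor.execute("""
    CREATE TABLE IF NOT EXISTS jd_book.book (
        id INT AUTO_INCREMENT PRIMARY KEY,
        b_cate VARCHAR(100),
        s_cate VARCHAR(100),
        book_img VARCHAR(500),
        book_title VARCHAR(500),
        book_man VARCHAR(200),
        book_publish_place VARCHAR(200),
        book_price VARCHAR(20)
    ) DEFAULT CHARACTER SET utf8
""")
client.commit()
cursor.close()
client.close()
```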
settings.py

```python
# -*- coding: utf-8 -*-
BOT_NAME = 'book'

SPIDER_MODULES = ['book.spiders']
NEWSPIDER_MODULE = 'book.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Mobile Safari/537.36'

LOG_LEVEL = "WARNING"

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# scrapy_redis: deduplicate requests and keep the scheduler queue in Redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True
REDIS_URL = "redis://127.0.0.1:6379"

ITEM_PIPELINES = {
    'book.pipelines.BookPipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 400,
}
```
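
With the scrapy_redis scheduler and dupefilter enabled, the request queue and fingerprints live in Redis, so a stopped crawl can be resumed (SCHEDULER_PERSIST = True keeps them after the spider closes). A quick way to peek at what ends up in Redis, assuming the redis-py client and the default scrapy_redis key names:

```python
import redis

# Inspect what scrapy_redis keeps for the 'jd' spider. Default key names are
# jd:requests, jd:dupefilter, and jd:items (when RedisPipeline is enabled).
r = redis.Redis(host='127.0.0.1', port=6379)

for key in r.keys('jd:*'):
    t = r.type(key)
    if t == b'zset':        # pending request queue (priority queue by default)
        size = r.zcard(key)
    elif t == b'set':       # dupefilter request fingerprints
        size = r.scard(key)
    elif t == b'list':      # items serialized by RedisPipeline
        size = r.llen(key)
    else:
        size = '?'
    print(key, t, size)
```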
{"error":"pdos_captcha"}
不能爬的太快,或者要弄個代理池,這不我就被封了!!!
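
To avoid the ban, throttle the crawl and/or rotate proxies. A minimal sketch of both, assuming a hand-maintained proxy list (the PROXY_POOL entries below are placeholders, not working proxies):

```python
# settings.py -- throttle the crawl (starting values, tune for your target)
DOWNLOAD_DELAY = 2                  # seconds between requests to the same domain
RANDOMIZE_DOWNLOAD_DELAY = True     # jitter the delay (0.5x - 1.5x)
CONCURRENT_REQUESTS_PER_DOMAIN = 4
AUTOTHROTTLE_ENABLED = True         # let Scrapy adapt the delay to server latency
AUTOTHROTTLE_START_DELAY = 2
AUTOTHROTTLE_MAX_DELAY = 10

# middlewares.py -- minimal random-proxy downloader middleware (sketch)
import random

PROXY_POOL = [
    'http://127.0.0.1:8888',   # placeholder; plug in your own proxy source
]

class RandomProxyMiddleware(object):
    def process_request(self, request, spider):
        # Scrapy routes the request through whatever proxy is set in request.meta
        request.meta['proxy'] = random.choice(PROXY_POOL)

# and enable it in settings.py:
# DOWNLOADER_MIDDLEWARES = {'book.middlewares.RandomProxyMiddleware': 543}
```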