貓哥教你寫爬蟲 039--存儲數據

時間 2019-12-04

原文鏈接

存儲到mysql數據庫

我們需要安裝一個新的模塊 pymysql

pip install pymysql
複製代碼

建立數據庫book_store

-- Create the book_store database with utf8 as its character set.
create database book_store character set utf8;
複製代碼

建立category分類表, 包含id和分類名

-- Make book_store the current database for the statements that follow.
use book_store;
-- Category table: one row per book category.
create table category(
    id int primary key auto_increment, -- primary key, generated automatically
    name varchar(255) not null         -- category name (required)
);
複製代碼

建立數據表book, 包含id,分類id,圖書名,價格, 還有外鍵

-- Book table: one row per book, each linked to a row in category.
create table book(
	id int primary key auto_increment,          -- primary key, generated automatically
    cid int not null,                           -- id of the category this book belongs to
    title varchar(200) not null,                -- book title
    price decimal(10,2) not null,               -- price with two decimal places
    foreign key(cid) references category(id)    -- cid must match an existing category.id
);
複製代碼

mysql> desc category;
+-------+--------------+------+-----+---------+----------------+
| Field | Type         | Null | Key | Default | Extra          |
+-------+--------------+------+-----+---------+----------------+
| id    | int(11)      | NO   | PRI | NULL    | auto_increment |
| name  | varchar(255) | NO   |     | NULL    |                |
+-------+--------------+------+-----+---------+----------------+
mysql> desc book;
+-------+---------------+------+-----+---------+----------------+
| Field | Type          | Null | Key | Default | Extra          |
+-------+---------------+------+-----+---------+----------------+
| id    | int(11)       | NO   | PRI | NULL    | auto_increment |
| cid   | int(11)       | NO   | MUL | NULL    |                |
| title | varchar(200)  | NO   |     | NULL    |                |
| price | decimal(10,2) | NO   |     | NULL    |                |
+-------+---------------+------+-----+---------+----------------+
4 rows in set (0.02 sec)
複製代碼

使用python把爬取的數據保存到數據庫

原來的代碼...

import requests
from bs4 import BeautifulSoup
# Download the home page; the category list is read from its sidebar.
home_page = BeautifulSoup(requests.get('http://books.toscrape.com/').text, 'html.parser')
with open('books.txt', 'w', encoding='utf8') as file:
    category_items = home_page.find('ul', class_='nav nav-list').find('ul').find_all('li')
    for item in category_items:
        # One heading line per category.
        file.write(item.text.strip() + '\n')
        # Fetch the category's listing page and decode it as UTF-8.
        res = requests.get("http://books.toscrape.com/" + item.find('a')['href'])
        res.encoding = 'utf8'
        listing = BeautifulSoup(res.text, 'html.parser')
        for book in listing.find_all('li', class_="col-xs-6 col-sm-4 col-md-3 col-lg-3"):
            title = book.find('h3').find('a')['title']
            print(title)
            price = book.find('p', class_='price_color').text
            # Indented line under the category heading: "<title>" <price>
            file.write('\t"{}" {}\n'.format(title, price))
複製代碼

加上註釋並稍作改進

import requests
from bs4 import BeautifulSoup
# Download the home page; the category list is read from its sidebar.
home_page = BeautifulSoup(requests.get('http://books.toscrape.com/').text, 'html.parser')
category_items = home_page.find('ul', class_='nav nav-list').find('ul').find_all('li')
for item in category_items:
    category = item.text.strip()  # category name
    print(category)
    # Fetch the category's listing page and decode it as UTF-8.
    res = requests.get("http://books.toscrape.com/" + item.find('a')['href'])
    res.encoding = 'utf8'
    listing = BeautifulSoup(res.text, 'html.parser')
    for book in listing.find_all('li', class_="col-xs-6 col-sm-4 col-md-3 col-lg-3"):
        title = book.find('h3').find('a')['title']  # book title
        price = book.find('p', class_='price_color').text  # book price text
複製代碼

研究一下pymysql 的使用方法

shockerli.net/post/python…

建立mysql鏈接

import pymysql
# Open a connection to the MySQL server on localhost:3306 as user "root",
# selecting the "demo" database and using utf8 as the connection charset.
connection = pymysql.connect(host='localhost',
                             port=3306,
                             user='root',
                             password='root',
                             db='demo',
                             charset='utf8')
複製代碼

生成遊標, 執行sql語句...

# Get a cursor from the open connection.
cursor = connection.cursor()

# Create the `users` table (name is the primary key).
effect_row = cursor.execute(''' CREATE TABLE `users` ( `name` varchar(32) NOT NULL, `age` int(10) unsigned NOT NULL DEFAULT '0', PRIMARY KEY (`name`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8 ''')

# Insert a row; parameters passed as a tuple (or list) fill the %s placeholders.
effect_row = cursor.execute('INSERT INTO `users` (`name`, `age`) VALUES (%s, %s)', ('mary', 18))

# Insert a row; parameters passed as a dict fill the %(key)s placeholders.
info = {'name': 'fake', 'age': 15}
effect_row = cursor.execute('INSERT INTO `users` (`name`, `age`) VALUES (%(name)s, %(age)s)', info)

# Commit the transaction so the statements above take effect.
connection.commit()
複製代碼

一次性執行多條sql語句

# Get a cursor from the open connection.
cursor = connection.cursor()

# Batch insert: the statement is run once per tuple in the list. Because of
# ON DUPLICATE KEY UPDATE, a row whose key already exists gets its age
# updated instead of causing an error.
effect_row = cursor.executemany(
    'INSERT INTO `users` (`name`, `age`) VALUES (%s, %s) ON DUPLICATE KEY UPDATE age=VALUES(age)', [
        ('hello', 13),
        ('fake', 28),
    ])

# Commit the transaction so the batch takes effect.
connection.commit()
複製代碼

獲取自增id

# The auto-increment id produced by the cursor's most recent insert.
cursor.lastrowid
複製代碼

查詢數據, 一條或者多條

# Run the SELECT query.
cursor.execute('SELECT * FROM `users`')
# Fetch a single row.
cursor.fetchone()
# Fetch up to N rows (here N = 3).
cursor.fetchmany(3)
# Fetch all rows.
cursor.fetchall()
複製代碼

思路分析

代碼拆解

面向對象的寫法...

from urllib.parse import urljoin

import pymysql
import requests
from bs4 import BeautifulSoup
class Spider():
    """Scrape book categories and books from a site and store them via ``Database``.

    Categories are read from the sidebar of the home page; books are read
    from the listing page each category links to.
    """

    def __init__(self, base_url, timeout=10):
        """Open the database connection and download and parse the home page.

        Args:
            base_url: Root URL of the site. Category links found on the home
                page are resolved against it.
            timeout: Seconds to wait for each HTTP response; without a
                timeout ``requests`` may block indefinitely.
        """
        self.base_url = base_url
        self.timeout = timeout
        self.db = Database('localhost', 'root', 'root', 'book_store')
        self.soup = BeautifulSoup(
            requests.get(self.base_url, timeout=self.timeout).text,
            'html.parser')

    def get_category(self):
        """Store every sidebar category and remember its listing-page URL."""
        sidebar = self.soup.find('ul', class_='nav nav-list').find('ul')
        for item in sidebar.find_all('li'):
            category = item.text.strip()  # category name
            self.db.add_category(category)
            # Resolve the (relative) href against the configured base URL
            # instead of a hard-coded host, so base_url is actually honored.
            self.db.book_link_dict[category] = urljoin(
                self.base_url, item.find('a')['href'])

    def get_book(self):
        """Download each category's listing page and store the books on it.

        NOTE: one request is made per category, so only the books present on
        that single page are stored.
        """
        for cat, link in self.db.book_link_dict.items():
            res = requests.get(link, timeout=self.timeout)
            res.encoding = 'utf8'
            page = BeautifulSoup(res.text, 'html.parser')
            for book in page.find_all('li', class_="col-xs-6 col-sm-4 col-md-3 col-lg-3"):
                title = book.find('h3').find('a')['title']  # book title
                # Drop the first character of the price text (presumably the
                # currency symbol) so the rest can go in a decimal column.
                price = book.find('p', class_='price_color').text[1:]
                self.db.add_book(cat, title, price)

    def start(self):
        """Run the crawl: categories first, then the books of each category."""
        self.get_category()
        self.get_book()
class Database():
    """Small wrapper around a PyMySQL connection for the category/book tables.

    Attributes:
        category_dict: Maps a category name to the id of the row inserted
            for it by ``add_category``.
        book_link_dict: Maps a category name to a URL; this class never
            writes to it, callers fill it in.
        connect: The open PyMySQL connection.
        cursor: Cursor used for every statement.
    """

    def __init__(self, host, username, password, db):
        """Connect to MySQL on port 3306 with the utf8 charset and open a cursor.

        Args:
            host: MySQL server host name.
            username: MySQL user name.
            password: Password for ``username``.
            db: Name of the database to use.
        """
        # Created per instance: class-level dicts would be shared (and
        # mutated) by every Database object.
        self.category_dict = {}
        self.book_link_dict = {}
        self.connect = pymysql.connect(
            host=host, port=3306, user=username, password=password, db=db, charset='utf8'
        )
        self.cursor = self.connect.cursor()

    def add_category(self, name):
        """Insert a category row and return its id.

        A name that was already added through this instance is not inserted
        again; the cached id is returned instead.

        Args:
            name: Category name.

        Returns:
            The id of the category row.
        """
        if name in self.category_dict:
            return self.category_dict[name]
        # Let the driver quote the value: names such as "Children's" would
        # break (or inject into) a statement built with str.format.
        sql = "insert into `category`(`name`) values(%s)"
        params = (name,)
        print(sql, params)
        self.cursor.execute(sql, params)
        self.connect.commit()
        last_id = self.cursor.lastrowid
        self.category_dict[name] = last_id
        return last_id

    def add_book(self, category, title, price):
        """Insert a book row under ``category`` and return the new row id.

        The category is created first if this instance has not seen it yet.

        Args:
            category: Category name the book belongs to.
            title: Book title.
            price: Book price, as a value MySQL can store in a decimal column.

        Returns:
            The id of the inserted book row.
        """
        if category in self.category_dict:
            cid = self.category_dict[category]
        else:
            cid = self.add_category(category)
        # Parameterized instead of repr(title): Python's repr quoting is not
        # SQL escaping.
        sql = "insert into `book`(`cid`,`title`,`price`) values(%s,%s,%s)"
        params = (cid, title, price)
        print(sql, params)
        self.cursor.execute(sql, params)
        self.connect.commit()
        return self.cursor.lastrowid
if __name__ == "__main__":
    # Script entry point: build a spider for the demo book site and run it.
    Spider('http://books.toscrape.com/').start()
複製代碼