Python3爬蟲數據入數據庫---把爬取到的數據存到數據庫，帶數據庫去重功能

時間 2019-11-17

標籤 python3 python 爬蟲數據數據庫功能欄目 Python 简体版

原文原文鏈接

這是 Python3 實戰入門系列的第三篇文章。學習這一篇需要先瞭解前兩篇，否則學起來會比較費勁。

下面正式開始，把我們在第一節爬取到的新聞數據保存到 MySQL 數據庫中。

一，首先我們需要連接數據庫

通過定義一個 MySQLCommand 類來配置數據庫連接參數，並定義一個 connectMysql 方法來連接數據庫。

# -*- coding: utf-8 -*-
# 做者微信：2501902696
import pymysql
# 用來操做數據庫的類
class MySQLCommand(object):
    """Small helper that holds MySQL connection settings and opens a connection.

    The settings are fixed in ``__init__``; ``connectMysql`` uses them to
    create ``self.conn`` and ``self.cursor``.
    """

    def __init__(self):
        self.host = 'localhost'
        self.port = 3306  # MySQL server port
        self.user = 'root'  # login user name
        self.password = ""  # login password
        self.db = "home"  # database (schema) name
        self.table = "home_list"  # table the crawler writes to
        # Defined up front so that code running after a failed connect can
        # test for None instead of hitting a missing attribute.
        self.conn = None
        self.cursor = None

    def connectMysql(self):
        """Open the database connection and create a cursor on it.

        A connection failure is reported on stdout and leaves ``self.conn``
        and ``self.cursor`` untouched (``None`` on a fresh instance); only
        PyMySQL errors are handled, anything else propagates.
        """
        try:
            # ``password`` / ``database`` are the current keyword names;
            # ``passwd`` / ``db`` are compatibility aliases.
            self.conn = pymysql.connect(host=self.host, port=self.port, user=self.user,
                                        password=self.password, database=self.db,
                                        charset='utf8')
            self.cursor = self.conn.cursor()
        except pymysql.Error as e:
            print('connect mysql error.', e)
複製代碼

二，連接完數據庫後，我們就需要插入數據了

插入數據之前，我們有兩個問題需要解決：

1，重複的數據如何去重
2，新數據的主鍵 id 應該從哪裏開始

針對上面的兩個問題，我貼出一部分代碼來看解決思路：

# 插入數據，插入之前先查詢是否存在，如果存在就不再插入
    def insertData(self, my_dict):
        """Insert one crawled record unless a row with the same url exists.

        Args:
            my_dict: Mapping of column name to value for the new row. It
                must contain a ``'url'`` key, which is the de-duplication
                key.

        Returns:
            ``0`` when a row with the same url is already stored,
            ``insert_id + 1`` (the id the caller should use next) after a
            successful insert, and ``None`` when the insert failed or
            affected no rows.

        Raises:
            KeyError: if ``my_dict`` has no ``'url'`` entry.
        """
        def describe(error):
            # Errors are printed as "code: message" when both parts are
            # present; otherwise fall back to the error's plain text.
            if len(error.args) > 1:
                return "%s: %s" % (error.args[0], error.args[1])
            return str(error)

        # Values are bound as query parameters, so the driver quotes them and
        # the url is compared exactly as given (no padding spaces needed).
        # NOTE(review): rows written earlier with string-built SQL may hold a
        # leading space in url and would not match this exact comparison.
        sqlExit = "SELECT url FROM " + self.table + " WHERE url = %s"
        res = self.cursor.execute(sqlExit, (my_dict['url'],))
        if res:  # number of matching rows; non-zero means already stored
            print("數據已存在", res)
            return 0

        # Column names cannot be bound as parameters, so they are
        # backtick-quoted (doubling any embedded backtick) instead.
        cols = ', '.join('`%s`' % str(name).replace('`', '``') for name in my_dict)
        placeholders = ', '.join(['%s'] * len(my_dict))
        sql = "INSERT INTO %s (%s) VALUES (%s)" % (self.table, cols, placeholders)
        try:
            result = self.cursor.execute(sql, tuple(my_dict.values()))
            insert_id = self.conn.insert_id()  # id reported for the new row
            self.conn.commit()
        except pymysql.Error as e:
            try:
                self.conn.rollback()
            except pymysql.Error as rollback_error:
                print("數據庫錯誤，緣由" + describe(rollback_error))
            # 1062 is MySQL's duplicate-entry error code; checking the code
            # avoids depending on the wording of the server message.
            if e.args and e.args[0] == 1062:
                print("數據已存在，未插入數據")
            else:
                print("插入數據失敗，緣由 " + describe(e))
            return None
        if result:
            print("插入成功", insert_id)
            return insert_id + 1
        return None
複製代碼

通過上面的代碼，我們來看如何去重

我們在每次插入之前需要先查詢數據是否已經存在，如果存在就不再插入。我們的 home_list 表格的字段有 id、title、url、img_path。通過分析可知，抓取到的數據中 title 和 img_path 字段都可能爲空，所以這裏我們通過 url 字段來去重。知道去重原理以後再去讀上面的代碼，你應該就能容易理解了。

三，查詢數據庫中最後一條數據的 id 值，來確定我們新數據 id 的開始值

通過下面的 getLastId 函數來獲取 home_list 表裏最後一條數據的 id 值。

# 查詢最後一條數據的id值
    def getLastId(self):
        """Return the largest ``id`` currently stored in ``self.table``.

        Returns:
            The maximum id, ``0`` when the query yields no id (for example
            an empty table), or ``None`` when the query failed; the failure
            is reported on stdout.
        """
        sql = "SELECT max(id) FROM " + self.table
        try:
            self.cursor.execute(sql)
            row = self.cursor.fetchone()  # single result row of the aggregate
        except pymysql.Error as e:
            # Only database errors are treated as "query failed"; programming
            # errors are left to propagate instead of being hidden.
            print(sql + ' execute failed.', e)
            return None
        last_id = row[0] if row else None
        return last_id if last_id else 0
複製代碼

下面貼出 MySQLCommand 數據庫操作類的完整代碼

# -*- coding: utf-8 -*-
# 做者微信：2501902696
import pymysql
# 用來操做數據庫的類
class MySQLCommand(object):
    """Helper that stores crawled records in one MySQL table via PyMySQL.

    Typical use: ``connectMysql()``, then ``getLastId()`` / ``insertData()``
    as often as needed, then ``closeMysql()``.
    """

    def __init__(self):
        self.host = 'localhost'
        self.port = 3306  # MySQL server port
        self.user = 'root'  # login user name
        self.password = ""  # login password
        self.db = "home"  # database (schema) name
        self.table = "home_list"  # table the crawler writes to
        # Defined up front so that code running after a failed connect can
        # test for None instead of hitting a missing attribute.
        self.conn = None
        self.cursor = None

    @staticmethod
    def _describe_error(error):
        """Format an error as "code: message", or as plain text if it has no code."""
        if len(error.args) > 1:
            return "%s: %s" % (error.args[0], error.args[1])
        return str(error)

    def connectMysql(self):
        """Open the database connection and create a cursor on it.

        A connection failure is reported on stdout and leaves ``self.conn``
        and ``self.cursor`` untouched (``None`` on a fresh instance); only
        PyMySQL errors are handled, anything else propagates.
        """
        try:
            # ``password`` / ``database`` are the current keyword names;
            # ``passwd`` / ``db`` are compatibility aliases.
            self.conn = pymysql.connect(host=self.host, port=self.port, user=self.user,
                                        password=self.password, database=self.db,
                                        charset='utf8')
            self.cursor = self.conn.cursor()
        except pymysql.Error as e:
            print('connect mysql error.', e)

    def insertData(self, my_dict):
        """Insert one crawled record unless a row with the same url exists.

        Args:
            my_dict: Mapping of column name to value for the new row. It
                must contain a ``'url'`` key, which is the de-duplication
                key.

        Returns:
            ``0`` when a row with the same url is already stored,
            ``insert_id + 1`` (the id the caller should use next) after a
            successful insert, and ``None`` when the insert failed or
            affected no rows.

        Raises:
            KeyError: if ``my_dict`` has no ``'url'`` entry.
        """
        # Values are bound as query parameters, so the driver quotes them and
        # the url is compared exactly as given (no padding spaces needed).
        # NOTE(review): rows written earlier with string-built SQL may hold a
        # leading space in url and would not match this exact comparison.
        sqlExit = "SELECT url FROM " + self.table + " WHERE url = %s"
        res = self.cursor.execute(sqlExit, (my_dict['url'],))
        if res:  # number of matching rows; non-zero means already stored
            print("數據已存在", res)
            return 0

        # Column names cannot be bound as parameters, so they are
        # backtick-quoted (doubling any embedded backtick) instead.
        cols = ', '.join('`%s`' % str(name).replace('`', '``') for name in my_dict)
        placeholders = ', '.join(['%s'] * len(my_dict))
        sql = "INSERT INTO %s (%s) VALUES (%s)" % (self.table, cols, placeholders)
        try:
            result = self.cursor.execute(sql, tuple(my_dict.values()))
            insert_id = self.conn.insert_id()  # id reported for the new row
            self.conn.commit()
        except pymysql.Error as e:
            try:
                self.conn.rollback()
            except pymysql.Error as rollback_error:
                print("數據庫錯誤，緣由" + self._describe_error(rollback_error))
            # 1062 is MySQL's duplicate-entry error code; checking the code
            # avoids depending on the wording of the server message.
            if e.args and e.args[0] == 1062:
                print("數據已存在，未插入數據")
            else:
                print("插入數據失敗，緣由 " + self._describe_error(e))
            return None
        if result:
            print("插入成功", insert_id)
            return insert_id + 1
        return None

    def getLastId(self):
        """Return the largest ``id`` currently stored in ``self.table``.

        Returns:
            The maximum id, ``0`` when the query yields no id (for example
            an empty table), or ``None`` when the query failed; the failure
            is reported on stdout.
        """
        sql = "SELECT max(id) FROM " + self.table
        try:
            self.cursor.execute(sql)
            row = self.cursor.fetchone()  # single result row of the aggregate
        except pymysql.Error as e:
            # Only database errors are treated as "query failed"; programming
            # errors are left to propagate instead of being hidden.
            print(sql + ' execute failed.', e)
            return None
        last_id = row[0] if row else None
        return last_id if last_id else 0

    def closeMysql(self):
        """Close the cursor and the connection if they are open.

        Safe to call when nothing was opened and safe to call twice.
        """
        try:
            if self.cursor is not None:
                self.cursor.close()
        finally:
            # Close the connection even if closing the cursor failed.
            self.cursor = None
            if self.conn is not None:
                conn, self.conn = self.conn, None
                conn.close()
複製代碼

再貼出把爬蟲爬取數據插入到數據庫的代碼

# -*- coding: utf-8 -*-
# 做者微信：2501902696
from bs4 import BeautifulSoup
from urllib import request
import chardet

from db.MySQLCommand import MySQLCommand

url = "https://www.huxiu.com"
# Read the whole page and release the HTTP connection right away.
with request.urlopen(url) as response:
    html = response.read()
charset = chardet.detect(html)
# chardet reports None when it cannot guess an encoding; fall back to UTF-8
# instead of trying to decode with the literal codec name "None".
html = html.decode(charset["encoding"] or "utf-8")

# Parse with the built-in html.parser backend.
soup = BeautifulSoup(html, 'html.parser')
# Every element carrying class="hot-article-img".
allList = soup.select('.hot-article-img')

# Connect to the database.
mysqlCommand = MySQLCommand()
mysqlCommand.connectMysql()
# connectMysql() only prints on failure, so verify that a connection exists
# before doing any work with it.
if getattr(mysqlCommand, "conn", None) is None:
    raise SystemExit("could not connect to MySQL; no data was stored")
try:
    # New rows continue from the last stored id; every successful insert
    # advances the counter.
    lastId = mysqlCommand.getLastId()
    if lastId is None:  # getLastId() already printed the failing query
        raise SystemExit("could not read the last id; no data was stored")
    dataCount = int(lastId) + 1
    for news in allList:  # walk the nodes and pull out the useful fields
        aaa = news.select('a')
        # Only nodes that actually contain a link are of interest.
        if not aaa:
            continue
        link = aaa[0]
        # Article link (site-relative href appended to the site root);
        # empty when the attribute is missing.
        hrefAttr = link.get('href')
        href = url + hrefAttr if hrefAttr is not None else ''
        # Article image url; empty when there is no <img> or no src.
        images = link.select('img')
        imgUrl = images[0].get('src', "") if images else ""
        # News title; empty when the attribute is missing.
        title = link.get('title', "")

        # One dict per record, keyed by column name, for the insert.
        news_dict = {
            "id": str(dataCount),
            "title": title,
            "url": href,
            "img_path": imgUrl
        }
        try:
            # Insert the record; rows that already exist are skipped.
            res = mysqlCommand.insertData(news_dict)
            if res:
                dataCount = res
        except Exception as e:
            print("插入數據失敗", str(e))  # report why this record failed
finally:
    # Always release the database connection, even if the loop failed.
    mysqlCommand.closeMysql()
dataCount = 0
複製代碼