這是python3實戰入門系列的第三篇文章。要學習這一篇,須要先瞭解前兩篇,要不學起來比較費勁。
下面正式開始,把咱們第一節爬取到的新聞數據保存到mysql數據庫中。
經過定義一個MySQLCommand類來配置數據庫鏈接參數,並定義一個connectMysql方法來鏈接mysql數據庫:
# -*- coding: utf-8 -*-
# 做者微信:2501902696
import pymysql
# Helper class that holds the MySQL connection settings and the connection.
class MySQLCommand(object):
    """Connection settings and connection handle for the crawler's database.

    Attributes:
        host, port, user, password, db: arguments passed to ``pymysql.connect``.
        table: name of the table the crawler works with.
        conn: the open connection, or ``None`` until ``connectMysql`` succeeds.
        cursor: a cursor on ``conn``, or ``None`` until ``connectMysql`` succeeds.
    """

    def __init__(self):
        """Set the default connection parameters; no I/O happens here."""
        self.host = 'localhost'
        self.port = 3306  # port number
        self.user = 'root'  # user name
        self.password = ""  # password
        self.db = "home"  # database name
        self.table = "home_list"  # table name
        # Defined up front so callers can test for "not connected" instead of
        # hitting an AttributeError after a failed connectMysql().
        self.conn = None
        self.cursor = None

    def connectMysql(self):
        """Open the connection and create a cursor.

        On failure the error is printed and ``conn``/``cursor`` keep their
        previous values (``None`` on a fresh instance); nothing is raised.
        """
        try:
            self.conn = pymysql.connect(host=self.host, port=self.port, user=self.user,
                                        passwd=self.password, db=self.db, charset='utf8')
            self.cursor = self.conn.cursor()
        except pymysql.Error as e:
            # Only database errors are expected here; anything else (including
            # KeyboardInterrupt) should propagate instead of being swallowed.
            print('connect mysql error.', e)
複製代碼
插入數據以前咱們有兩個問題:
# Insert one row unless a row with the same url is already stored.
def insertData(self, my_dict):
    """Insert ``my_dict`` as a row of ``self.table`` if its url is new.

    Both statements are sent with bound parameters, so quotes or other
    special characters in the scraped values cannot break or alter the SQL.
    Values are stored exactly as given (no padding spaces).

    Args:
        my_dict: mapping of column name to value; must contain ``'url'``.
            The keys are interpolated into the statement as column names and
            therefore have to come from trusted code, never from scraped data.

    Returns:
        ``0`` when a row with the same url already exists,
        ``insert_id + 1`` after a successful insert (the caller uses it as
        the id for the next row), or ``None`` when the insert failed or
        reported no affected rows.
    """
    exists_sql = "SELECT url FROM " + self.table + " WHERE url = %s"
    res = self.cursor.execute(exists_sql, (my_dict['url'],))
    if res:  # execute() returned a non-zero row count: the url is already stored
        print("數據已存在", res)
        return 0

    cols = ', '.join(my_dict.keys())
    placeholders = ', '.join(['%s'] * len(my_dict))
    # Resulting statement, e.g.:
    # INSERT INTO home_list (id, title, url, img_path) VALUES (%s, %s, %s, %s)
    sql = "INSERT INTO " + self.table + " (" + cols + ") VALUES (" + placeholders + ")"
    try:
        result = self.cursor.execute(sql, list(my_dict.values()))
        insert_id = self.conn.insert_id()  # id reported for the inserted row
        self.conn.commit()
    except pymysql.Error as e:
        # Undo the failed statement before reporting it.
        self.conn.rollback()
        # Not every database error carries a (code, message) pair.
        code = e.args[0] if e.args else 0
        message = e.args[1] if len(e.args) > 1 else str(e)
        if "key 'PRIMARY'" in str(message):
            # Primary key collision: the row cannot be inserted.
            print("數據已存在,未插入數據")
        else:
            print("插入數據失敗,緣由 %s: %s" % (code, message))
        return None

    if result:
        print("插入成功", insert_id)
        return insert_id + 1
    return None
複製代碼
經過下面的getLastId函數來獲取home_list表裏的最後一條數據的id值:
# Look up the largest id currently stored in the table.
def getLastId(self):
    """Return the largest ``id`` in ``self.table``.

    Returns:
        The maximum id, ``0`` when the query yields no value (empty table),
        or ``None`` when the query fails with a database error (a message is
        printed in that case).
    """
    sql = "SELECT max(id) FROM " + self.table
    try:
        self.cursor.execute(sql)
        row = self.cursor.fetchone()  # first row of the result
    except pymysql.Error:
        # Only database errors are reported here; programming errors
        # propagate instead of being hidden by a bare except.
        print(sql + ' execute failed.')
        return None
    if not row or not row[0]:
        return 0  # no value: treat the table as empty
    return row[0]
複製代碼
# -*- coding: utf-8 -*-
# 做者微信:2501902696
import pymysql
# Helper class that wraps all database access used by the crawler.
class MySQLCommand(object):
    """Small wrapper around a PyMySQL connection for the news table.

    Attributes:
        host, port, user, password, db: arguments passed to ``pymysql.connect``.
        table: name of the table that rows are read from and written to.
        conn: the open connection, or ``None`` when not connected.
        cursor: a cursor on ``conn``, or ``None`` when not connected.
    """

    def __init__(self):
        """Set the default connection parameters; no I/O happens here."""
        self.host = 'localhost'
        self.port = 3306  # port number
        self.user = 'root'  # user name
        self.password = ""  # password
        self.db = "home"  # database name
        self.table = "home_list"  # table name
        # Defined up front so "not connected" is a testable state rather than
        # an AttributeError waiting to happen.
        self.conn = None
        self.cursor = None

    def connectMysql(self):
        """Open the connection and create a cursor.

        On failure the error is printed and ``conn``/``cursor`` keep their
        previous values (``None`` on a fresh instance); nothing is raised.
        """
        try:
            self.conn = pymysql.connect(host=self.host, port=self.port, user=self.user,
                                        passwd=self.password, db=self.db, charset='utf8')
            self.cursor = self.conn.cursor()
        except pymysql.Error as e:
            # Only database errors are expected; anything else propagates.
            print('connect mysql error.', e)

    def insertData(self, my_dict):
        """Insert ``my_dict`` as a row of ``self.table`` if its url is new.

        Both statements are sent with bound parameters, so quotes or other
        special characters in the scraped values cannot break or alter the
        SQL. Values are stored exactly as given (no padding spaces).

        Args:
            my_dict: mapping of column name to value; must contain ``'url'``.
                The keys are interpolated into the statement as column names
                and therefore have to come from trusted code.

        Returns:
            ``0`` when a row with the same url already exists,
            ``insert_id + 1`` after a successful insert (the caller uses it
            as the id for the next row), or ``None`` when the insert failed
            or reported no affected rows.
        """
        exists_sql = "SELECT url FROM " + self.table + " WHERE url = %s"
        res = self.cursor.execute(exists_sql, (my_dict['url'],))
        if res:  # non-zero row count: the url is already stored
            print("數據已存在", res)
            return 0

        cols = ', '.join(my_dict.keys())
        placeholders = ', '.join(['%s'] * len(my_dict))
        # Resulting statement, e.g.:
        # INSERT INTO home_list (id, title, url, img_path) VALUES (%s, %s, %s, %s)
        sql = "INSERT INTO " + self.table + " (" + cols + ") VALUES (" + placeholders + ")"
        try:
            result = self.cursor.execute(sql, list(my_dict.values()))
            insert_id = self.conn.insert_id()  # id reported for the inserted row
            self.conn.commit()
        except pymysql.Error as e:
            # Undo the failed statement before reporting it.
            self.conn.rollback()
            # Not every database error carries a (code, message) pair.
            code = e.args[0] if e.args else 0
            message = e.args[1] if len(e.args) > 1 else str(e)
            if "key 'PRIMARY'" in str(message):
                # Primary key collision: the row cannot be inserted.
                print("數據已存在,未插入數據")
            else:
                print("插入數據失敗,緣由 %s: %s" % (code, message))
            return None

        if result:
            print("插入成功", insert_id)
            return insert_id + 1
        return None

    def getLastId(self):
        """Return the largest ``id`` in ``self.table``.

        Returns:
            The maximum id, ``0`` when the query yields no value (empty
            table), or ``None`` when the query fails with a database error.
        """
        sql = "SELECT max(id) FROM " + self.table
        try:
            self.cursor.execute(sql)
            row = self.cursor.fetchone()  # first row of the result
        except pymysql.Error:
            print(sql + ' execute failed.')
            return None
        if not row or not row[0]:
            return 0  # no value: treat the table as empty
        return row[0]

    def closeMysql(self):
        """Close the cursor and the connection; safe to call when not connected."""
        if self.cursor is not None:
            self.cursor.close()
            self.cursor = None
        if self.conn is not None:
            self.conn.close()
            self.conn = None
複製代碼
# -*- coding: utf-8 -*-
# 做者微信:2501902696
from bs4 import BeautifulSoup
from urllib import request
import chardet
from db.MySQLCommand import MySQLCommand
from urllib.parse import urljoin

# Front page to crawl; also the base used to resolve article links.
url = "https://www.huxiu.com"
# Close the HTTP response as soon as the body has been read.
with request.urlopen(url) as response:
    html = response.read()
charset = chardet.detect(html)
# The detected encoding may be None when chardet cannot guess; fall back to
# UTF-8 rather than passing the literal string "None" to decode().
html = html.decode(charset["encoding"] or "utf-8")
# Parse with the built-in html.parser backend.
soup = BeautifulSoup(html, 'html.parser')
# Every node with class "hot-article-img".
allList = soup.select('.hot-article-img')

# Connect to the database.
mysqlCommand = MySQLCommand()
mysqlCommand.connectMysql()
# connectMysql() only prints on failure, so check for a usable cursor here
# instead of failing later with an unrelated error.
if getattr(mysqlCommand, "cursor", None) is None:
    raise SystemExit("no database connection, aborting")

try:
    # Start numbering after the largest id already stored; every successful
    # insert hands back the id to use for the next row.
    last_id = mysqlCommand.getLastId()
    if last_id is None:  # the lookup failed and has already printed why
        raise SystemExit("could not read the last id, aborting")
    dataCount = int(last_id) + 1

    for news in allList:  # walk the matched nodes and pull out the fields
        aaa = news.select('a')
        if not aaa:  # nodes without a link carry nothing to store
            continue
        anchor = aaa[0]

        # Article link; empty when the anchor has no href. urljoin handles
        # both site-relative and absolute hrefs.
        link = anchor.get('href')
        href = urljoin(url, link) if link is not None else ''

        # Article image url; empty when there is no <img> or no src.
        images = anchor.select('img')
        imgUrl = images[0].get('src', "") if images else ""

        # News title; empty when the attribute is missing.
        title = anchor.get('title', "")

        # One dict per article, keyed by column name, for the insert.
        news_dict = {
            "id": str(dataCount),
            "title": title,
            "url": href,
            "img_path": imgUrl
        }
        try:
            # Insert the row; rows whose url is already stored are skipped.
            res = mysqlCommand.insertData(news_dict)
            if res:
                dataCount = res
        except Exception as e:
            # Keep crawling the remaining articles if one insert blows up.
            print("插入數據失敗", str(e))
finally:
    # Always release the connection, even when the loop aborts.
    mysqlCommand.closeMysql()
dataCount = 0
複製代碼
若是對上面的代碼不是很瞭解,能夠到個人第一節文章去看下:《python3實戰入門python爬蟲篇---網頁爬蟲,圖片爬蟲,文章爬蟲,Python爬蟲爬取新聞網站新聞》。
到此咱們的python3爬蟲+python3數據庫篇就完事了,看下操做效果圖。
寫於---Python零基礎實戰入門第四天