Connecting to MySQL from a Python crawler

Preparation

  • Make sure the local MySQL server is running, then log in with the client:
mysql -u root -p
  • Install pymysql (a quick connection check follows below):
pip install pymysql
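To confirm that pymysql is installed and can reach the server, here is a minimal connectivity check (a sketch that assumes the same root/123456 credentials used throughout this post; replace them with your own):

import pymysql

# Connect without selecting a database yet; the server just has to be up.
conn = pymysql.connect(host='localhost', port=3306, user='root', passwd='123456')
with conn.cursor() as cursor:
    cursor.execute('SELECT VERSION()')
    print(cursor.fetchone())   # prints the server version as a tuple
conn.close()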

Creating the table

CREATE DATABASE crawls;
-- SHOW DATABASES;
USE crawls;

CREATE TABLE IF NOT EXISTS baiduNews(
    id INT PRIMARY KEY NOT NULL AUTO_INCREMENT,
    ranking VARCHAR(30),
    title VARCHAR(60),
    datetime TIMESTAMP,
    hot VARCHAR(30));
-- SHOW TABLES;

Connecting to the database with pymysql

import pymysql

# Connect to the crawls database created above
db = pymysql.connect(host='localhost', port=3306, user='root', passwd='123456',
                     db='crawls', charset='utf8')
cursor = db.cursor()
cursor.execute(sql_query)  # sql_query is whatever SQL statement you want to run
db.commit()                # persist the changes

Working with MySQL from Python is fairly straightforward; with a little database background you can pick it up right away. The one thing you must not forget is the final commit() call: without it the data only sits in the transaction buffer and is never written to the database.
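As a minimal sketch of the full pattern (assuming the crawls database and the baiduNews table from above already exist), including rollback and cleanup:

import pymysql

db = pymysql.connect(host='localhost', port=3306, user='root', passwd='123456',
                     db='crawls', charset='utf8')
try:
    with db.cursor() as cursor:
        # Parameterized query: pymysql escapes the values for us;
        # NOW() fills the TIMESTAMP column on the server side
        cursor.execute('INSERT INTO baiduNews(ranking, title, datetime, hot) '
                       'VALUES (%s, %s, NOW(), %s)',
                       ('1', 'test title', '100'))
    db.commit()      # forgetting this means the row is silently discarded on close
except pymysql.MySQLError as e:
    db.rollback()    # undo the partial transaction on error
    print(e.args)
finally:
    db.close()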

Complete example

Crawl the hottest few news titles from Baidu and store them in the database (make sure the local MySQL server is already running).

'''
Get the hottest news title on baidu page,
then save these data into mysql
'''
import datetime

import pymysql
from pyquery import PyQuery as pq
import requests
from requests.exceptions import ConnectionError

URL = 'https://www.baidu.com/s?wd=%E7%83%AD%E7%82%B9'
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
    'Upgrade-Insecure-Requests': '1'
}

def get_html(url):
    # Fetch the page; return the HTML text, or None on failure
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError as e:
        print(e.args)
        return None

def parse_html(html):
    # Extract ranking, title and hotness from the hot-news table on the result page
    doc = pq(html)
    trs = doc('.FYB_RD table.c-table tr').items()
    for tr in trs:
        index = tr('td:nth-child(1) span.c-index').text()
        title = tr('td:nth-child(1) span a').text()
        hot = tr('td:nth-child(2)').text().strip('"')
        yield {
            'index':index,
            'title':title,
            'hot':hot
        }

def save_to_mysql(items):
    # Create the table if needed, then insert one row per item
    try:
        db = pymysql.connect(host='localhost', port=3306, user='root', passwd='123456',
                             db='crawls', charset='utf8')
        cursor = db.cursor()
        cursor.execute('CREATE TABLE IF NOT EXISTS baiduNews('
                       'id INT PRIMARY KEY NOT NULL AUTO_INCREMENT,'
                       'ranking VARCHAR(30),'
                       'title VARCHAR(60),'
                       'datetime TIMESTAMP,'
                       'hot VARCHAR(30));')
        try:
            for item in items:
                print(item)
                now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                # Parameterized query: pymysql escapes the values, so quotes
                # in a title cannot break the statement
                cursor.execute('INSERT INTO baiduNews(ranking, title, datetime, hot) '
                               'VALUES (%s, %s, %s, %s)',
                               (item['index'], item['title'], now, item['hot']))
                print('Saved into mysql')
            db.commit()
        except pymysql.MySQLError as e:
            db.rollback()
            print(e.args)
        finally:
            db.close()
    except pymysql.MySQLError as e:
        print(e.args)

def check_mysql():
    # Print the stored rows; execute() only returns a row count,
    # the rows themselves come from fetchall()
    try:
        db = pymysql.connect(host='localhost', port=3306, user='root', passwd='123456',
                             db='crawls', charset='utf8')
        cursor = db.cursor()
        cursor.execute('SELECT * FROM baiduNews')
        for row in cursor.fetchall():
            print(row)
        db.close()
    except pymysql.MySQLError as e:
        print(e.args)

def main():
    html = get_html(URL)
    items = parse_html(html)
    save_to_mysql(items)
    #check_mysql()

if __name__ == '__main__':
    main()
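Once a few runs have accumulated, the stored rows can be read back, for example the most recently saved titles (a small sketch reusing the same connection parameters):

import pymysql

db = pymysql.connect(host='localhost', port=3306, user='root', passwd='123456',
                     db='crawls', charset='utf8')
with db.cursor() as cursor:
    # Latest ten rows, newest first
    cursor.execute('SELECT ranking, title, datetime, hot FROM baiduNews '
                   'ORDER BY id DESC LIMIT 10')
    for row in cursor.fetchall():
        print(row)
db.close()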