Python爬蟲根據關鍵詞爬取知網論文摘要並保存到數據庫中【入門必學】

時間 2020-05-08

標籤 python 爬蟲根據關鍵詞論文摘要保存數據庫入門必學欄目 Python 简体版

原文原文鏈接

前言

本文的文字及圖片來源於網絡，僅供學習、交流使用，不具備任何商業用途，版權歸原作者所有，如有問題請及時聯繫我們以作處理。

html

作者：崩壞的芝麻

因爲實驗室需要一些語料做研究，語料要求是知網上的論文摘要，但是目前最新版的知網爬起來有些麻煩，所以我利用的是知網的另一個搜索接口。

比如下面這個網頁：
http://search.cnki.net/Search.aspx?q=肉製品

搜索出來的結果和知網上的結果幾乎一樣，另外以後面試找Python工作，項目經驗展示是核心，如果你缺項目練習，去小編的Python交流.裙：一久武其而而流一思（數字的諧音）轉換下可以找到了，裏面很多新教程項目

在這個基礎上，我簡單看了些網頁的結構，很容易就能寫出爬取的代碼（是最基礎的，相當不完善，其他功能可自行增加）

網頁的結構還是很清晰的

摘要信息也很清晰

我使用的是 pymysql 連接數據庫，效率也還可以
下面直接貼代碼：

# -*- coding: utf-8 -*-
import time
import re
import random
import requests
from bs4 import BeautifulSoup
import pymysql

# Connection parameters for the MySQL database. The credential fields are
# blank in this listing and have to be filled in before the script is run.
db_settings = {
    'host': '',
    'user': '',
    'password': '',
    'db': '',
    'port': 3306,
    # Note: the charset name is 'utf8', not 'utf-8'.
    'charset': 'utf8',
}
connection = pymysql.connect(**db_settings)

# Obtain the cursor used to execute SQL statements on this connection.
cursor = connection.cursor()

#url = 'http://epub.cnki.net/grid2008/brief/detailj.aspx?filename=RLGY201806014&dbname=CJFDLAST2018'

# These headers must be sent with the article-page requests; otherwise the
# site redirects the request to a different page.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Connection': 'keep-alive',
    'Host': 'www.cnki.net',
    'Referer': 'http://search.cnki.net/search.aspx?q=%E4%BD%9C%E8%80%85%E5%8D%95%E4%BD%8D%3a%E6%AD%A6%E6%B1%89%E5%A4%A7%E5%AD%A6&rank=relevant&cluster=zyk&val=CDFDTOTAL',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
}

# Minimal header set (browser User-Agent only) used for the search-result pages.
headers1 = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
}

def get_url_list(start_url, depth=20, page_size=15, timeout=10):
    """Collect the article links shown on the search-result pages.

    Requests ``depth`` consecutive result pages, appending ``&p=<offset>`` to
    ``start_url`` with the offset growing by ``page_size`` per page, and
    gathers the ``href`` of the first link inside every ``div.wz_tab`` block.

    Args:
        start_url: Search URL without the paging parameter.
        depth: Number of result pages to request.
        page_size: Offset step between two consecutive pages.
        timeout: Per-request timeout in seconds, so a stalled connection
            cannot hang the crawl.

    Returns:
        The distinct links in the order they were first seen. A page that
        cannot be fetched is reported on stdout and skipped.
    """
    url_list = []
    seen = set()  # O(1) duplicate check; url_list keeps the discovery order
    for i in range(depth):
        url = start_url + "&p=" + str(i * page_size)
        try:
            search = requests.get(url.replace('\n', ''), headers=headers1,
                                  timeout=timeout)
            # Treat HTTP error responses as a failed page instead of parsing them.
            search.raise_for_status()
        except requests.RequestException:
            print("爬取第" + str(i) + "頁失敗！")
            continue

        soup = BeautifulSoup(search.text, 'html.parser')
        for art in soup.find_all('div', class_='wz_tab'):
            anchor = art.find('a')
            href = anchor.get('href') if anchor is not None else None
            if not href:
                # A result block without a usable link must not discard the
                # remaining results of the page.
                continue
            print(href)
            if href not in seen:
                seen.add(href)
                url_list.append(href)
        print("爬取第" + str(i) + "頁成功！")
        # Random pause between successful page requests.
        time.sleep(random.randint(1, 3))
    return url_list

def _parse_article(soup):
    """Extract ``(title, author, abstract, key)`` from a parsed article page.

    Every field whose element is missing from the page is returned as ``''``.
    Author and keyword names are each followed by a single space.
    """
    title_tag = soup.find('title')
    title = title_tag.get_text().split('-')[0] if title_tag is not None else ''

    author = ''
    summary = soup.find('div', class_='summary pad10')
    first_p = summary.find('p') if summary is not None else None
    if first_p is not None:
        author = ''.join(
            a.get_text() + ' '
            for a in first_p.find_all('a', class_='KnowledgeNetLink')
        )

    abstract_tag = soup.find('span', id='ChDivSummary')
    abstract = abstract_tag.get_text() if abstract_tag is not None else ''

    # Keywords are optional: some articles have none.
    key = ''
    keyword_tag = soup.find('span', id='ChDivKeyWord')
    if keyword_tag is not None:
        key = ''.join(
            k.get_text() + ' '
            for k in keyword_tag.find_all('a', class_='KnowledgeNetLink')
        )
    return title, author, abstract, key


def get_data(url_list, wordType, timeout=10):
    """Fetch every article page, print its fields and store them in ``cnki``.

    For each URL the page is downloaded with the module-level ``headers``,
    parsed, and inserted as one row
    ``(NULL, wordType, title, author, abstract, key)`` through the
    module-level ``cursor``; each insert is committed immediately.

    All fields are extracted afresh for every URL, so a failed download or an
    incomplete page can never reuse values from the previous article. Pages
    that cannot be fetched, or that contain no abstract, are reported on
    stdout and skipped.

    Args:
        url_list: Article URLs to visit; empty entries are ignored.
        wordType: Search keyword stored alongside each article.
        timeout: Per-request timeout in seconds.

    Raises:
        Database errors raised by ``cursor.execute`` / ``connection.commit``
        are not caught here and propagate to the caller.
    """
    # enumerate gives a real running index for the progress message.
    for i, url in enumerate(url_list, start=1):
        if not url or url == pymysql.NULL:
            continue
        print(url)
        try:
            html = requests.get(url.replace('\n', ''), headers=headers,
                                timeout=timeout)
            html.raise_for_status()
        except requests.RequestException:
            print("獲取網頁失敗")
            continue

        soup = BeautifulSoup(html.text, 'html.parser')
        title, author, abstract, key = _parse_article(soup)
        if not abstract:
            # The abstract is the payload of the corpus; without it the row
            # would be useless, so the page is skipped.
            print("部分獲取失敗")
            continue

        print("第" + str(i) + "個url")
        print("【Title】：" + title)
        print("【author】：" + author)
        print("【abstract】：" + abstract)
        print("【key】：" + key)
        # Parameterised insert; the driver escapes the scraped text.
        cursor.execute('INSERT INTO cnki VALUES (NULL, %s, %s, %s, %s, %s)', (wordType, title, author, abstract, key))
        # Commit so the row is persisted before moving to the next URL.
        connection.commit()

        print()
    print("爬取完畢")
    print()

if __name__ == '__main__':
    # Crawl the search results for each keyword and store the articles found.
    try:
        # A tuple rather than a set, so the keywords are processed in the same
        # order on every run (set iteration order of strings varies per run).
        for keyword in ("大腸桿菌", "菌羣總落", "胭脂紅", "日落黃"):
            # Every search is combined with the common prefix term.
            wordType = "肉+" + keyword
            start_url = "http://search.cnki.net/search.aspx?q=%s&rank=relevant&cluster=zyk&val=" % wordType
            url_list = get_url_list(start_url)
            print("開始爬取")
            get_data(url_list, wordType)
            print("一種類型爬取完畢")
        print("所有爬取完畢")
    finally:
        # Release the cursor first; the connection is closed even if that fails.
        try:
            cursor.close()
        finally:
            connection.close()

在這裏的關鍵詞我簡單地選了幾個，作爲實驗，如果要爬取的很多，可以寫在txt文件裏，直接讀取就可以，非常方便。