===================== Crawler Principle =====================
Use Python to fetch the news homepage, extract all article links on it, and store them in a URL set.
Take URLs out of the set one by one, fetch each page's source code, parse out new URL links, and add them to the set.
To avoid visiting a page twice, maintain a history set of visited URLs and use it to filter newly added URLs (a minimal sketch of this crawl loop appears after this list).
Parse the DOM tree, extract the article's information, and store it in an Article object.
Save the data held in the Article object to the database via pymysql.
Each time a record is stored successfully, increment a counter and print the article title; otherwise print the error message.
The program ends once every URL in the set has been processed or the number of stored records reaches the configured limit.
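The first few steps amount to a frontier set plus a history set. Below is a minimal sketch of just that crawl loop, reusing the start page and link pattern from the full script further down; parsing and storage are omitted, so this is an illustration rather than the finished program.

import re
import bs4
import urllib.request

url_set = {'http://baijia.baidu.com/'}   # frontier: URLs waiting to be visited, seeded with the start page
url_old = set()                          # history: URLs already visited
pattern = r'http://\w+\.baijia\.baidu\.com/article/\w+'   # article link pattern from the script below

while url_set:
    url = url_set.pop()
    url_old.add(url)                     # record the visit before parsing
    html = urllib.request.urlopen(url).read().decode('utf8')
    soup = bs4.BeautifulSoup(html, 'html.parser')
    for link in soup.find_all('a', href=re.compile(pattern)):
        if link['href'] not in url_old:  # the history set filters out revisits
            url_set.add(link['href'])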
===================== Storage Structure =====================
CREATE TABLE `news` (
  `id` int(6) unsigned NOT NULL AUTO_INCREMENT,
  `url` varchar(255) NOT NULL,
  `title` varchar(45) NOT NULL,
  `author` varchar(12) NOT NULL,
  `date` varchar(12) NOT NULL,
  `about` varchar(255) NOT NULL,
  `content` text NOT NULL,
  PRIMARY KEY (`id`),
  UNIQUE KEY `url_UNIQUE` (`url`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
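Because of the UNIQUE KEY on `url`, the database itself rejects a second insert of the same article, on top of the duplicate check the script performs. A minimal sketch of that behaviour with pymysql, reusing the connection parameters from the script below; the row values are placeholders:

import pymysql

# connection parameters copied from the script's db_config (assumed unchanged)
connect = pymysql.Connect(host='localhost', port=3310, user='woider',
                          passwd='3243', db='python', charset='utf8')
cursor = connect.cursor()

sql = ("INSERT INTO news (url, title, author, date, about, content) "
       "VALUES (%s, %s, %s, %s, %s, %s)")
row = ('http://demo.baijia.baidu.com/article/000', 'demo title', 'author',
       '2016-01-01', 'about', 'content')          # placeholder values

cursor.execute(sql, row)          # first insert succeeds
connect.commit()
try:
    cursor.execute(sql, row)      # same url again -> violates url_UNIQUE
    connect.commit()
except pymysql.err.IntegrityError as err:
    print(err)                    # (1062, "Duplicate entry ... for key 'url_UNIQUE'")

cursor.close()
connect.close()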
===================== Script Code =====================
''' Baidu Baijia news collector '''
import re               # regular expression module
import bs4              # DOM parsing module
import pymysql          # database connection module
import urllib.request   # network access module

# Configuration
maxcount = 1000                        # maximum number of records to collect
home = 'http://baijia.baidu.com/'      # starting page

# Database connection parameters
db_config = {
    'host': 'localhost',
    'port': '3310',
    'username': 'woider',
    'password': '3243',
    'database': 'python',
    'charset': 'utf8'
}

url_set = set()   # URLs waiting to be visited
url_old = set()   # URLs already visited

# Collect article links from the homepage
html = urllib.request.urlopen(home).read().decode('utf8')
soup = bs4.BeautifulSoup(html, 'html.parser')
pattern = r'http://\w+\.baijia\.baidu\.com/article/\w+'
links = soup.find_all('a', href=re.compile(pattern))
for link in links:
    url_set.add(link['href'])

# Article class definition
class Article(object):
    def __init__(self):
        self.url = None
        self.title = None
        self.author = None
        self.date = None
        self.about = None
        self.content = None

# Connect to the database
connect = pymysql.Connect(
    host=db_config['host'],
    port=int(db_config['port']),
    user=db_config['username'],
    passwd=db_config['password'],
    db=db_config['database'],
    charset=db_config['charset']
)
cursor = connect.cursor()

# Process the URLs
count = 0
while len(url_set) != 0:
    try:
        # Take one link
        url = url_set.pop()
        url_old.add(url)
        # Fetch the page source
        html = urllib.request.urlopen(url).read().decode('utf8')
        # Parse the DOM
        soup = bs4.BeautifulSoup(html, 'html.parser')
        pattern = r'http://\w+\.baijia\.baidu\.com/article/\w+'   # link matching rule
        links = soup.find_all('a', href=re.compile(pattern))
        # Collect new URLs
        for link in links:
            if link['href'] not in url_old:
                url_set.add(link['href'])
        # Skip articles that are already stored
        sql = "SELECT id FROM news WHERE url = %s"
        data = (url,)
        cursor.execute(sql, data)
        if cursor.rowcount != 0:
            raise Exception('Data Repeat Exception: ' + url)
        # Extract the article information
        article = Article()
        article.url = url                                                # URL
        page = soup.find('div', {'id': 'page'})
        article.title = page.find('h1').get_text()                       # title
        info = page.find('div', {'class': 'article-info'})
        article.author = info.find('a', {'class': 'name'}).get_text()    # author
        article.date = info.find('span', {'class': 'time'}).get_text()   # date
        article.about = page.find('blockquote').get_text()               # summary
        pnode = page.find('div', {'class': 'article-detail'}).find_all('p')
        article.content = ''
        for node in pnode:                              # walk the article paragraphs
            article.content += node.get_text() + '\n'   # append each paragraph
        # Store the record (parameterized query, so quotes in the text are safe)
        sql = "INSERT INTO news (url, title, author, date, about, content) "
        sql = sql + "VALUES (%s, %s, %s, %s, %s, %s)"
        data = (article.url, article.title, article.author,
                article.date, article.about, article.content)
        cursor.execute(sql, data)
        connect.commit()
    except Exception as e:
        print(e)
        continue
    else:
        print(article.title)
        count += 1
    finally:
        # Stop once enough records have been collected
        if count == maxcount:
            break

# Close the database connection
cursor.close()
connect.close()
===================== Run Results =====================
Set the parameters maxcount = 10 and home = 'http://baijia.baidu.com/'.
Query the collected data: SELECT title, author FROM python.news;
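The same check can be run from Python instead of the MySQL client. A minimal sketch, assuming the connection parameters used in the script:

import pymysql

connect = pymysql.Connect(host='localhost', port=3310, user='woider',
                          passwd='3243', db='python', charset='utf8')
cursor = connect.cursor()
cursor.execute('SELECT title, author FROM python.news')
for title, author in cursor.fetchall():   # print one collected record per line
    print(title, author)
cursor.close()
connect.close()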