Plenty of people have scraped Douban before. Here is my own approach, shared for learning and discussion. I started out as a beginner myself, so nothing in the code is omitted. This tutorial is purely for fun; experts, please go easy on me. The full script first:
```python
# encoding:UTF-8
import re
import requests
import MySQLdb
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

def getPage(get_url):
    r = requests.get(get_url, headers=headers)   # send the User-Agent defined above
    response = r.text
    return response

def filterpage():
    pageCode = getPage(get_url)
    pattern = re.compile('<em class="">(.*?)</em>.*?<span class="title">(.*?)</span>.*?<span class="rating_num" property="v:average">(.*?)</span>', re.S)
    items = re.findall(pattern, pageCode)
    pageStories = []
    for item in items:
        pageStories.append([item[0].strip(), item[1].strip(), item[2].strip()])
    return pageStories

def save_data():
    Dbdata = filterpage()
    db = MySQLdb.connect(host='localhost', user='root', passwd='******', db='movie', charset='utf8')
    cursor = db.cursor()
    for a in Dbdata:
        id = a[0]
        name = a[1]
        grade = a[2]
        #comment = a[3]
        value = [id, name, grade]
        cursor.execute('insert into movie_info values(%s,%s,%s)', value)
    db.commit()

def main():
    pageStories = filterpage()
    #print pageStories
    for story in pageStories:
        try:
            print u"No.:", story[0], u"Title:", story[1], u"\tRating:", story[2]
        except:
            pass

if __name__ == '__main__':
    get_url = 'https://movie.douban.com/top250/'
    i = 1
    while i < 11:
        main()
        save_data()
        print u'Page', i
        num = i * 25
        get_url = 'https://movie.douban.com/top250?start=' + str(num) + '&filter='
        i = i + 1
```
Before running it, MySQL needs to be set up. The relevant part of my.ini:

```ini
[mysqld]
basedir=F:\mysql-5.7.18-winx64        # installation directory
datadir=F:\mysql-5.7.18-winx64\data   # data directory; create it manually if it does not exist
early-plugin-load=""
skip-grant-tables
```
Then initialize and start MySQL from the command line:

```bat
cd F:\mysql-5.7.18-winx64\bin    # switch to the bin directory first, or the commands may fail
mysqld --install                 # register the Windows service (skip if already registered)
mysqld --initialize              # initialize the data directory
net start mysql                  # start the MySQL service
```
Finally, create the database and the table:

```sql
mysql> create database movie;    # create the movie database
mysql> use movie;
mysql> create table movie_info (id varchar(100), name varchar(100), grade decimal(3,1));
```
In this tutorial we need to scrape the rank number, title, and rating of each movie in the Douban Top 250. Press F12, inspect the page structure, and find the markup around 《肖申克的救贖》 (The Shawshank Redemption):
<em class="">1</em> # 序號 . . . <span class="title">肖申克的救贖</span> # 電影名 . . . <span class="rating_num" property="v:average">9.6</span> # 評分
OK, scrolling further down, you can see that the remaining entries all follow the same structure. Once we have spotted the pattern, we can work out how to pull the data out of it.
The two usual tools are the re module and the BeautifulSoup module. I use regular expressions with re, which feel a bit more powerful to me; BeautifulSoup is simpler to use but slower.
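For comparison, here is a minimal BeautifulSoup sketch of the same extraction (my own addition, not from the original post). It assumes each movie on the Top 250 page sits in its own `<div class="item">` block; the other class names come from the snippet above.

```python
from bs4 import BeautifulSoup

def filterpage_bs(pageCode):
    soup = BeautifulSoup(pageCode, 'html.parser')
    pageStories = []
    for entry in soup.find_all('div', class_='item'):   # assumption: one <div class="item"> per movie
        num = entry.find('em').get_text().strip()
        title = entry.find('span', class_='title').get_text().strip()       # first title span is the Chinese title
        rating = entry.find('span', class_='rating_num').get_text().strip()
        pageStories.append([num, title, rating])
    return pageStories
```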
Back to the script itself. First, the page-fetching function:

```python
def getPage(get_url):                            # fetch one page
    r = requests.get(get_url, headers=headers)   # GET the target page, sending the User-Agent defined above
    response = r.text
    return response                              # return the page source
```
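If Douban starts rejecting or hanging on requests, a slightly more defensive variant of getPage (again my own sketch, not part of the original script) adds a timeout and raises on a bad status code:

```python
def getPage(get_url):
    r = requests.get(get_url, headers=headers, timeout=10)  # give up after 10 seconds
    r.raise_for_status()                                     # raise an exception on 4xx/5xx responses
    return r.text
```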
Next, the parsing function:

```python
def filterpage():                                # parse one page
    pageCode = getPage(get_url)                  # fetch the page source
    pattern = re.compile('<em class="">(.*?)</em>.*?<span class="title">(.*?)</span>.*?<span class="rating_num" property="v:average">(.*?)</span>', re.S)
    items = re.findall(pattern, pageCode)
    pageStories = []                             # empty list to hold the extracted fields
    for item in items:                           # iterate over the matches
        pageStories.append([item[0].strip(), item[1].strip(), item[2].strip()])  # strip whitespace
    return pageStories
```
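A quick sanity check of what filterpage() returns; the values in the comments are what the first Top 250 page should yield, going by the markup shown earlier:

```python
pageStories = filterpage()
print len(pageStories)   # 25 movies per page
print pageStories[0]     # first entry: rank '1', title 肖申克的救贖, rating '9.6'
```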
The first non-greedy group (.*?) captures the rank number, the bare .*? in between skips over the markup we do not need, and the remaining non-greedy groups capture the title and the rating in the same way.
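A tiny illustration of why the non-greedy .*? matters here, using a made-up two-entry snippet rather than the real Douban markup: a greedy .* swallows everything up to the last closing tag, while .*? stops at the first one.

```python
import re

html = '<span class="title">A</span><span class="title">B</span>'
print re.findall('<span class="title">(.*)</span>', html)   # greedy: ['A</span><span class="title">B']
print re.findall('<span class="title">(.*?)</span>', html)  # non-greedy: ['A', 'B']
```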
Then, storing the results in MySQL:

```python
def save_data():
    Dbdata = filterpage()                        # get the parsed rows
    db = MySQLdb.connect(host='localhost', user='root', passwd='******', db='movie', charset='utf8')  # connect to the database
    cursor = db.cursor()                         # create a cursor
    for a in Dbdata:                             # iterate and store each row
        id = a[0]
        name = a[1]
        grade = a[2]
        value = [id, name, grade]
        cursor.execute('insert into movie_info values(%s,%s,%s)', value)  # parameterized insert
    db.commit()                                  # do not forget to commit
```
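To confirm the inserts actually landed, here is a small read-back sketch (my own addition, assuming the same movie database and movie_info table created earlier, with the password masked the same way):

```python
import MySQLdb

db = MySQLdb.connect(host='localhost', user='root', passwd='******', db='movie', charset='utf8')
cursor = db.cursor()
cursor.execute('select id, name, grade from movie_info order by cast(id as unsigned)')  # id is varchar, so cast for numeric order
for row in cursor.fetchall():
    print row[0], row[1], row[2]
db.close()
```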
The main function just prints each page's entries:

```python
def main():
    pageStories = filterpage()                   # get the parsed rows
    for story in pageStories:                    # iterate and print
        try:
            print u"No.:", story[0], u"Title:", story[1], u"\tRating:", story[2]
        except:
            pass                                 # ignore console encoding errors
```
Finally, the entry point loops over all ten pages:

```python
if __name__ == '__main__':
    get_url = 'https://movie.douban.com/top250/'    # starting page
    i = 1
    while i < 11:                                   # 10 pages, 25 movies each
        main()                                      # print the current page
        save_data()                                 # store the current page
        print u'Page', i
        num = i * 25                                # offset for the next page
        get_url = 'https://movie.douban.com/top250?start=' + str(num) + '&filter='
        i = i + 1
```
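For reference, the loop above ends up requesting offsets 0, 25, ..., 225; a quick sketch that just prints the ten URLs it will hit:

```python
for page in range(10):
    print 'https://movie.douban.com/top250?start=' + str(page * 25) + '&filter='
```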