Preparation
1. Scraping a page: fetch the page content from its URL
import urllib.parse
import urllib.request

response = urllib.request.urlopen("https://www.cnblogs.com")
print(response.read())
The urlopen method
urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None)
Here url is the address to request, data is the payload to send when accessing the URL, and timeout sets the request timeout.
It returns a response object.
The response object's read method returns the fetched page content.
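To illustrate the timeout parameter described above, a minimal sketch (the 10-second value and the error handling are illustrative additions, not part of the original example):

import urllib.error
import urllib.request

try:
    # Give up if the server does not respond within 10 seconds
    response = urllib.request.urlopen("https://www.cnblogs.com", timeout=10)
    print(response.read()[:100])
except urllib.error.URLError as e:
    # Raised on connection failures, including timeouts
    print("request failed:", e.reason)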
POST requests
import urllib.parse
import urllib.request

values = {"username": "XXX", "password": "XXX"}
data = urllib.parse.urlencode(values)
data = data.encode('utf-8')
url = "https://passport.cnblogs.com/user/signin?ReturnUrl=https://home.cnblogs.com/&AspxAutoDetectCookieSupport=1"
response = urllib.request.urlopen(url, data)
print(response.read())
GET requests
import urllib.parse
import urllib.request

values = {"itemCount": 30}
# For GET, the encoded data stays a str appended to the URL, so no .encode call is needed
data = urllib.parse.urlencode(values)
url = "https://news.cnblogs.com/CommentAjax/GetSideComments"
response = urllib.request.urlopen(url + '?' + data)
print(response.read())
2. Regular expressions: the re module
Python ships with the re module, which provides support for regular expressions.
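A minimal sketch of typical re usage (the pattern and sample strings here are illustrative assumptions):

import re

# Compile a pattern that captures the text inside a <title> tag
pattern = re.compile(r'<title>(.*?)</title>', re.S)
html = "<html><head><title>Hello</title></head></html>"

match = pattern.search(html)
if match:
    print(match.group(1))  # Hello

# findall returns every non-overlapping match as a list
print(re.findall(r'\d+', 'page 1 of 20'))  # ['1', '20']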
3. Beautiful Soup, a library for extracting data from web pages; to use it, import the bs4 package.
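A minimal sketch of how Beautiful Soup parses markup (the HTML snippet is an illustrative assumption):

from bs4 import BeautifulSoup

html = '<div class="post"><a href="https://www.cnblogs.com">cnblogs</a></div>'
soup = BeautifulSoup(html, 'html.parser')

# find_all returns a list of matching tags
for a in soup.find_all('a'):
    print(a.string, a['href'])  # cnblogs https://www.cnblogs.com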
4.MongoDB
We use the MongoEngine library.
Example:
Scrape the first 20 pages of posts from cnblogs and save them to MongoDB.
1. Fetching the cnblogs data
request.py
import urllib.parse
import urllib.request

def getHtml(url, values):
    data = urllib.parse.urlencode(values)
    response_result = urllib.request.urlopen(url + '?' + data).read()
    html = response_result.decode('utf-8')
    return html

def requestCnblogs(num):
    print('Requesting data for page:', num)
    url = 'https://www.cnblogs.com/mvc/AggSite/PostList.aspx'
    values = {
        'CategoryId': 808,
        'CategoryType': 'SiteHome',
        'ItemListActionName': 'PostList',
        'PageIndex': num,
        'ParentCategoryId': 0,
        'TotalPostCount': 4000
    }
    result = getHtml(url, values)
    return result
Note: open page 2 of the site, press F12, and look in the network panel for the request to https://www.cnblogs.com/mvc/AggSite/PostList.aspx.
2. Parsing the fetched data
deal.py
from bs4 import BeautifulSoup
import request
import re

def blogParser(index):
    cnblogs = request.requestCnblogs(index)
    soup = BeautifulSoup(cnblogs, 'html.parser')
    all_div = soup.find_all('div', attrs={'class': 'post_item_body'}, limit=20)
    blogs = []
    # Loop over each div and extract the details
    for item in all_div:
        blog = analyzeBlog(item)
        blogs.append(blog)
    return blogs

def analyzeBlog(item):
    result = {}
    a_title = find_all(item, 'a', 'titlelnk')
    if a_title:  # find_all returns a list, so test for emptiness rather than None
        result["title"] = a_title[0].string
        result["link"] = a_title[0]['href']
    p_summary = find_all(item, 'p', 'post_item_summary')
    if p_summary:
        result["summary"] = p_summary[0].text
    footers = find_all(item, 'div', 'post_item_foot')
    footer = footers[0]
    result["author"] = footer.a.string
    footer_text = footer.text  # avoid shadowing the built-in str
    time = re.findall(r"發佈於 .+? .+? ", footer_text)
    result["create_time"] = time[0].replace('發佈於 ', '')
    return result

def find_all(item, attr, c):
    return item.find_all(attr, attrs={'class': c}, limit=1)
Note: analyze the HTML structure of each post item to find the tags and class names to match.
3. Saving the processed data to MongoDB
db.py
from mongoengine import *

connect('test', host='localhost', port=27017)

class Blogs(Document):
    title = StringField(required=True, max_length=200)
    link = StringField(required=True)
    author = StringField(required=True)
    summary = StringField(required=True)
    create_time = StringField(required=True)

def savetomongo(contents):
    for content in contents:
        blog = Blogs(
            title=content['title'],
            link=content['link'],
            author=content['author'],
            summary=content['summary'],
            create_time=content['create_time']
        )
        blog.save()
    return "ok"

def haveBlogs():
    blogs = Blogs.objects.all()
    return len(blogs)
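Besides saving, MongoEngine can query the documents back out; a minimal sketch (the author name is an illustrative placeholder):

import db

# Filter by an exact field value on the Blogs collection defined in db.py
for blog in db.Blogs.objects(author='SomeAuthor'):
    print(blog.title, blog.link)

# Count all saved documents
print(db.Blogs.objects.count())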
4. Running the scrape
test.py
import db
import deal

print("start.......")
for i in range(1, 21):
    contents = deal.blogParser(i)
    db.savetomongo(contents)
    print('page', i, ' OK.')
counts = db.haveBlogs()
print("have ", counts, " blogs")
print("end.......")
Note: the Python version used here is 3.6.1. You can inspect the saved records in a MongoDB GUI tool.