About 90% of useful data lives in databases.
Data types: quantitative data.
Data sources: XML, JSON.
Unstructured data must be converted into structured data with ETL (Extract, Transform, Load) tools before it can be used.
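To make the three ETL steps concrete, here is a minimal sketch in Python; the file name and fields (records.json, name, price) are hypothetical and only illustrate extract, transform and load:

import json
import sqlite3

# Extract: read semi-structured JSON (hypothetical input file)
with open('records.json', 'r', encoding='UTF-8') as f:
    records = json.load(f)

# Transform: keep only the fields we need and normalize their types
rows = [(r['name'], float(r['price'])) for r in records]

# Load: store the structured rows in a relational table
with sqlite3.connect('records.sqlite') as db:
    db.execute('CREATE TABLE IF NOT EXISTS records(name TEXT, price REAL)')
    db.executemany('INSERT INTO records VALUES (?, ?)', rows)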
Working with ordinary files
with open('filename', 'w') as f:    # open for writing ('r' to read)
    f.write('hello world')

with open('filename', 'r') as f:
    print(f.read())        # read the whole file as a single string
    # f.readlines() returns the file as a list of lines
CSV data
Method 1: open the file and read it directly.
with open('./Population.csv', 'r', encoding='UTF-8') as f:
    # print(f.read())
    for line in f.readlines():
        print(line)
Method 2: read it with the pandas module.
import pandas as pd

df = pd.read_csv('./Population.csv')
print(df.values)
Excel data
import pandas as pd

filename = 'house_sample.xlsx'
df = pd.read_excel(filename)
print(df.values[0][0])
JSON data
Method 1: read the file, then parse it with the json module into a list/dict.
import json

filename = 'jd.json'
with open(filename, 'r') as f:
    fc = f.read()
df = json.loads(fc)         # parse the JSON string
print(df)
strjson = json.dumps(df)    # serialize back into a JSON string
Method 2: read it with the pandas module.
import pandas as pd

filename = 'jd.json'
df = pd.read_json(filename)
print(df.values)
XML data
Processed with the xml module:
import xml.etree.ElementTree as ET

filename = 'weather.xml'
tree = ET.parse(filename)
root = tree.getroot()
for city in root.iter('city'):
    print(city.get('cityname'))
Required modules: BeautifulSoup, requests.
requests fetches resources over the network; the REST operations POST, PUT, GET and DELETE can be used to access web resources. A simple fetch:
import requests

newurl = 'http://news.qq.com/'
res = requests.get(newurl)
print(res.text)
BeautifulSoup
The bs4 module turns a fetched page into a DOM document and lets you use CSS selectors to find the content you need.
import requests
from bs4 import BeautifulSoup

newurl = 'http://news.qq.com/'
res = requests.get(newurl)
html = res.text
# print(res.text)
html = '\
<h1 title="123" id="h">hello world</h1>\
<h2>數據科學</h2> '
soup = BeautifulSoup(html, 'html.parser')
s = soup.select('h1')    # select elements with a CSS selector
print(s[0]['title'])     # read an attribute
A useful library for locating the elements to scrape: lxml.
To store data fetched from elsewhere as .json, .csv or .xlsx, put it into a DataFrame() first.
import pandas
import requests
from bs4 import BeautifulSoup

newurl = 'http://news.qq.com/'
html = requests.get(newurl).text
soup = BeautifulSoup(html, 'html.parser')
warp = soup.select('.head .Q-tpWrap .text')
dataArr = []
for news in warp:
    dataArr.append({'name': news.select('a')[0].text.encode(),
                    'href': news.select('a')[0]['href']})
newsdf = pandas.DataFrame(dataArr)
newsdf.to_json('news.json')
newsdf.to_csv('news.csv')
newsdf.to_excel('news.xlsx')
import requests
from bs4 import BeautifulSoup
import json

url = 'http://xm.esf.fang.com/'
html = requests.get(url).text
soup = BeautifulSoup(html, 'html.parser')
resultArr = []
for house in soup.select('.shop_list dl'):
    shop = {
        'tit_shop': house.select('dd:nth-of-type(1) .tit_shop') and
            house.select('dd:nth-of-type(1) .tit_shop')[0].text,
        'tel_shop': house.select('dd:nth-of-type(1) .tel_shop') and ''.join(
            house.select('dd:nth-of-type(1) .tel_shop')[0].text.split('|')).strip(),
        'add_shop': house.select('dd:nth-of-type(1) .add_shop') and
            '小區名字:' + house.select('dd:nth-of-type(1) .add_shop')[0].select('a')[0].text +
            '; 具體地址:' + house.select('dd:nth-of-type(1) .add_shop')[0].select('span')[0].text,
        'price_shop': house.select('dd:nth-of-type(2) span b') and
            house.select('dd:nth-of-type(2) span b')[0].text,
        'sqm': house.select('dd:nth-of-type(2) span') and
            house.select('dd:nth-of-type(2) span')[1].text
    }
    resultArr.append(shop)
resultArr = json.dumps(resultArr)
with open('fang.json', 'w') as f:
    f.write(resultArr)
print('ok')
Scraping Xiamen second-hand housing data from Fang.com
import json
import requests
from bs4 import BeautifulSoup

url = 'http://xm.esf.fang.com/'
html = requests.get(url).text
domain = 'http://xm.esf.fang.com'


def getUrlDetails(url):
    dhtml = requests.get(url).text
    dsoup = BeautifulSoup(dhtml, 'html.parser')
    info = {}
    info['標題'] = dsoup.select('.title h1')[0] and dsoup.select('.title h1')[0].text.strip()
    info['總價'] = dsoup.select('.tab-cont-right .price_esf')[0].text
    for item in dsoup.select('.tab-cont-right .trl-item1'):
        info[item.select('.font14')[0].text] = item.select('.tt')[0].text.strip()
    info['地址'] = dsoup.select('.tab-cont-right .trl-item2 .rcont')[0].text.strip()[0:-2]
    for item in dsoup.select('.zf_new_left .cont .text-item'):
        st_split = item.text.strip().split('\n')
        while '' in st_split:
            st_split.remove('')
        while '\r' in st_split:
            st_split.remove('\r')
        if len(st_split) > 2:
            st_split = [st_split[0]] + [''.join(st_split[1:])]
        k, v = st_split
        info[k] = v.strip()
    return info


if __name__ == '__main__':
    soup = BeautifulSoup(html, 'html.parser')
    resultArr = []
    for house in soup.select('.shop_list dl'):
        if house.select('h4 a'):
            resUrl = domain + house.select('h4 a')[0]['href']
            if getUrlDetails(resUrl):
                resultArr.append(getUrlDetails(resUrl))
    result = json.dumps(resultArr)
    print('爬取完畢')
    with open('house.json', 'w') as f:
        f.write(result)
    print('寫入完畢')
Scraping Lagou job postings (JSON format)
# coding=utf-8
import json
import time

import requests
import xlwt

url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'


# Fetch the JSON object holding the job postings and collect, for each posting:
# company name, benefits, location, education requirement, job category,
# publish time, job title, salary and required years of experience.
def getJobRow(url, datas):
    time.sleep(10)
    header = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        'Host': 'www.lagou.com',
        'Origin': 'https://www.lagou.com',
        'Referer': 'https://www.lagou.com/jobs/list_?labelWords=&fromSearch=true&suginput='
    }
    cookie = {
        'Cookie': 'JSESSIONID=ABAAABAAAIAACBI80DD5F5ACDEA0EB9CA0A1B926B8EAD3C; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1539844668; _ga=GA1.2.439735849.1539844668; _gid=GA1.2.491577759.1539844668; user_trace_token=20181018143747-53713f4a-d2a0-11e8-814e-525400f775ce; LGSID=20181018143747-53714082-d2a0-11e8-814e-525400f775ce; PRE_UTM=; PRE_HOST=; PRE_SITE=; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; LGUID=20181018143747-53714251-d2a0-11e8-814e-525400f775ce; index_location_city=%E4%B8%8A%E6%B5%B7; TG-TRACK-CODE=index_search; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1539844675; LGRID=20181018143754-57b25578-d2a0-11e8-bdc4-5254005c3644; SEARCH_ID=d0a97cea1d1d47d0afa41b3f298f41d5'
    }
    result_data = requests.post(url=url, cookies=cookie, headers=header, data=datas).json()
    content = result_data['content']['positionResult']['result']
    info_item = []
    for i in content:
        information = []
        information.append(i['positionId'])        # position ID
        information.append(i['companyFullName'])   # full company name
        information.append(i['companyLabelList'])  # benefits
        information.append(i['district'])          # work location
        information.append(i['education'])         # education requirement
        information.append(i['firstType'])         # job category
        information.append(i['formatCreateTime'])  # publish time
        information.append(i['positionName'])      # job title
        information.append(i['salary'])            # salary
        information.append(i['workYear'])          # years of experience
        info_item.append(information)
    return info_item


def main():
    city = input('請輸入爬取的城市:')
    page = int(input('請輸入爬取的頁數:'))
    kd = input('請輸入爬取關鍵詞:')
    info_result = []
    title = ['崗位id', '公司全名', '福利待遇', '工做地點', '學歷要求', '工做類型', '發佈時間', '職位名稱', '薪資', '工做年限']
    info_result.append(title)
    for x in range(1, page + 1):
        datas = {
            'first': True,
            'pn': x,
            'kd': kd,
            'city': city
        }
        info = getJobRow(url, datas)
        info_result = info_result + info
        print(info_result, 'info_result')
    # The rows written to Excel are assembled as: [[header], [row], [row], ...]
    workbook = xlwt.Workbook(encoding='utf-8')
    worksheet = workbook.add_sheet('lagou' + kd, cell_overwrite_ok=True)
    for i, row in enumerate(info_result):
        for j, col in enumerate(row):
            worksheet.write(i, j, col)  # (row, column) position and cell content
    workbook.save('lagou' + kd + city + '.xls')


if __name__ == '__main__':
    main()
Data cleaning
Very little time is actually left for the analysis itself, so you must make good use of the tools.
Data analysis: pandas.
Working with data: table-like format; under the hood pandas is built on numpy.
numpy characteristics: implemented in C/C++ and Fortran. Structured data produced directly with numpy has its shortcomings; pandas, which builds on top of numpy, makes working with data much more convenient.
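A small sketch of the difference (the names and ages are made-up data): a numpy structured array already carries field names, but pandas wraps the same data in a far more convenient DataFrame.

import numpy as np
import pandas as pd

# a numpy structured array: usable, but clumsy to filter and extend
arr = np.array([('Mary', 24), ('John', 30)],
               dtype=[('name', 'U10'), ('age', 'i4')])
print(arr['name'])

# the same data as a pandas DataFrame
df = pd.DataFrame(arr)
print(df)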
pandas adds the Series structure: a one-dimensional object built from an array or a list. A Series can be accessed through its index; by default it is numbered from 0 up to the length of the Series.
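A quick sketch of the default indexing (the values are made up):

import pandas as pd

s = pd.Series(['Mary', 'John', 'Anna'])   # built from a list
print(s[0])        # access an element through its index
print(s.index)     # RangeIndex(start=0, stop=3, step=1): 0 up to the length of the Series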
Data filtering
Accessing and slicing elements:
df.ix[1]                        # one record by position (.ix is deprecated; prefer .loc/.iloc)
df.ix[1:4]                      # records 1 to 4
df['name']                      # one column by name
df[['name', 'age']]             # several columns by name
df.loc[1:2, ['name', 'age']]    # filter by index range and column names
df['gender'] == 'M'             # element-wise comparison, returns True/False per row
df[df['gender'] == 'M']         # the rows of the table that satisfy the condition
df[(df['gender'] == 'M') & (df['age'] >= 30)]   # & intersects conditions
df[(df['gender'] == 'M') | (df['age'] >= 30)]   # | unions conditions
df['employee'] = True           # add a column
del df['employee']              # delete a column
df = df.drop('employee', axis=1)                # delete a column
df.loc[6] = {'age': 18, 'gender': 'F', 'name': 'aa'}   # add a record
df.append(pd.DataFrame([{'age': 18, 'gender': 'F', 'name': 'aa'}]), ignore_index=True)   # add a record
df = df.drop(6)                 # delete a record
df['userid'] = range(101, 117)          # add a column to use as the new index
df.set_index('userid', inplace=True)    # set the new index
df.iloc[1]                      # fetch by position after re-indexing
df.iloc[1:3]                    # fetch a slice by position
Three ways of fetching values
df.ix[[101, 102]]    # by label with ix (deprecated), here the userid
df.loc[[101, 105]]   # by label with loc, here the userid
df.iloc[1, 2]        # by integer position with iloc
Detecting missing values
Placeholder: numpy.nan from numpy marks a missing value.
pd.DataFrame(['a', numpy.nan])
Check a series for missing values:
df['gender'].notnull()   # rows whose value is not missing
df['gender'].isnull()    # rows whose value is missing
Check whether a column or the whole DataFrame contains missing values:
df.name.isnull().values.any()   # does this column contain missing values?
df.isnull().values.any()        # does the DataFrame contain missing values? returns True or False
Count the missing values:
df.isnull().sum()        # number of missing values per column
df.isnull().sum().sum()  # total number of missing values
Filling in missing values
Dropping missing values:
df.dropna()                   # drop rows containing any missing value
df.dropna(how='all')          # drop rows where every column is NaN
df.dropna(thresh=2)           # keep only rows with at least two non-missing values
df['age'] = numpy.nan         # add a column that is entirely missing
df.dropna(axis=1, how='all')  # drop columns that are entirely missing
Filling missing values:
df.fillna(0)                          # fill missing values with 0
df['age'].fillna(df['age'].mean())    # fill with the column mean
df['age'].fillna(df.groupby('gender')['age'].transform('mean'))   # fill with the mean age of each gender
df.fillna(method='pad')               # forward fill: propagate the previous value
df.fillna(method='bfill', limit=2)    # backward fill, at most two rows
Marking unwanted or special values as np.nan:
df.loc[df['物業費用'] == '暫無資料', '物業費用'] = np.nan   # replace '暫無資料' ("no data yet") with np.nan
View the first three rows: df.head(3)
View the last three rows: df.tail(3)
View DataFrame info: df.info()
View column names: df.columns
View column dtypes: df.dtypes
Descriptive statistics: df.describe()
Check for missing values: df.isnull().any()
Count missing values: df.isnull().sum()
Share of missing values in the whole data set: df.isnull().sum() / df.count()
Inspect the values of a particular column: df['volume'].value_counts()
Fill missing values: df['volume'].fillna(0)
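Putting the checks above together on a hypothetical house.csv (the file name and the volume column are assumptions):

import pandas as pd

df = pd.read_csv('house.csv')              # hypothetical input file
print(df.head(3))                          # first three rows
df.info()                                  # column names, dtypes and non-null counts
print(df.isnull().sum() / df.count())      # share of missing values per column
df['volume'] = df['volume'].fillna(0)      # fill the missing values of one column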
Data transformation
How do we clean and transform the data? With vectorized computation.
Computing a new value:
df['總價'] * 10000   # compute the new price
Computing a new price with a numpy function:
import numpy as np
np.sqrt(df['總價'])
Concatenating two columns:
df['朝向'] + df['戶型']
Computing a derived column:
df['均價'] = df['總價'] * 1000 / df['建築面積']   # average price = total price / floor area
map: apply a function to every element of a column (a Series).
def removeDollar(e):
    return e.split('萬')[0]

df['總價'].map(removeDollar)                # strip the trailing '萬' character from the 總價 column
df['總價'].map(lambda e: e.split('萬')[0])   # the same written as a lambda
apply: apply a function to the rows or columns of a DataFrame.
df.apply(lambda e: e.max() - e.min(), axis=1)   # axis=0 applies per column, axis=1 per row
applymap: apply a function to every element of a DataFrame.
import numpy as np
df.applymap(lambda e: np.nan if e == '暫無資料' else e)   # replace every '暫無資料' element with NaN
'''
lambda e: np.nan if e == '暫無資料' else e
is equivalent to:
def convertNaN(e):
    if e == '暫無資料':
        return np.nan
    else:
        return e
'''
Handling dates and times
Current time:
from datetime import datetime
current_time = datetime.now()
Formatting a datetime as a string:
current_time.strftime('%Y-%m-%d')
Parsing a string into a datetime:
datetime.strptime('2018-08-17', '%Y-%m-%d')
Going back one day:
from datetime import timedelta
current_time - timedelta(days=1)
Going back ten days:
from datetime import timedelta
for i in range(1, 10):
    dt = current_time - timedelta(days=i)
    print(dt.strftime('%Y-%m-%d'))   # dates for several days back
current_time - timedelta(days=10)
Converting a datetime into a UNIX timestamp:
from time import mktime
mktime(current_time.timetuple())   # mktime needs a time tuple
Converting a UNIX timestamp into a datetime:
datetime.fromtimestamp(1538202783)
Converting times in pandas:
import pandas as pd
df['日期'] = pd.to_datetime(df['日期'], format='%Y年%m月%d日')   # default output uses '-', e.g. 2018-9-29
Reshaping data
Creating dummy variables:
pandas.get_dummies(df['朝向'])   # build dummy variables
df = pandas.concat([df, pandas.get_dummies(df['朝向'])], axis=1)   # merge the dummies back into the DataFrame
df.drop('朝向', axis=1)          # drop the original column
Building a pivot table with pivot_table:
df2 = df.pivot_table(index='單價', columns='產權年限', values='參考均價', aggfunc=sum)
df2.head()
# index: row labels, columns: column labels, values: the field aggregated by aggfunc
# df2.T transposes the pivot table
df3 = df.pivot_table(index=['產權性質', '產權年限'], columns='日期', values='總價', aggfunc=sum)
df3.head()
Extracting data with regular expressions.
Character classes [] match a set of characters; the quantifiers *?, +?, ?? switch matching to non-greedy mode (match as little as possible), as in the sketch below.
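A small sketch of greedy versus non-greedy matching (the tag-like string is made up):

import re

text = '<a><b>'
print(re.findall(r'<(.+)>', text))    # greedy: ['a><b']
print(re.findall(r'<(.+?)>', text))   # non-greedy: ['a', 'b']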
Named groups let you retrieve the captured text by group name:
import re

m = re.match(r'(?P<first_name>\w+) (?P<last_name>\w+)', 'David Chiu')
print(m.group('first_name'), m.group('last_name'))
str1 = 'scp file.text root@10.0.0.1:./'
m = re.search(r'^scp ([\w\.]+) (\w+)@([\w\.]+):(.+)', str1)
if m:
    print(m.group(1), m.group(2), m.group(3), m.group(4))
Using regular expressions on a DataFrame:
df[['室', '廳', '衛']] = df['戶型'].str.extract(r'(\d+)室(\d+)廳(\d+)衛', expand=False)   # extract rooms, halls and bathrooms from the 戶型 column
Scraping Sina news:
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs


def getDetails(url, idx):
    if idx > 5:        # only fetch the first few articles
        return
    print(url, 'url')
    res = requests.get(url)
    res.encoding = 'utf-8'
    d = bs(res.text, 'html.parser')
    title = d.select('.main-title')[0].text
    create_time = d.select('.date-source')[0].select('.date')[0].text
    source = d.select('.date-source')[0].select('.source')[0].text
    article = ' '.join(d.select('#article')[0].text.split())
    keywords = d.select('#keywords')[0].text
    return {
        'title': title,
        'create_time': create_time,
        'source': source,
        'article': article,
        'keywords': keywords
    }


if __name__ == '__main__':
    url = 'https://news.sina.com.cn/china/'
    res = requests.get(url)
    res.encoding = 'utf-8'
    dsoup = bs(res.text, 'html.parser')
    news_herf = [h['href'] for h in dsoup.select('.left-content-1 div')[3].select('li a')]
    newArr = []
    resultArr = []
    for idx, new in enumerate(news_herf):
        t = getDetails(new, idx)
        if t:
            newArr.append(t)
    df = pd.DataFrame(newArr)
    df['keywords'] = df['keywords'].apply(lambda i: i.split(':')[1].split())
    df['create_time'] = pd.to_datetime(df['create_time'], format=r'%Y年%m月%d日 %H:%M')
    df = df[['title', 'source', 'create_time', 'keywords', 'article']]   # reorder the columns
    df.to_json('news.json')
    print('ok')
Descriptive statistics
Summarize the data systematically to understand its overall shape.
Describe the sample with statistics such as the mean, standard deviation, frequency counts and percentages.
Visualize the data, turning summaries into charts.
In practice the emphasis is often on descriptive statistics and on visualizing the data.
Fetching stock prices: pandas-datareader
import pandas_datareader as pdd

df = pdd.DataReader('BABA', data_source='yahoo')
print(df.tail())
Simple statistics on a single column:
Sum: df['volume'].sum()
Mean: df['volume'].mean()
Standard deviation: df['volume'].std()
Minimum: df['volume'].min()
Maximum: df['volume'].max()
Number of records: df['volume'].count()
Overall descriptive statistics: df.describe()
import pandas_datareader as pdd

df = pdd.DataReader('BABA', data_source='yahoo')
# compute daily rise/fall
df['diff'] = df['Close'] - df['Open']
df['rise'] = df['diff'] > 0
df['fall'] = df['diff'] < 0
# compute the daily return
df['ret'] = df['Close'].pct_change(1)
print(df[['rise', 'fall']].sum())   # count the rises and falls
# print(df[df.index > '2018-08-01'])
print(df.loc[df.index > '2018-08-01', ['rise', 'fall']].sum())   # rises and falls since 2018-08-01
print(df.groupby([df.index.year, df.index.month])[['rise', 'fall']].sum())   # rises and falls grouped by year and month
print(df.groupby([df.index.year, df.index.month])['ret'].mean())   # mean monthly return grouped by year and month
Inferential statistics
Building models of the data
Inferring the characteristics of the population from a sample
Correlation, regression, factor analysis
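A minimal sketch of correlation and a simple linear regression with pandas and numpy; the area and price columns here are made-up sample values, not taken from the housing data above:

import numpy as np
import pandas as pd

df = pd.DataFrame({'area': [50, 80, 100, 120],
                   'price': [150, 230, 310, 360]})
print(df['area'].corr(df['price']))                          # Pearson correlation coefficient
slope, intercept = np.polyfit(df['area'], df['price'], 1)    # fit price ~ slope * area + intercept
print(slope, intercept)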
Humans are visual animals: roughly seventy percent of what we take in comes through the eyes and about thirty percent through the other senses (mouth, nose, ears and so on).
What infographics are for
Storytelling
Exploration
Information visualization
Plotting charts with pandas
Requires the matplotlib module.
import pandas_datareader as pdd

df = pdd.DataReader('BABA', data_source='yahoo')
# line chart
df['Close'].plot(kind='line', figsize=[10, 5], legend=True, title='BABA', grid=True)
# legend=True shows the legend, grid=True draws grid lines
# 30-day moving average
df['mvg30'] = df['Close'].rolling(window=30).mean()
df[['Close', 'mvg30']].plot(kind='line', legend=True, figsize=[10, 5])
# bar chart
df.loc[df.index >= '2017-04-01', 'Volume'].plot(kind='bar', figsize=[10, 5], title='BABA', legend=True)
# pie chart
df['diff'] = df['Close'] - df['Open']
df['rise'] = df['diff'] > 0
df['fall'] = df['diff'] < 0
df[['rise', 'fall']].sum().plot(kind='pie', figsize=[5, 5], counterclock=True, startangle=90, legend=True)
Databases store data in a structured way, so that users can query and maintain it quickly through the Structured Query Language (SQL).
ACID principles: Atomicity, Consistency, Isolation, Durability.
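A small sketch of what atomicity means in practice with sqlite3 (the Account table and the two updates are hypothetical): either both statements are committed, or on an error neither is applied.

import sqlite3 as lite

con = lite.connect('test.sqlite')
try:
    cur = con.cursor()
    cur.execute('CREATE TABLE IF NOT EXISTS Account(name TEXT, balance REAL)')
    cur.execute('UPDATE Account SET balance = balance - 100 WHERE name = "A"')
    cur.execute('UPDATE Account SET balance = balance + 100 WHERE name = "B"')
    con.commit()        # both updates become visible together
except Exception:
    con.rollback()      # on failure, neither update is applied
finally:
    con.close()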
The sqlite3 package:
import sqlite3 as lite

con = lite.connect('test.sqlite')        # connect
cur = con.cursor()                       # cursor
cur.execute('SELECT SQLITE_VERSION()')   # execute a statement
data = cur.fetchone()                    # fetch one row
print(data)
con.close()
Inserting and querying:
import sqlite3 as lite

with lite.connect('test.sqlite') as con:
    cur = con.cursor()
    cur.execute('DROP TABLE IF EXISTS PhoneAddress')
    cur.execute('CREATE TABLE PhoneAddress(phone CHAR(10) PRIMARY KEY, address TEXT, name TEXT unique, age INT NOT NULL)')
    cur.execute('INSERT INTO PhoneAddress VALUES("245345345", "United", "Jsan", 50)')
    cur.execute('SELECT phone,address FROM PhoneAddress')
    data = cur.fetchall()
    for rec in data:
        print(rec[0], rec[1])
fetchone and fetchall retrieve data through the cursor; all of the operation logic is built on top of the cursor.
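Since every read goes through the cursor, bulk inserts usually go through the same cursor with placeholders; a small sketch (the table name and the rows are hypothetical):

import sqlite3 as lite

rows = [('245345345', 'United', 'Jsan', 50),
        ('245345346', 'Berlin', 'Anna', 31)]
with lite.connect('test.sqlite') as con:
    cur = con.cursor()
    cur.execute('CREATE TABLE IF NOT EXISTS PhoneAddress2(phone CHAR(10) PRIMARY KEY, address TEXT, name TEXT, age INT)')
    cur.executemany('INSERT INTO PhoneAddress2 VALUES (?, ?, ?, ?)', rows)   # ? placeholders avoid manual string formatting
    cur.execute('SELECT phone, name FROM PhoneAddress2')
    print(cur.fetchall())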
Storing data with pandas
import sqlite3 as lite
import pandas

employee = [{
    'name': 'Mary',
    'age': 24,
    'gender': 'F'
}]
df = pandas.DataFrame(employee)
with lite.connect('test.sqlite') as db:
    cur = db.cursor()
    df.to_sql(name='employee', index=False, con=db, if_exists='replace')
    d = pandas.read_sql('SELECT * FROM employee', con=db)
    # any SQL statement (aggregation, ordering, ...) can be used here; pandas returns a DataFrame
    print(d)
    cur.execute('SELECT * FROM employee')
    data = cur.fetchone()
    print(data)
Fetching the RMB central parity rate from the State Administration of Foreign Exchange (SAFE)
Fetching and processing the data:
import sqlite3 as lite
from datetime import datetime, timedelta

import pandas as pd
import requests
from bs4 import BeautifulSoup

url = 'http://www.safe.gov.cn/AppStructured/hlw/RMBQuery.do'
ua = 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko Core/1.63.6726.400 QQBrowser/10.2.2265.400'
headers = {
    'User-Agent': ua,
    'Content-Type': 'text/html;charset=UTF-8'
}


def getCurrency(start, end):
    payload = {
        'startDate': start,
        'endDate': end,
        'queryYN': True
    }
    html = requests.post(url, data=payload, headers=headers).text
    soup = BeautifulSoup(html, 'html.parser')
    dfs = pd.read_html(str(soup.select('#InfoTable')[0]), header=0)[0]   # parse the HTML table into a DataFrame
    # soup.select('#InfoTable')[0].prettify('UTF-8') produced garbled Chinese during testing
    dfs = pd.melt(dfs, col_level=0, id_vars='日期')
    dfs.columns = ['date', 'currency', 'exchange']
    with lite.connect('currency.sqlite') as db:
        dfs.to_sql(name='currency', index=None, con=db, if_exists='append')
        cur = db.cursor()
        cur.execute('SELECT * FROM currency')
        data = cur.fetchall()
        print(len(data))


if __name__ == '__main__':
    current_time = datetime.now()
    for i in range(1, 300, 30):
        start_date = (current_time - timedelta(days=i + 30)).strftime('%Y-%m-%d')
        end_date = (current_time - timedelta(days=i + 1)).strftime('%Y-%m-%d')
        print(start_date, end_date)
        getCurrency(start_date, end_date)
Plotting the data:
import sqlite3 as lite
import pandas as pd

with lite.connect('currency.sqlite') as db:
    df = pd.read_sql('SELECT * FROM currency', con=db)
    df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
    df.index = df.date
    print(df.head())
    df.plot(kind='line', rot=30, color='blue')