pandas是基於numpy構建的庫,在數據處理方面可以把它理解爲numpy的增強版,因爲numpy主要用於科學計算,特長不在於數據處理,我們日常處理的數據通常帶有列標籤和index索引,這時pandas作爲數據分析包而被開發出來。
pandas數據結構(Series/DataFrame)
import pandas as pd
import numpy as np
# --- Creating a Series ---
# Row labels shared by every example below that passes an explicit index.
labels = ["index1", "index2", "index3"]

# From a plain Python list.
a1 = pd.Series(data=[1, 2, 3])
a1
# From a NumPy array.
a2 = pd.Series(data=np.array([1, 2, 3]))
a2
# From a list, with explicit labels as the index.
a3 = pd.Series(data=[1, 2, 3], index=labels)
a3
# From a dict: the keys become the index.
a4 = pd.Series(data={"index1": 1, "index2": 2, "index3": 3})
a4
# From a dict plus an explicit index: requested labels that are not dict keys
# ("index1" here, because the dict key is "index") come out as NaN.
a5 = pd.Series(data={"index": 1, "index2": 2, "index3": 3}, index=labels)
a5
# From a scalar: the value is repeated once per index label.
a6 = pd.Series(data=10, index=labels)
a6

# --- Series attributes ---
a1 = pd.Series(data=[1, 2, 3])
a1.index  # the index of the Series
a1.values  # the values of the Series
a1.name = "population"  # name of the Series itself
a1.index.name = "state"  # name of the Series' index
a1
a1.shape
a1.size
s[indexname]
s.loc[indexname] 推薦
s[loc]
s.iloc[loc] 推薦
s[[indexname1,indexname2]]
s.loc[[indexname1,indexname2]] 推薦
s[[loc1,loc2]]
s.iloc[[loc1,loc2]] 推薦
# --- Selecting elements of a Series ---
a3 = pd.Series([1, 2, 3], index=["index1", "index2", "index3"])
a3
# Single element by label: plain [] and .loc are equivalent here.
a3["index1"]
a3.loc['index1']
# Single element by position. Integer keys passed to plain [] on a
# label-indexed Series used to fall back to positions; that fallback is
# deprecated in pandas, so positional access always goes through .iloc.
a3.iloc[1]
# Several elements by label.
a3[['index1','index2']]
a3.loc[['index1','index2']]
# Several elements by position (.iloc only, for the same reason as above).
a3.iloc[[1,2]]
a3[a3 > np.mean(a3)]  # boolean mask: keep the elements above the mean
a3.iloc[0:2]  # slice by position
a3["index1":"index2"]  # slice by label

# --- Modifying elements ---
a3["index3"] = 100  # overwrite by label
a3
a3.iloc[2] = 1000  # overwrite by position
a3

# --- Adding and removing elements ---
a3["index4"] = 10  # assigning to a label that does not exist yet appends it
a3
a3.drop(["index4", "index3"], inplace=True)  # inplace=True mutates a3 itself
a3

# --- Missing values ---
# NaN is a float, so the Series is created as float64 up front instead of
# relying on the deprecated implicit int64 -> float64 upcast on assignment.
a3 = pd.Series([1, 2, 3], index=["index1", "index2", "index3"], dtype="float64")
a3["index3"] = np.nan  # mark an element as missing (np.NaN was removed in NumPy 2.0)
a3
a3.isnull()  # True where the value is missing
a3.notnull()  # True where the value is present
"index1" in a3  # membership test against the index labels
a3.isin([1,2])  # per-element membership test against the given values
a3.unique()  # distinct values
a3.value_counts()  # distinct values together with their counts
# --- Creating a DataFrame ---
# A dict of equal-length lists: each key becomes one column.
data = {"color": ["green", "red", "blue", "black", "yellow"], "price": [1, 2, 3, 4, 5]}
# Row labels reused by the labelled examples below.
row_labels = ["index1", "index2", "index3", "index4", "index5"]

# From the dict, with the default integer row index.
dataFrame1 = pd.DataFrame(data)
dataFrame1
# From the dict, with explicit row labels.
dataFrame2 = pd.DataFrame(data, index=row_labels)
dataFrame2
# From the dict, with explicit row labels and only the "price" column selected.
dataFrame3 = pd.DataFrame(data, index=row_labels, columns=["price"])
dataFrame3
# From a 3x4 NumPy array.
dataFrame4 = pd.DataFrame(np.arange(12).reshape((3, 4)))
dataFrame4

# Column-oriented input: one list per column, rows labelled through `index`.
dic = {'張三': [150, 150, 150, 300], '李四': [0, 0, 0, 0]}
pd.DataFrame(dic, index=['語文', '數學', '英語', '理綜'])

# Row-oriented input: one inner list per row, columns named through `columns`.
index = ['語文', '數學', '英語', '理綜']
columns = ['李四', '張三']
data = [[0, 150], [0, 150], [0, 150], [0, 300]]
pd.DataFrame(data, index=index, columns=columns)
1.2 通過Series創建
# Two Series keyed by city name; only some cities appear in both.
cars = pd.Series(
    {
        "Beijing": 300000,
        "Shanghai": 350000,
        "Shenzhen": 300000,
        "Tianjian": 200000,
        "Guangzhou": 250000,
        "Chongqing": 150000,
    }
)
cars
cities = {
    "Shanghai": 90000,
    "Foshan": 4500,
    "Dongguan": 5500,
    "Beijing": 6600,
    "Nanjing": 8000,
    "Lanzhou": None,
}
apts = pd.Series(cities, name="price")
apts
# Each Series becomes one column; rows are aligned on the city labels, and a
# city that is missing from one Series gets NaN in that column.
df = pd.DataFrame(dict(apts=apts, cars=cars))
df
1.3 通過dicts的list來構建DataFrame
# Build a DataFrame from a list of dicts: each dict is one row and the dict
# keys become the column names.
data = [{"Beijing": 1000, "Shanghai": 2500, "Nanjing": 9850}, {"Beijing": 5000, "Shanghai": 4600, "Nanjing": 7000}]
pd.DataFrame(data)
# The lookups below use `dataFrame2`, which is not defined in this run of
# statements and must already exist when they execute.
dataFrame2.columns # all column labels of the DataFrame
dataFrame2.index # all row labels of the DataFrame
dataFrame2.values # all values of the DataFrame
dataFrame2["color"]["index1"] # single value by label: column first, then row (the reverse order fails)
dataFrame2.at["index1", "color"] # single value by label: row first, then column (the reverse order fails)
dataFrame2.iat[0, 1] # single value by position: row 0, column 1
# Rebuild the labelled example frame: columns "color" and "price", rows
# labelled "index1" .. "index5".
data = {"color": ["green", "red", "blue", "black", "yellow"], "price": [1, 2, 3, 4, 5]}
dataFrame2 = pd.DataFrame(data=data, index=["index1", "index2", "index3", "index4", "index5"])
dataFrame2
# --- Selecting a single row or column ---
dataFrame2.loc["index1"] # one row, by label
dataFrame2.iloc[0] # one row, by position
dataFrame2.iloc[0:2] # iloc addresses rows and columns purely by position, like indexing a 2-D NumPy ndarray
dataFrame2.loc[:, "price"] # one column, by label
dataFrame2.iloc[:, 0] # one column, by position
dataFrame2.values[0] # one row, taken from the underlying value array
dataFrame2["price"] # one column by name; plain [] with a name selects columns, not rows
dataFrame2["color"]
dataFrame2.head(5) # first 5 rows
dataFrame2.tail(5) # last 5 rows
# --- Selecting several rows or columns ---
dataFrame2["index1":"index4"] # several rows, sliced by label
dataFrame2[0:4] # several rows, sliced by position
dataFrame2.loc[["index1", "index2"]] # several rows, by label list
dataFrame2.iloc[[0, 1]] # several rows, by position list
dataFrame2.loc[:, ["price"]] # several columns, by label list
dataFrame2.iloc[:, [0, 1]] # several columns, by position list
dataFrame2.loc[["index1", "index3"], ["price"]] # rows and columns together, by label
dataFrame2.iloc[[1, 2], [0]] # rows and columns together, by position
# --- Adding and overwriting data ---
# The statements below mutate dataFrame2 in place, and each one depends on the
# state left by the previous one.
# "index6" is not an existing row label, so this assignment adds a new row
# with every column set to 10.
dataFrame2.loc["index6"]=10
dataFrame2
# Position 5 is the row that was just added; overwrite it by position.
dataFrame2.iloc[5] = 10
dataFrame2
# Add another new row, "index7", with every column set to 100.
dataFrame2.loc["index7"] = 100
dataFrame2
# "size" is not an existing column, so this assignment adds it with the same
# value in every row.
dataFrame2.loc[:, "size"] = "small"
dataFrame2
# Column position 2 is the "size" column added above; overwrite the whole column.
dataFrame2.iloc[:, 2] = 10
dataFrame2
# Overwrite one cell by position: row 0, column 1 ("price").
dataFrame2.iloc[0, 1] = 10
dataFrame2
# Overwrite one cell by label.
dataFrame2.at["index1", "price"] = 100
dataFrame2
# Overwrite the same cell again, this time by position.
dataFrame2.iat[0, 1] = 1000
dataFrame2
# --- Removing a column ---
# With inplace=False, drop() returns a new frame without "price" and leaves
# dataFrame2 itself unchanged.
a=dataFrame2.drop(["price"], axis=1, inplace=False)
dataFrame2
a
8.1刪除NaN數據
# Drop every column that holds at least one NaN. inplace is left at its
# default (False), so the result is a new frame and `df` keeps its data.
re = df.dropna(axis="columns")
df
re
# Three frames with identical columns, filled with 0, 1 and 2 respectively.
# df1 must be defined here: the concat calls below use it, and no earlier
# statement in this file assigns it.
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df1
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df2
df3 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['a', 'b', 'c', 'd'])
df3
# Stack the frames vertically; ignore_index=True renumbers the rows 0..8.
pd.concat([df1, df2, df3], axis=0, ignore_index=True)
# ignore_index=False keeps each frame's own row labels, so 0, 1, 2 repeat.
pd.concat([df1, df2, df3], axis=0, ignore_index=False)
join參數用法
# Two frames whose columns (a-d vs b-e) and row labels (1-3 vs 2-4) only
# partly overlap.
df1 = pd.DataFrame(np.zeros((3, 4)), index=[1, 2, 3], columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)), index=[2, 3, 4], columns=['b', 'c', 'd', 'e'])
# join='outer' (the default) keeps every column; cells for a column that one
# frame lacks are filled with NaN.
pd.concat([df1, df2], join='outer', sort=False)
# join='inner' keeps only the columns present in both frames.
pd.concat([df1, df2], join='inner', sort=False, ignore_index=True)
join_axes參數用法
# Concatenate side by side and keep only the rows of df1's index. The
# `join_axes` argument was removed from pd.concat in pandas 1.0; reindexing
# the concatenated result on df1.index is the documented replacement.
pd.concat([df1, df2], axis=1).reindex(df1.index)
# Append the rows of one frame to another.
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result.
# ignore_index=True renumbers the combined rows 0..5.
re = pd.concat([df1, df2], ignore_index=True)
re
append一組數據
# Append a single row, given as a Series, to a frame.
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
s = pd.Series([4, 4, 4, 4], index=['a', 'b', 'c', 'd'])
# DataFrame.append was removed in pandas 2.0. The Series is turned into a
# one-row frame (its index labels become the columns) and concatenated;
# ignore_index=True renumbers the rows 0..3.
re = pd.concat([df1, s.to_frame().T], ignore_index=True)
re
# Merge two frames on one shared column. on='KEY' needs a 'KEY' column in both
# frames, and none of the frames assigned to df1/df2 before this point has
# one, so two small sample frames are defined here.
df1 = pd.DataFrame({'KEY': ['K1', 'K2', 'K3'],
                    'A': ['A1', 'A2', 'A3'],
                    'B': ['B1', 'B2', 'B3']})
df1
df2 = pd.DataFrame({'KEY': ['K1', 'K2', 'K3'],
                    'C': ['C1', 'C2', 'C3'],
                    'D': ['D1', 'D2', 'D3']})
df2
# Rows are matched where the 'KEY' values are equal.
re = pd.merge(df1, df2, on='KEY')
re
基於某兩列進行合併
# Two frames that share a pair of key columns, KEY1 and KEY2.
df1 = pd.DataFrame(
    {
        'A': ['A1', 'A2', 'A3'],
        'B': ['B1', 'B2', 'B3'],
        'KEY1': ['K1', 'K2', 'K0'],
        'KEY2': ['K0', 'K1', 'K3'],
    }
)
df2 = pd.DataFrame(
    {
        'C': ['C1', 'C2', 'C3'],
        'D': ['D1', 'D2', 'D3'],
        'KEY1': ['K0', 'K2', 'K1'],
        'KEY2': ['K1', 'K1', 'K0'],
    }
)
# Rows match only when both key columns agree.
# how may be 'left', 'right', 'outer' or 'inner'.
key_columns = ['KEY1', 'KEY2']
re = df1.merge(df2, how='inner', on=key_columns)
re
按index合併
# Two frames whose row labels overlap on K0 and K1 only.
df1 = pd.DataFrame(
    {'A': ['A1', 'A2', 'A3'], 'B': ['B1', 'B2', 'B3']},
    index=['K0', 'K1', 'K2'],
)
df2 = pd.DataFrame(
    {'C': ['C1', 'C2', 'C3'], 'D': ['D1', 'D2', 'D3']},
    index=['K0', 'K1', 'K3'],
)
# Use the row labels of both frames as the join key. how='outer' keeps every
# label from either side and fills the cells without a match with NaN.
re = df1.merge(df2, how='outer', left_index=True, right_index=True)
re
爲列加後綴
# Both frames have an 'age' column in addition to the join key 'id'.
df_boys = pd.DataFrame({'id': ['1', '2', '3'], 'age': ['23', '25', '18']})
df_girls = pd.DataFrame({'id': ['1', '2', '3'], 'age': ['18', '18', '18']})
# suffixes are appended to the clashing column names, left frame first, so the
# result has 'age_boys' and 'age_girls'.
re = df_boys.merge(df_girls, on='id', suffixes=['_boys', '_girls'])
re