pd.value_counts(df.column_name) df.column_name.value_counts() Series.value_counts(normalize=False, sort=True, ascending=False, bins=None, dropna=True)[source] Return a Series containing counts of unique values.
參數詳解
normalize : boolean, default False If True then the object returned will contain the relative frequencies of the unique values. sort : boolean, default True Sort by values. ascending : boolean, default False Sort in ascending order. bins : integer, optional Rather than count values, group them into half-open bins, a convenience for pd.cut, only works with numeric data. dropna : boolean, default True Don’t include counts of NaN.
參數示例講解
index = pd.Index([3, 1, 2, 3, 4, np.nan]) index.value_counts() Out[144]: 3.0 2 4.0 1 2.0 1 1.0 1 dtype: int64 index.value_counts(normalize=True) Out[145]: 3.0 0.4 4.0 0.2 2.0 0.2 1.0 0.2 dtype: float64 index.value_counts(bins=3) Out[146]: (2.0, 3.0] 2 (0.996, 2.0] 2 (3.0, 4.0] 1 dtype: int64 index.value_counts(dropna=False) Out[148]: 3.0 2 NaN 1 4.0 1 2.0 1 1.0 1 dtype: int64
In [21]: data=pd.DataFrame(pd.Series([1,2,3,4,5,6,11,1,1,1,1,2,2,2,2,3]).values.reshape(4,4),columns=['a','b','c','d']) In [22]: data Out[22]: a b c d 0 1 2 3 4 1 5 6 11 1 2 1 1 1 2 3 2 2 2 3 In [23]: pd.value_counts(data.a) Out[23]: 1 2 2 1 5 1 Name: a, dtype: int64 In [26]: pd.value_counts(data.a).sort_index() Out[26]: 1 2 2 1 5 1 Name: a, dtype: int64 In [27]: pd.value_counts(data.a).sort_index().index Out[27]: Int64Index([1, 2, 5], dtype='int64') In [28]: pd.value_counts(data.a).sort_index().values Out[28]: array([2, 1, 1], dtype=int64)
# 方式一 cat_uniques = [] for cat in cat_features: cat_uniques.append(len(train[cat].unique())) uniq_values_in_categories = pd.DataFrame.from_items([('cat_name', cat_features), ('unique_values', cat_uniques)]) # 方式二 list(map(lambda x: len(train[x].unique()),cat_features))
def drop_duplicates(self, subset=None, keep='first', inplace=False): """ Return DataFrame with duplicate rows removed, optionally only considering certain columns Parameters ---------- subset : column label or sequence of labels, optional Only consider certain columns for identifying duplicates, by default use all of the columns keep : {'first', 'last', False}, default 'first' - ``first`` : Drop duplicates except for the first occurrence. - ``last`` : Drop duplicates except for the last occurrence. - False : Drop all duplicates. inplace : boolean, default False Whether to drop duplicates in place or to return a copy Returns ------- deduplicated : DataFrame """
dataframe.colname.duplicated()
dataframe.duplicated()
dataframe.duplicated(subset = [])
dataframe.drop_duplicates()
dataframe.duplicated(keep = "first") dataframe.duplicated(keep = "last")
也能夠設置布爾類型,當設爲False時候,重複項將都被顯示。
dataframe.duplicated(keep = False)
import pandas as pd from matplotlib import pyplot as plt import numpy as np pd.set_option('display.max_columns',None) df = pd.read_csv('911.csv') print(df.head(1)) lat lng desc \ 0 40.297876 -75.581294 REINDEER CT & DEAD END; NEW HANOVER; Station ... zip title timeStamp twp \ 0 19525.0 EMS: BACK PAINS/INJURY 2015-12-10 17:10:52 NEW HANOVER addr e 0 REINDEER CT & DEAD END 1 df.info() <class 'pandas.core.frame.DataFrame'> RangeIndex: 249737 entries, 0 to 249736 Data columns (total 9 columns): lat 249737 non-null float64 lng 249737 non-null float64 desc 249737 non-null object zip 219391 non-null float64 title 249737 non-null object timeStamp 249737 non-null object twp 249644 non-null object addr 249737 non-null object e 249737 non-null int64 dtypes: float64(3), int64(1), object(5) memory usage: 17.1+ MB #獲取分類 temp_list = df.title.str.split(':').tolist() cate_list = list(set([i[0] for i in temp_list])) cate_list Out[152]: ['Fire', 'Traffic', 'EMS'] #構造全爲0的數組 zeros_df = pd.DataFrame(np.zeros((df.shape[0],len(cate_list))),columns=cate_list) #賦值 for cate in cate_list: zeros_df[cate][df.title.str.contains(cate)] = 1 print(zeros_df) Fire Traffic EMS 0 0.0 0.0 1.0 1 0.0 0.0 1.0 2 1.0 0.0 0.0 3 0.0 0.0 1.0 4 0.0 0.0 1.0 5 0.0 0.0 1.0 6 0.0 0.0 1.0 7 0.0 0.0 1.0 8 0.0 0.0 1.0 9 0.0 1.0 0.0 10 0.0 1.0 0.0 11 0.0 1.0 0.0 12 0.0 1.0 0.0 13 0.0 1.0 0.0 14 0.0 1.0 0.0 15 0.0 1.0 0.0 16 0.0 0.0 1.0 17 0.0 0.0 1.0 18 0.0 0.0 1.0 19 0.0 1.0 0.0 20 0.0 1.0 0.0 21 0.0 1.0 0.0 22 1.0 0.0 0.0 23 0.0 1.0 0.0 24 0.0 1.0 0.0 25 0.0 0.0 1.0 26 0.0 0.0 1.0 27 1.0 0.0 0.0 28 0.0 1.0 0.0 29 0.0 1.0 0.0 ... ... ... 
249707 0.0 1.0 0.0 249708 1.0 0.0 0.0 249709 0.0 0.0 1.0 249710 0.0 1.0 0.0 249711 0.0 1.0 0.0 249712 0.0 0.0 1.0 249713 1.0 0.0 0.0 249714 1.0 0.0 0.0 249715 0.0 1.0 0.0 249716 0.0 0.0 1.0 249717 0.0 0.0 1.0 249718 1.0 0.0 0.0 249719 0.0 0.0 1.0 249720 0.0 0.0 1.0 249721 0.0 0.0 1.0 249722 0.0 1.0 0.0 249723 0.0 0.0 1.0 249724 0.0 0.0 1.0 249725 0.0 0.0 1.0 249726 1.0 0.0 0.0 249727 1.0 0.0 0.0 249728 0.0 1.0 0.0 249729 0.0 0.0 1.0 249730 0.0 0.0 1.0 249731 0.0 1.0 0.0 249732 0.0 0.0 1.0 249733 0.0 0.0 1.0 249734 0.0 0.0 1.0 249735 1.0 0.0 0.0 249736 0.0 1.0 0.0 [249737 rows x 3 columns] sum_ret = zeros_df.sum(axis=0) print(sum_ret) Fire 37432.0 Traffic 87465.0 EMS 124844.0 dtype: float64
df = pd.read_csv('911.csv') # print(df.head(1)) # print(df.info()) #獲取分類 temp_list = df.title.str.split(':').tolist() cate_list = [i[0] for i in temp_list] df['cate'] = pd.DataFrame(np.array(cate_list).reshape(df.shape[0],1)) print(df.groupby(by='cate').count()['title']) cate EMS 124840 Fire 37432 Traffic 87465 Name: title, dtype: int64
pd.cut(data['col_names'], bins, labels=None)
實例
import numpy import pandas from pandas import read_csv data = read_csv('E:/python/data_analysis/data/dis-cut.csv') bins = [data['年齡'].min()-1,20,30,40,max(data.年齡)+1] labels = ['20歲及如下','21歲到30歲','31歲到40歲','41歲以上'] age_cut = pandas.cut(data.年齡,bins,labels=labels) data['年齡分層'] = age_cut result = data.groupby(by=['年齡分層'])['年齡'].agg(['size','mean']) result.rename(columns= {'size': '人數','mean': '平均年齡'}) Out[171]: 人數 平均年齡 年齡分層 20歲及如下 2061 19.302280 21歲到30歲 46858 25.759081 31歲到40歲 8729 33.095773 41歲以上 1453 50.625602
import pandas from pandas import read_csv data = read_csv('E:/python/data_analysis/data/pivot_table.csv') bins = [data['年齡'].min() - 1, 20, 30, 40, max(data.年齡) + 1] labels = ['20歲及如下', '21歲到30歲', '31歲到40歲', '41歲以上'] age_cut = pandas.cut(data.年齡, bins, labels=labels) data['年齡分層'] = age_cut r1 = data.pivot_table( values=['年齡'], index=['年齡分層'], columns=['性別'], aggfunc=[numpy.size, numpy.mean] ) r2 = data.pivot_table( values=['年齡'], index=['年齡分層'], columns=['性別'], aggfunc=[numpy.std], ) print(r1.index) print(r1['size']['年齡']['女']) print(r1.join(r2)) CategoricalIndex(['41歲以上', '21歲到30歲', '31歲到40歲', '20歲及如下'], categories=['20歲及如下', '21歲到30歲', '31歲到40歲', '41歲以上'], ordered=True, name='年齡分層', dtype='category') 年齡分層 41歲以上 111 21歲到30歲 2903 31歲到40歲 735 20歲及如下 567 Name: 女, dtype: int64 size mean std 年齡 年齡 年齡 性別 女 男 女 男 女 男 年齡分層 41歲以上 111 1950 18.972973 19.321026 1.708053 1.044185 21歲到30歲 2903 43955 25.954874 25.746149 2.453642 2.361298 31歲到40歲 735 7994 33.213605 33.084939 2.316704 2.200319 20歲及如下 567 886 51.691358 49.943567 6.761848 7.914171
import pandas as pd import numpy as np data = pd.DataFrame({'Sample': range(1, 11), 'Gender': ['Female', 'Male', 'Female', 'Male', 'Male', 'Male', 'Female', 'Female', 'Male', 'Female'], 'Handedness': ['Right-handed', 'Left-handed', 'Right-handed', 'Right-handed', 'Left-handed', 'Right-handed', 'Right-handed', 'Left-handed', 'Right-handed', 'Right-handed']}) #假設咱們想要根據性別和用手習慣對這段數據進行#統計彙總。雖然能夠用pivot_table()實現該功#能,可是pandas.crosstab()函數會更方便: # 方法一:用pivot_table # 其實我覺的一點都不麻煩ε=(´ο`*)))唉 data.pivot_table(index=['Gender'], columns='Handedness', aggfunc=len, margins=True) Out[173]: Sample Handedness Left-handed Right-handed All Gender Female 1 4 5 Male 2 3 5 All 3 7 10 # 方法二:用crosstab pd.crosstab(data.Gender, data.Handedness, margins=True) Out[174]: Handedness Left-handed Right-handed All Gender Female 1 4 5 Male 2 3 5 All 3 7 10
具體使用參照 http://www.javashuo.com/article/p-ofkgunav-ea.html
pandas dataframe數據所有輸出,數據太多也不用省略號表示。
pd.set_option('display.max_columns',None)
或者
with pd.option_context('display.max_rows', 10, 'display.max_columns', 5): 代碼邏輯