1、python
# coding:utf8
# !/usr/bin/python
#
# Small demonstrations of ``describe()`` on pandas ``Series`` and
# ``DataFrame`` objects.
#
# Every ``print(...)`` call below takes exactly one argument, a form that is
# valid in both Python 2 and Python 3.

import numpy as np
import pandas as pd


def example2():
    """Print the summary statistics of a numeric ``Series``."""
    s = pd.Series([1, 2, 3])
    print(s.describe())
    # Output recorded by the original author:
    #   count    3.0
    #   mean     2.0
    #   std      1.0
    #   min      1.0
    #   25%      1.5
    #   50%      2.0
    #   75%      2.5
    #   max      3.0
    #   dtype: float64


def example3():
    """Print the summary of a categorical (string-valued) ``Series``."""
    s = pd.Series(['a', 'a', 'b', 'c'])
    print(s.describe())
    # Output recorded by the original author:
    #   count     4
    #   unique    3
    #   top       a
    #   freq      2
    #   dtype: object


def example4():
    """Print the summary of a timestamp ``Series``."""
    s = pd.Series([
        np.datetime64("2000-01-01"),
        np.datetime64("2010-01-01"),
        np.datetime64("2010-01-01"),
    ])
    print(s.describe())
    # Output recorded by the original author:
    #   count                       3
    #   unique                      2
    #   top       2010-01-01 00:00:00
    #   freq                        2
    #   first     2000-01-01 00:00:00
    #   last      2010-01-01 00:00:00
    #   dtype: object
    # NOTE(review): this layout comes from the author's pandas version; the
    # rows reported for datetime data may differ on other releases -- confirm.


def example5():
    """Print the summary of a mixed-type ``DataFrame``.

    By default only numeric fields are returned.
    """
    df = pd.DataFrame({
        'categorical': pd.Categorical(['d', 'e', 'f']),
        'numeric': [1, 2, 3],
        'object': ['a', 'b', 'c'],
    })
    print(df.describe())
    # Further variations, kept for reference (not executed):
    #
    # Describing all columns of a ``DataFrame`` regardless of data type.
    #   print(df.describe(include='all'))
    # Describing a column from a ``DataFrame`` by accessing it as an attribute.
    #   print(df.numeric.describe())
    # Including only numeric columns in a ``DataFrame`` description.
    #   print(df.describe(include=[np.number]))
    # Including only string columns in a ``DataFrame`` description.
    # (the ``np.object`` alias is gone from recent NumPy; use ``object``)
    #   print(df.describe(include=[object]))
    # Including only categorical columns from a ``DataFrame`` description.
    #   print(df.describe(include=['category']))
    # Excluding numeric columns from a ``DataFrame`` description.
    #   print(df.describe(exclude=[np.number]))
    # Excluding object columns from a ``DataFrame`` description.
    #   print(df.describe(exclude=[object]))


def example1():
    """Print aggregate statistics computed over a ``describe()`` table.

    Note that every statistic below is taken over the eight rows of the
    ``describe()`` summary itself (count, mean, std, min, quartiles, max),
    not over the raw data -- which is why ``count()`` reports 8 per column.
    """
    dic1 = {'000': {'a': 1, 'b': 2, 'c': 3}, '001': {'d': 4, 'e': 5, 'f': 6}}
    df2 = pd.DataFrame(dic1)

    # Compute the summary table once instead of once per statistic.
    summary = df2.describe()
    # ``summary`` as recorded by the original author:
    #          000  001
    #   count  3.0  3.0
    #   mean   2.0  5.0
    #   std    1.0  1.0
    #   min    1.0  4.0
    #   25%    1.5  4.5
    #   50%    2.0  5.0
    #   75%    2.5  5.5
    #   max    3.0  6.0

    # (heading, value) pairs, printed in this order.
    reports = [
        ("返回非NAN數據項數量=>count()", summary.count()),
        ("返回中位數,等價第50位百分位數的值=>median()", summary.median()),
        ("返回數據的衆值=>mode()", summary.mode()),
        ("返回數據的標準差(描述離散度)=>std()", summary.std()),
        ("返回方差=>var()", summary.var()),
        ("偏態係數(skewness,表示數據分佈的對稱程度)=>skew()", summary.skew()),
    ]
    for heading, value in reports:
        # Same layout as before: heading, newline, value, blank line.
        print("{heading}\n{value}\n".format(heading=heading, value=value))


def main():
    """Run the demonstration selected by the original author."""
    example1()


if __name__ == '__main__':
    main()
輸出=>less
返回非NAN數據項數量=>count()
000    8
001    8
dtype: int64

返回中位數,等價第50位百分位數的值=>median()
000    2.00
001    4.75
dtype: float64

返回數據的衆值=>mode()
   000  001
0  1.0  5.0
1  2.0  NaN
2  3.0  NaN

返回數據的標準差(描述離散度)=>std()
000    0.801784
001    1.603567
dtype: float64

返回方差=>var()
000    0.642857
001    2.571429
dtype: float64

偏態係數(skewness,表示數據分佈的對稱程度)=>skew()
000    0.000000
001   -1.299187
dtype: float64