001_python實現數據分析

時間 2019-11-13

原文鏈接

1、python

#!/usr/bin/python
# coding:utf8
# import numpy as np
import pandas as pd
import np

def example2():
    """Print the ``describe()`` summary of a numeric ``Series``.

    Builds ``pd.Series([1, 2, 3])`` and writes its summary statistics to
    standard output.

    :return: None
    """
    s = pd.Series([1, 2, 3])
    # A parenthesised single-argument print is valid both as the Python 2
    # print statement and as the Python 3 print function.
    print(s.describe())
    # Output recorded by the original author:
    #   count    3.0
    #   mean     2.0
    #   std      1.0
    #   min      1.0
    #   25%      1.5
    #   50%      2.0
    #   75%      2.5
    #   max      3.0
    #   dtype: float64
def example3():
    """Print the ``describe()`` summary of a categorical (string) ``Series``.

    Builds ``pd.Series(['a', 'a', 'b', 'c'])`` and writes its summary to
    standard output.

    :return: None
    """
    s = pd.Series(['a', 'a', 'b', 'c'])
    # A parenthesised single-argument print is valid both as the Python 2
    # print statement and as the Python 3 print function.
    print(s.describe())
    # Output recorded by the original author:
    #   count     4
    #   unique    3
    #   top       a
    #   freq      2
    #   dtype: object
def example4():
    """Print the ``describe()`` summary of a timestamp ``Series``.

    Builds a ``Series`` from three ``np.datetime64`` values (one in 2000,
    two identical ones in 2010) and writes its summary to standard output.

    NOTE(review): this relies on the module-level name ``np``, which the
    file binds with ``import np`` rather than ``import numpy as np`` --
    confirm that it resolves to NumPy in the target environment.

    :return: None
    """
    s = pd.Series([
        np.datetime64("2000-01-01"),
        np.datetime64("2010-01-01"),
        np.datetime64("2010-01-01"),
    ])
    # A parenthesised single-argument print is valid both as the Python 2
    # print statement and as the Python 3 print function.
    print(s.describe())
    # Output recorded by the original author (which fields are reported for
    # datetime data presumably depends on the installed pandas version --
    # verify against the version in use):
    #   count                       3
    #   unique                      2
    #   top       2010-01-01 00:00:00
    #   freq                        2
    #   first     2000-01-01 00:00:00
    #   last      2010-01-01 00:00:00
    #   dtype: object
def example5():
    """Print the default ``describe()`` summary of a mixed-type ``DataFrame``.

    The frame has one categorical column, one numeric column and one object
    (string) column. ``describe()`` is called without arguments, which by
    default reports only the numeric fields.

    :return: None
    """
    df = pd.DataFrame({'categorical': pd.Categorical(['d', 'e', 'f']),
                       'numeric': [1, 2, 3],
                       'object': ['a', 'b', 'c']})
    # A parenthesised single-argument print is valid both as the Python 2
    # print statement and as the Python 3 print function.
    print(df.describe())
    # Other ways to call describe() on the same frame (not executed here):
    #   all columns regardless of data type:
    #       print(df.describe(include='all'))
    #   a single column accessed as an attribute:
    #       print(df.numeric.describe())
    #   only numeric columns:
    #       print(df.describe(include=[np.number]))
    #   only string (object) columns:
    #       print(df.describe(include=[object]))
    #   only categorical columns:
    #       print(df.describe(include=['category']))
    #   everything except numeric columns:
    #       print(df.describe(exclude=[np.number]))
    #   everything except object columns:
    #       print(df.describe(exclude=[object]))
    # The original notes spelled the object dtype as ``np.object``; that alias
    # was removed in NumPy 1.24, so the builtin ``object`` is shown instead.
def example1():
    """Print aggregate statistics computed over a ``describe()`` summary.

    Builds a two-column ``DataFrame`` from a dict of dicts, takes its
    ``describe()`` summary table and prints the ``count``, ``median``,
    ``mode``, ``std``, ``var`` and ``skew`` of that table, each under a
    labelled heading.

    NOTE(review): every statistic below is taken over the eight summary rows
    produced by ``describe()`` (count, mean, std, min, 25%, 50%, 75%, max),
    not over the raw values in ``df2`` -- which is why ``count()`` reports 8
    per column. If statistics of the underlying data were intended, the calls
    should be made on ``df2`` directly; confirm the intent before changing.

    :return: None
    """
    dic1 = {'000': {'a': 1, 'b': 2, 'c': 3}, '001': {'d': 4, 'e': 5, 'f': 6}}
    df2 = pd.DataFrame(dic1)
    # The summary does not change between calls, so compute it once instead
    # of re-running describe() for every statistic. Its content, as recorded
    # by the original author:
    #          000  001
    #   count  3.0  3.0
    #   mean   2.0  5.0
    #   std    1.0  1.0
    #   min    1.0  4.0
    #   25%    1.5  4.5
    #   50%    2.0  5.0
    #   75%    2.5  5.5
    #   max    3.0  6.0
    summary = df2.describe()
    # Parenthesised single-argument prints are valid both as Python 2 print
    # statements and as Python 3 print function calls.
    print("返回非NAN數據項數量=>count()\n{count}\n".format(count=summary.count()))
    print("返回中位數,等價第50位百分位數的值=>median()\n{median}\n".format(median=summary.median()))
    print("返回數據的衆值=>mode()\n{mode}\n".format(mode=summary.mode()))
    print("返回數據的標準差(描述離散度)=>std()\n{std}\n".format(std=summary.std()))
    print("返回方差=>var()\n{var}\n".format(var=summary.var()))
    print("偏態係數(skewness,表示數據分佈的對稱程度)=>skew()\n{skew}\n".format(skew=summary.skew()))

def main():
    """Script entry point: run the ``example1`` demonstration."""
    example1()


# Only run the demonstration when executed as a script, not on import.
if __name__ == "__main__":
    main()

輸出=>

返回非NAN數據項數量=>count()
000    8
001    8
dtype: int64
返回中位數,等價第50位百分位數的值=>median()
000    2.00
001    4.75
dtype: float64
返回數據的衆值=>mode()
   000  001
0  1.0  5.0
1  2.0  NaN
2  3.0  NaN
返回數據的標準差(描述離散度)=>std()
000    0.801784
001    1.603567
dtype: float64
返回方差=>var()
000    0.642857
001    2.571429
dtype: float64
偏態係數(skewness,表示數據分佈的對稱程度)=>skew()
000    0.000000
001   -1.299187
dtype: float64