pandas再次學習

時間 2019-12-10

標籤 pandas 再次學習简体版

原文原文鏈接

參考資料：numpy、scipy官方文檔；pandas官方網站；matplotlib官方文檔

1、數據結構

2、數據處理

一、數據獲取（excel文件數據基本信息）

#coding=utf-8
import pandas as pd
import numpy as np

excel_data = pd.read_excel("test.xlsx")
# Every value is printed with a single-argument print(...), which behaves the
# same as the Python 2 print statement and as the Python 3 print function.
print(excel_data.shape)            # (number of rows, number of columns)
print(excel_data.index)            # row index of the DataFrame
print(excel_data.columns)          # column labels
# info is a method, so it has to be called; it writes its summary (columns,
# non-null counts, dtypes) itself. Printing the uncalled attribute only shows
# "<bound method DataFrame.info of ...>".
excel_data.info()
print(excel_data.dtypes)           # dtype of every column

輸出：

'''
   name  age       time adress  home
0   cat  2.0 1900-01-01      a   NaN
1   dog  3.0 1900-01-02      b   NaN
2   pig  4.0 1900-01-03      c   NaN
3  bird  5.0        NaT      d   NaN
4   NaN  6.0 1900-01-02      e   NaN
5   pig  7.0 1900-01-03    NaN   NaN
6  bird  NaN        NaT    NaN   NaN
'''

excel_data

'''
(7, 5)
'''

excel_data.shape

'''
RangeIndex(start=0, stop=7, step=1)
'''

excel_data.index

'''
Index([u'name', u'age', u'time', u'adress', u'home'], dtype='object')
'''

excel_data.columns

'''
<bound method DataFrame.info of    name  age       time adress  home
0   cat  2.0 1900-01-01      a   NaN
1   dog  3.0 1900-01-02      b   NaN
2   pig  4.0 1900-01-03      c   NaN
3  bird  5.0        NaT      d   NaN
4   NaN  6.0 1900-01-02      e   NaN
5   pig  7.0 1900-01-03    NaN   NaN
6  bird  NaN        NaT    NaN   NaN>
'''

excel_data.info

'''
name              object
age              float64
time      datetime64[ns]
adress            object
home             float64
dtype: object
'''

excel_data.dtypes

#Help on function read_excel in module pandas.io.excel:

read_excel(*args, **kwargs)
    Read an Excel table into a pandas DataFrame
    
    Parameters
    ----------
    io : string, path object (pathlib.Path or py._path.local.LocalPath),
        file-like object, pandas ExcelFile, or xlrd workbook.
        The string could be a URL. Valid URL schemes include http, ftp, s3,
        and file. For file URLs, a host is expected. For instance, a local
        file could be file://localhost/path/to/workbook.xlsx
    sheet_name : string, int, mixed list of strings/ints, or None, default 0
    
        Strings are used for sheet names, Integers are used in zero-indexed
        sheet positions.
    
        Lists of strings/integers are used to request multiple sheets.
    
        Specify None to get all sheets.
    
        str|int -> DataFrame is returned.
        list|None -> Dict of DataFrames is returned, with keys representing
        sheets.
    
        Available Cases
    
        * Defaults to 0 -> 1st sheet as a DataFrame
        * 1 -> 2nd sheet as a DataFrame
        * "Sheet1" -> 1st sheet as a DataFrame
        * [0,1,"Sheet5"] -> 1st, 2nd & 5th sheet as a dictionary of DataFrames
        * None -> All sheets as a dictionary of DataFrames
    
    sheetname : string, int, mixed list of strings/ints, or None, default 0
    
        .. deprecated:: 0.21.0
           Use `sheet_name` instead
    
    header : int, list of ints, default 0
        Row (0-indexed) to use for the column labels of the parsed
        DataFrame. If a list of integers is passed those row positions will
        be combined into a ``MultiIndex``. Use None if there is no header.
    names : array-like, default None
        List of column names to use. If file contains no header row,
        then you should explicitly pass header=None
    index_col : int, list of ints, default None
        Column (0-indexed) to use as the row labels of the DataFrame.
        Pass None if there is no such column.  If a list is passed,
        those columns will be combined into a ``MultiIndex``.  If a
        subset of data is selected with ``usecols``, index_col
        is based on the subset.
    parse_cols : int or list, default None
    
        .. deprecated:: 0.21.0
           Pass in `usecols` instead.
    
    usecols : int or list, default None
        * If None then parse all columns,
        * If int then indicates last column to be parsed
        * If list of ints then indicates list of column numbers to be parsed
        * If string then indicates comma separated list of Excel column letters and
          column ranges (e.g. "A:E" or "A,C,E:F").  Ranges are inclusive of
          both sides.
    squeeze : boolean, default False
        If the parsed data only contains one column then return a Series
    dtype : Type name or dict of column -> type, default None
        Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
        Use `object` to preserve data as stored in Excel and not interpret dtype.
        If converters are specified, they will be applied INSTEAD
        of dtype conversion.
    
        .. versionadded:: 0.20.0
    
    engine: string, default None
        If io is not a buffer or path, this must be set to identify io.
        Acceptable values are None or xlrd
    converters : dict, default None
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels, values are functions that take one
        input argument, the Excel cell content, and return the transformed
        content.
    true_values : list, default None
        Values to consider as True
    
        .. versionadded:: 0.19.0
    
    false_values : list, default None
        Values to consider as False
    
        .. versionadded:: 0.19.0
    
    skiprows : list-like
        Rows to skip at the beginning (0-indexed)
    nrows : int, default None
        Number of rows to parse
    
        .. versionadded:: 0.23.0
    
    na_values : scalar, str, list-like, or dict, default None
        Additional strings to recognize as NA/NaN. If dict passed, specific
        per-column NA values. By default the following values are interpreted
        as NaN: '', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan',
        '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', 'n/a', 'nan',
        'null'.
    keep_default_na : bool, default True
        If na_values are specified and keep_default_na is False the default NaN
        values are overridden, otherwise they're appended to.
    verbose : boolean, default False
        Indicate number of NA values placed in non-numeric columns
    thousands : str, default None
        Thousands separator for parsing string columns to numeric.  Note that
        this parameter is only necessary for columns stored as TEXT in Excel,
        any numeric columns will automatically be parsed, regardless of display
        format.
    comment : str, default None
        Comments out remainder of line. Pass a character or characters to this
        argument to indicate comments in the input file. Any data between the
        comment string and the end of the current line is ignored.
    skip_footer : int, default 0
    
        .. deprecated:: 0.23.0
           Pass in `skipfooter` instead.
    skipfooter : int, default 0
        Rows at the end to skip (0-indexed)
    convert_float : boolean, default True
        convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric
        data will be read in as floats: Excel stores all numbers as floats
        internally
    
    Returns
    -------
    parsed : DataFrame or Dict of DataFrames
        DataFrame from the passed in Excel file.  See notes in sheet_name
        argument for more information on when a Dict of Dataframes is returned.

read_excel參數解析

獲取行
excel_data.head(5)                   #顯示數據的前5行
excel_data.tail(5)                      #顯示數據的後5行
excel_data.loc[0]                       #獲取第一行的數據
excel_data.loc[2:4]                    #返回行標籤2到4的數據（loc按標籤切片，包含首尾兩端），即第3行到第5行
excel_data.loc[[2,5,10]]             #返回行標號爲2，5，10三行數據，注意必須是由列表包含起來的數據。若某個標號不存在（本例數據只有0~6行，10不存在），新版pandas會拋出KeyError。
excel_data.iloc[0]                       #獲取第一行

獲取列 
excel_data["name"]                      #返回這一列("name")的數據
excel_data[["name","age"]]          #返回列名爲name和 age的兩列數據
excel_data["name"].unique()         #顯示數據name列的全部惟一值, 有0值是由於對數據缺失值進行了填充

獲取某行某列
excel_data.head(5)["name"]                 #獲取前5行的name列
excel_data.head(5)["name"][0]             #獲取前5行的name列的元素值
excel_data.at[1,"age"]                          #表示取第二行"age"列的數據
excel_data.loc[0]["name"]                     #獲取第一行且列名爲name的數據
excel_data.loc[:,"age"]                          #獲取age的那一列,這個冒號的意思是全部行，逗號表示行與列的區分
excel_data.loc[:,["age","time"]]             #獲取全部行的age列和time列的數據
excel_data.loc[1,["age","time"]]             #獲取第二行的age和time列的數據
excel_data.iloc[0:2,0:2]                          #獲取前兩行前兩列的數據
excel_data.iloc[[1,2,4],[0,2]]                   #獲取位置爲1，2，4的行（即第2，3，5行）中位置爲0，2的列（即第1，3列）的數據


獲取空值
excel_data.notnull()                    #excel_data的非空值爲True
excel_data.isnull()                      #isnull是pandas中檢驗空值的方法，返回的結果是邏輯值，空值的位置返回True，非空值的位置返回False。能夠對整個數據表進行檢查，也能夠單獨對某一列進行空值檢查。

行列數據獲取

二、數據清洗轉換

1）增

2）刪

a、刪除無效行、列（整行、列都是空白，且說明無效的行、列）

b、刪除指定行、列

Help on method drop in module pandas.core.frame:

drop(self, labels=None, axis=0, index=None, columns=None, level=None, inplace=False, errors='raise') method of pandas.core.frame.DataFrame instance
    Drop specified labels from rows or columns.
    
    Remove rows or columns by specifying label names and corresponding
    axis, or by specifying directly index or column names. When using a
    multi-index, labels on different levels can be removed by specifying
    the level.
    
    Parameters
    ----------
    labels : single label or list-like
        Index or column labels to drop.
    axis : {0 or 'index', 1 or 'columns'}, default 0
        Whether to drop labels from the index (0 or 'index') or
        columns (1 or 'columns').
    index, columns : single label or list-like
        Alternative to specifying axis (``labels, axis=1``
        is equivalent to ``columns=labels``).
    
        .. versionadded:: 0.21.0
    level : int or level name, optional
        For MultiIndex, level from which the labels will be removed.
    inplace : bool, default False
        If True, do operation inplace and return None.
    errors : {'ignore', 'raise'}, default 'raise'
        If 'ignore', suppress error and only existing labels are
        dropped.

excel_data.drop

#Help on method dropna in module pandas.core.frame:

dropna(self, axis=0, how='any', thresh=None, subset=None, inplace=False) method of pandas.core.frame.DataFrame instance
    Remove missing values.
    
    See the :ref:`User Guide <missing_data>` for more on which values are
    considered missing, and how to work with missing data.
    
    Parameters
    ----------
    axis : {0 or 'index', 1 or 'columns'}, default 0
        Determine if rows or columns which contain missing values are
        removed.
    
        * 0, or 'index' : Drop rows which contain missing values.
        * 1, or 'columns' : Drop columns which contain missing value.
    
        .. deprecated:: 0.23.0: Pass tuple or list to drop on multiple
        axes.
    how : {'any', 'all'}, default 'any'
        Determine if row or column is removed from DataFrame, when we have
        at least one NA or all NA.
    
        * 'any' : If any NA values are present, drop that row or column.
        * 'all' : If all values are NA, drop that row or column.
    thresh : int, optional
        Require that many non-NA values.
    subset : array-like, optional
        Labels along other axis to consider, e.g. if you are dropping rows
        these would be a list of columns to include.
    inplace : bool, default False
        If True, do operation inplace and return None.

excel_data.dropna

3）改

#Help on method fillna in module pandas.core.frame:

fillna(self, value=None, method=None, axis=None, inplace=False, limit=None, downcast=None, **kwargs) method of pandas.core.frame.DataFrame instance
    Fill NA/NaN values using the specified method
    
    Parameters
    ----------
    value : scalar, dict, Series, or DataFrame
        Value to use to fill holes (e.g. 0), alternately a
        dict/Series/DataFrame of values specifying which value to use for
        each index (for a Series) or column (for a DataFrame). (values not
        in the dict/Series/DataFrame will not be filled). This value cannot
        be a list.
    method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
        Method to use for filling holes in reindexed Series
        pad / ffill: propagate last valid observation forward to next valid
        backfill / bfill: use NEXT valid observation to fill gap
    axis : {0 or 'index', 1 or 'columns'}
    inplace : boolean, default False
        If True, fill in place. Note: this will modify any
        other views on this object, (e.g. a no-copy slice for a column in a
        DataFrame).
    limit : int, default None
        If method is specified, this is the maximum number of consecutive
        NaN values to forward/backward fill. In other words, if there is
        a gap with more than this number of consecutive NaNs, it will only
        be partially filled. If method is not specified, this is the
        maximum number of entries along the entire axis where NaNs will be
        filled. Must be greater than 0 if not None.
    downcast : dict, default is None
        a dict of item->dtype of what to downcast if possible,
        or the string 'infer' which will try to downcast to an appropriate
        equal type (e.g. float64 to int64 if possible)