DataFrame
DataFrame是一個表格型的數據結構,含有一組有序的列,是一個二維結構。
DataFrame能夠被看作是由Series組成的字典,而且共用一個索引。
import numpy as np import pandas as pd
a=pd.DataFrame({'one':pd.Series([1,2,3],index=['a','b','c']), 'two':pd.Series([1,2,3,4],index=['b','a','c','d'])}) a
能夠看出 由one和two兩個Series組成,而且共用一組索引a,b,c,d
# 字典方式建立 b=pd.DataFrame({"today":[12,43,23,123],"tomory":[23,45,34,23]}) b
# 自定義索引 c=pd.DataFrame({"today":[12,43,23,123],"tomory":[23,45,34,23]},index=list("abcd")) c
df = pd.read_csv("d:/601318.csv") df
2470 rows × 8 columns
x=open("d:/601318.csv") df=pd.read_csv(x) df
2470 rows × 8 columns
1 # 保存到文件 2 df.to_csv("d:/new.csv") 3 4 5 # index 獲取行索引 6 df.index 7 8 RangeIndex(start=0, stop=2470, step=1) 9 10 a.index 11 12 Index(['a', 'b', 'c', 'd'], dtype='object') 13 14 15 # 返回列索引 16 df.columns 17 18 Index(['id', 'date', 'open', 'close', 'high', 'low', 'volume', 'code'], dtype='object') 19 20 21 # values 返回二維數組 22 df.values 23 24 array([ 25 [0, '2007/3/1', 22.074, ..., 20.22, 1977633.51, 601318], 26 [1, '2007/3/2', 20.75, ..., 20.256, 425048.32, 601318], 27 [2, '2007/3/5', 20.3, ..., 19.218, 419196.74, 601318], 28 ..., 29 [2467, '2017/7/28', 52.2, ..., 51.8, 491294.0, 601318], 30 [2468, '2017/7/31', 51.88, ..., 51.41, 616005.0, 601318], 31 [2469, '2017/8/1', 52.2, ..., 52.2, 1147936.0, 601318] 32 ], 33 dtype=object) 34 35 36 # 倒置 行和列交換 37 38 a.T
# describe 按列打印一些統計信息 df.describe()
# df 的columns 和index都有name屬性 # 上面的數據中的index的name尚未值,能夠設置一個 df.index.name='indexname' df
2470 rows × 8 columns
#獲取第一列的name df.columns[0] 'id' df.columns[1] 'date' # 給列重命名,並無修改原數據,下面是返回的數據 df.rename(columns={"close":"newclose","low":"newlow"})
2470 rows × 8 columns
df[0]
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) d:\program files (x86)\python35\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance) 2441 try: -> 2442 return self._engine.get_loc(key) 2443 except KeyError: pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5280)() pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5126)() pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas\_libs\hashtable.c:20523)() pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas\_libs\hashtable.c:20477)() KeyError: 0 During handling of the above exception, another exception occurred: KeyError Traceback (most recent call last) <ipython-input-18-9ae93f22b889> in <module>() ----> 1 df[0] d:\program files (x86)\python35\lib\site-packages\pandas\core\frame.py in __getitem__(self, key) 1962 return self._getitem_multilevel(key) 1963 else: -> 1964 return self._getitem_column(key) 1965 1966 def _getitem_column(self, key): d:\program files (x86)\python35\lib\site-packages\pandas\core\frame.py in _getitem_column(self, key) 1969 # get column 1970 if self.columns.is_unique: -> 1971 return self._get_item_cache(key) 1972 1973 # duplicate columns & possible reduce dimensionality d:\program files (x86)\python35\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item) 1643 res = cache.get(item) 1644 if res is None: -> 1645 values = self._data.get(item) 1646 res = self._box_item_values(item, values) 1647 cache[item] = res d:\program files (x86)\python35\lib\site-packages\pandas\core\internals.py in get(self, item, fastpath) 3588 3589 if not isnull(item): -> 3590 loc = self.items.get_loc(item) 3591 else: 3592 indexer = np.arange(len(self.items))[isnull(self.items)] d:\program files 
(x86)\python35\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance) 2442 return self._engine.get_loc(key) 2443 except KeyError: -> 2444 return self._engine.get_loc(self._maybe_cast_indexer(key)) 2445 2446 indexer = self.get_indexer([key], method=method, tolerance=tolerance) pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5280)() pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5126)() pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas\_libs\hashtable.c:20523)() pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas\_libs\hashtable.c:20477)() KeyError: 0
df["close"]
indexname 0 20.657 1 20.489 2 19.593 3 19.977 4 20.520 5 20.273 6 20.101 7 19.739 8 19.818 9 19.841 10 19.849 11 19.960 12 20.211 13 19.911 14 20.026 15 19.938 16 20.282 17 20.269 18 20.565 19 20.927 20 20.772 21 21.364 22 21.284 23 21.099 24 21.156 25 21.196 26 22.785 27 23.319 28 23.637 29 23.593 ... 2440 48.896 2441 48.609 2442 49.183 2443 49.183 2444 49.381 2445 48.085 2446 49.420 2447 49.074 2448 48.411 2449 47.403 2450 49.876 2451 50.835 2452 50.459 2453 50.578 2454 51.230 2455 50.610 2456 51.630 2457 52.770 2458 53.900 2459 53.470 2460 53.840 2461 54.010 2462 51.960 2463 52.610 2464 52.310 2465 51.890 2466 52.360 2467 51.890 2468 52.020 2469 54.850 Name: close, Length: 2470, dtype: float64
從上邊能夠看出,[]裏邊彷佛要用來選擇列才能夠(後面知道,切片也能夠)
# 花式索引 df[["close","low"]]
2470 rows × 2 columns
df["close"][0] 20.656999999999996
df["close"] 先獲得一個Series,而後 再用標籤索引0去查找
df[["close","low"]][0]
1 --------------------------------------------------------------------------- 2 3 KeyError Traceback (most recent call last) 4 5 d:\program files (x86)\python35\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance) 6 2441 try: 7 -> 2442 return self._engine.get_loc(key) 8 2443 except KeyError: 9 10 11 pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5280)() 12 13 14 pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5126)() 15 16 17 pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas\_libs\hashtable.c:20523)() 18 19 20 pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas\_libs\hashtable.c:20477)() 21 22 23 KeyError: 0 24 25 26 During handling of the above exception, another exception occurred: 27 28 29 KeyError Traceback (most recent call last) 30 31 <ipython-input-22-7ed9e36ec1ab> in <module>() 32 ----> 1 df[["close","low"]][0] 33 34 35 d:\program files (x86)\python35\lib\site-packages\pandas\core\frame.py in __getitem__(self, key) 36 1962 return self._getitem_multilevel(key) 37 1963 else: 38 -> 1964 return self._getitem_column(key) 39 1965 40 1966 def _getitem_column(self, key): 41 42 43 d:\program files (x86)\python35\lib\site-packages\pandas\core\frame.py in _getitem_column(self, key) 44 1969 # get column 45 1970 if self.columns.is_unique: 46 -> 1971 return self._get_item_cache(key) 47 1972 48 1973 # duplicate columns & possible reduce dimensionality 49 50 51 d:\program files (x86)\python35\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item) 52 1643 res = cache.get(item) 53 1644 if res is None: 54 -> 1645 values = self._data.get(item) 55 1646 res = self._box_item_values(item, values) 56 1647 cache[item] = res 57 58 59 d:\program files (x86)\python35\lib\site-packages\pandas\core\internals.py in get(self, item, fastpath) 60 3588 61 3589 if not 
isnull(item): 62 -> 3590 loc = self.items.get_loc(item) 63 3591 else: 64 3592 indexer = np.arange(len(self.items))[isnull(self.items)] 65 66 67 d:\program files (x86)\python35\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance) 68 2442 return self._engine.get_loc(key) 69 2443 except KeyError: 70 -> 2444 return self._engine.get_loc(self._maybe_cast_indexer(key)) 71 2445 72 2446 indexer = self.get_indexer([key], method=method, tolerance=tolerance) 73 74 75 pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5280)() 76 77 78 pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5126)() 79 80 81 pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas\_libs\hashtable.c:20523)() 82 83 84 pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas\_libs\hashtable.c:20477)() 85 86 87 KeyError: 0
之因此報錯,是由於df[["close","low"]]獲得的是一個DataFrame類型,它再加[],[]裏邊只能是列
# 切片,這個時候解釋的就是行 df[0:10]
推薦使用loc和iloc索引
# 在loc裏邊,逗號左邊表示行,右邊表示列 # 在這裏的0:10被解釋爲標籤(不是行的下標) ddf=df.loc[3:10,["close","low"]] ddf
# 那我如今想拿到ddf裏,"low"列,第5行的數據 # ddf["low"]獲得的是一個Series,其索引是整數的,因此必須使用iloc指明使用下標取值 ddf["low"].iloc[4] 19.646000000000001
布爾值索引
# 過濾某一列 df[df["close"]<20]
856 rows × 8 columns
# 過濾全部的位置 # dataframe會將全部不知足條件(即不小於20)的位置設置爲nan(由於其不能肯定該怎麼捨棄數據,不可能由於一行中一個nan就刪除整個一行或者一列) df[df<20]
2470 rows × 8 columns
# 將全部小於20的值改成0 # 請注意這裏,會將爲False的位置改成0,因此咱們要寫大於20,這樣的話小於20的纔是False df[df>20].fillna(0)
2470 rows × 8 columns
# 選擇date 爲2017/7/25 和2017/7/3 的值 # 這裏的date是字符串類型,不是datetime類型 df[(df["date"]=="2017/7/25") | (df["date"]=="2017/7/3")]
# 這裏還能夠用isin方法去過濾一個範圍 df[df["date"].isin(["2017/7/25","2017/7/3"])]
df[df["high"].isin([53.050,54.150])]
修改值的時候要注意類型的問題
# 好比要將全部小於20的位置變爲0 # 作法一: df[df>20].fillna(0) # 作法二:等號賦值 df[df<20]=0
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-45-ea838d192259> in <module>() 5 6 # 作大二:等號賦值 ----> 7 df[df<20]=0 d:\program files (x86)\python35\lib\site-packages\pandas\core\frame.py in __setitem__(self, key, value) 2326 self._setitem_array(key, value) 2327 elif isinstance(key, DataFrame): -> 2328 self._setitem_frame(key, value) 2329 else: 2330 # set column d:\program files (x86)\python35\lib\site-packages\pandas\core\frame.py in _setitem_frame(self, key, value) 2362 raise TypeError('Must pass DataFrame with boolean values only') 2363 -> 2364 self._check_inplace_setting(value) 2365 self._check_setitem_copy() 2366 self._where(-key, value, inplace=True) d:\program files (x86)\python35\lib\site-packages\pandas\core\generic.py in _check_inplace_setting(self, value) 3197 pass 3198 -> 3199 raise TypeError('Cannot do inplace boolean setting on ' 3200 'mixed-types with a non np.nan value') 3201 TypeError: Cannot do inplace boolean setting on mixed-types with a non np.nan value
報錯的緣由是由於,date這列是字符串類型,設置爲0,類型轉換失敗
# 如今經過切片,去掉date列,看可否轉換成功 df2=df.loc[:10,"open":"code"] df2
df2[df2<20]=0
df2
能夠看出,若是列裏邊沒有字符串類型,是能夠轉換成功的
df3=df + df2
df3
2470 rows × 8 columns
新的數據,列和行都要對齊,列date和id都是nan,是由於df2中沒有這兩列,這些其實跟Series的道理是同樣的
處理缺失數據的相關方法:
跟Series的方法是同樣的
df3.dropna()
在這裏,dropna默認的規則,只要行裏有nan,就會清除掉整行,可是能夠設置參數去改變
df3.dropna(how="any") ---->默認是any,只要有nan就刪除;how='all'的話,就是行裏全是nan才刪除
那若是我想對列進行操做,就還須要另外一個參數,要記住默認的規則是對行的
df3.dropna(how="any",axis=0)--->axis默認等於0,表示是對行進行規則,axis=1的話,就表示對列進行規則
# 將位置是nan的地方替換爲0 df3.fillna(0)
2470 rows × 8 columns
mean 得出每一個列的平均值
df2.mean()
open 11.258000 close 9.276364 high 15.107000 low 5.513000 volume 388403.913636 code 601318.000000 dtype: float64
# 單列的平均值(Series) df2["close"].mean() 9.2763636363636355
sum 求出每列的和
字符串的話,就是字符串的拼接
df.sum()
id 3049215 date 2007/3/12007/3/22007/3/52007/3/62007/3/72007/3... open 63999.2 close 64054.2 high 65113.7 low 63035.4 volume 1.18105e+09 code 1485255460 dtype: object
sort 排序
sort_index 按照索引排序(行索引和列索引)
ascending默認爲True ,表示按照升序排序;False表示降序
axis爲0 ,表明按行索引;1表明用列索引 - sort_index(ascending=False,axis=0) - sort_index(ascending=False,axis=1)
# ascending默認爲True ,表示按照升序排序;False表示降序 df.sort_index(ascending=False)
2470 rows × 8 columns
# ascending默認爲True ,表示按照升序排序;False表示降序 df.sort_index(ascending=False)
2470 rows × 8 columns
sort_values 按照值排序
# 按照close列升序排序 df2.sort_values("close")
# 按照close列降序 df2.sort_values("close",ascending=False)
1 # 按照close列升序排序,若是有close值相同,再按照low列排序 2 3 df2.sort_values(["close","low"])
# axis=1,按照行排序,在這裏必定要注意,必須保證這一行的數據類型是一致的,好比df中有字符串類型,就會報錯 # df2 行類的數據類型都是一致的是沒有問題的,第一個參數是說按照行的索引號,df中,0和1的結果就不同 df2.sort_values(0,axis=1)
df2.sort_values(1,axis=1)
numpy的通用函數同樣適用於pandas
# 請注意類型 df.abs()
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-98-db394c0c0cf4> in <module>() 1 # 請主要類型 2 ----> 3 df.abs() d:\program files (x86)\python35\lib\site-packages\pandas\core\generic.py in abs(self) 5661 abs: type of caller 5662 """ -> 5663 return np.abs(self) 5664 5665 def describe(self, percentiles=None, include=None, exclude=None): TypeError: bad operand type for abs(): 'str'
df2.abs()
applymap(函數名),做用於DataFrame上,這個函數的應用是針對於df裏的每一個位置去執行
apply(函數名),做用於DataFrame上,將操做應用於整列或者整行上(整行要修改axis=1)
map做用於Series上
import numpy as np import pandas as pd
df=pd.read_csv("d:/601318.csv") df
2470 rows × 8 columns
df2=df.loc[:15,"close":"code"] df2
#df2中每一個位置都是加10 df2.applymap(lambda x:x+10)
# map做用於Series df4=df2["close"] df4.map(lambda x:x+100)
0 120.657
1 120.489
2 119.593
3 119.977
4 120.520
5 120.273
6 120.101
7 119.739
8 119.818
9 119.841
10 119.849
11 119.960
12 120.211
13 119.911
14 120.026
15 119.938
Name: close, dtype: float64
#apply 將操做應用到每一列上 df2.apply(lambda x:x.sum()+1)
close 321.903 high 328.752 low 317.416 volume 5166066.460 code 9621089.000 dtype: float64
#apply 將操做應用到每一行上 df2.apply(lambda x:x.sum()+1,axis=1)
pandas之dataframe(下) 自定義函數 applymap(函數名),做用域DataFrame上,這個的函數的應用是針對於df裏的每一個位置去執行 apply(函數名),做用域DataFrame上,將操做應用於整列或者整行上(整行要修改axis=1) map做用於Series上 import numpy as np import pandas as pd df=pd.read_csv("d:/601318.csv") df id date open close high low volume code 0 0 2007/3/1 22.074 20.657 22.503 20.220 1977633.51 601318 1 1 2007/3/2 20.750 20.489 20.944 20.256 425048.32 601318 2 2 2007/3/5 20.300 19.593 20.384 19.218 419196.74 601318 3 3 2007/3/6 19.426 19.977 20.308 19.315 297727.88 601318 4 4 2007/3/7 19.995 20.520 20.706 19.827 287463.78 601318 5 5 2007/3/8 20.353 20.273 20.454 20.167 130983.83 601318 6 6 2007/3/9 20.264 20.101 20.353 19.735 160887.79 601318 7 7 2007/3/12 19.999 19.739 19.999 19.646 145353.06 601318 8 8 2007/3/13 19.783 19.818 19.982 19.699 102319.68 601318 9 9 2007/3/14 19.558 19.841 19.911 19.333 173306.56 601318 10 10 2007/3/15 20.097 19.849 20.525 19.779 152521.90 601318 11 11 2007/3/16 19.863 19.960 20.286 19.602 227547.24 601318 12 12 2007/3/20 20.662 20.211 20.715 20.088 222026.87 601318 13 13 2007/3/21 20.220 19.911 20.308 19.823 136728.32 601318 14 14 2007/3/22 20.066 20.026 20.273 19.969 167509.84 601318 15 15 2007/3/23 20.017 19.938 20.101 19.739 139810.14 601318 16 16 2007/3/26 19.955 20.282 20.397 19.946 223266.79 601318 17 17 2007/3/27 20.216 20.269 20.467 20.145 139338.19 601318 18 18 2007/3/28 20.264 20.565 20.706 20.123 258263.69 601318 19 19 2007/3/29 20.666 20.927 21.540 20.520 461986.18 601318 20 20 2007/3/30 20.732 20.772 21.134 20.626 144617.20 601318 21 21 2007/4/2 20.772 21.364 21.501 20.772 231445.03 601318 22 22 2007/4/3 21.377 21.284 21.527 21.147 132712.04 601318 23 23 2007/4/4 21.289 21.099 21.412 20.993 122454.69 601318 24 24 2007/4/5 21.103 21.156 21.191 20.838 122865.38 601318 25 25 2007/4/6 21.050 21.196 21.611 20.971 195208.52 601318 26 26 2007/4/9 21.231 22.785 22.909 21.059 462770.21 601318 27 27 2007/4/10 22.516 23.319 23.699 22.516 407823.90 601318 28 28 2007/4/11 23.346 23.637 24.361 23.222 243446.50 601318 29 
29 2007/4/12 23.832 23.593 25.606 23.377 159270.43 601318 ... ... ... ... ... ... ... ... ... 2440 2440 2017/6/21 47.778 48.896 49.025 47.046 849757.00 601318 2441 2441 2017/6/22 48.669 48.609 49.925 48.520 1146464.00 601318 2442 2442 2017/6/23 48.708 49.183 49.361 48.263 873719.00 601318 2443 2443 2017/6/26 49.450 49.183 50.222 48.817 953192.00 601318 2444 2444 2017/6/27 49.163 49.381 49.411 48.402 780835.00 601318 2445 2445 2017/6/28 49.163 48.085 49.203 48.026 691322.00 601318 2446 2446 2017/6/29 48.273 49.420 49.510 47.858 753228.00 601318 2447 2447 2017/6/30 49.262 49.074 49.658 48.748 598630.00 601318 2448 2448 2017/7/3 49.262 48.411 49.262 48.026 563199.00 601318 2449 2449 2017/7/4 48.273 47.403 48.313 47.393 683920.00 601318 2450 2450 2017/7/5 47.482 49.876 50.152 47.482 1272537.00 601318 2451 2451 2017/7/6 49.876 50.835 51.438 49.529 1137814.00 601318 2452 2452 2017/7/7 50.598 50.459 51.063 49.984 533925.00 601318 2453 2453 2017/7/10 50.469 50.578 51.399 50.143 570776.00 601318 2454 2454 2017/7/11 50.810 51.230 52.010 50.610 699539.00 601318 2455 2455 2017/7/12 51.360 50.610 52.500 50.420 870117.00 601318 2456 2456 2017/7/13 50.980 51.630 51.860 50.830 665342.00 601318 2457 2457 2017/7/14 51.690 52.770 52.790 51.300 707791.00 601318 2458 2458 2017/7/17 53.010 53.900 55.090 52.420 1408791.00 601318 2459 2459 2017/7/18 53.600 53.470 54.260 52.510 879029.00 601318 2460 2460 2017/7/19 53.680 53.840 54.480 53.110 771180.00 601318 2461 2461 2017/7/20 53.550 54.010 54.150 52.820 659198.00 601318 2462 2462 2017/7/21 53.200 51.960 53.280 51.900 1294791.00 601318 2463 2463 2017/7/24 52.080 52.610 53.100 51.680 904595.00 601318 2464 2464 2017/7/25 52.620 52.310 53.050 52.180 506834.00 601318 2465 2465 2017/7/26 52.100 51.890 52.500 51.280 657610.00 601318 2466 2466 2017/7/27 51.850 52.360 52.740 51.090 667132.00 601318 2467 2467 2017/7/28 52.200 51.890 52.460 51.800 491294.00 601318 2468 2468 2017/7/31 51.880 52.020 52.640 51.410 616005.00 601318 2469 2469 2017/8/1 
52.200 54.850 54.900 52.200 1147936.00 601318 2470 rows × 8 columns df2=df.loc[:15,"close":"code"] df2 close high low volume code 0 20.657 22.503 20.220 1977633.51 601318 1 20.489 20.944 20.256 425048.32 601318 2 19.593 20.384 19.218 419196.74 601318 3 19.977 20.308 19.315 297727.88 601318 4 20.520 20.706 19.827 287463.78 601318 5 20.273 20.454 20.167 130983.83 601318 6 20.101 20.353 19.735 160887.79 601318 7 19.739 19.999 19.646 145353.06 601318 8 19.818 19.982 19.699 102319.68 601318 9 19.841 19.911 19.333 173306.56 601318 10 19.849 20.525 19.779 152521.90 601318 11 19.960 20.286 19.602 227547.24 601318 12 20.211 20.715 20.088 222026.87 601318 13 19.911 20.308 19.823 136728.32 601318 14 20.026 20.273 19.969 167509.84 601318 15 19.938 20.101 19.739 139810.14 601318 #df2中每一個位置都是加10 df2.applymap(lambda x:x+10) close high low volume code 0 30.657 32.503 30.220 1977643.51 601328 1 30.489 30.944 30.256 425058.32 601328 2 29.593 30.384 29.218 419206.74 601328 3 29.977 30.308 29.315 297737.88 601328 4 30.520 30.706 29.827 287473.78 601328 5 30.273 30.454 30.167 130993.83 601328 6 30.101 30.353 29.735 160897.79 601328 7 29.739 29.999 29.646 145363.06 601328 8 29.818 29.982 29.699 102329.68 601328 9 29.841 29.911 29.333 173316.56 601328 10 29.849 30.525 29.779 152531.90 601328 11 29.960 30.286 29.602 227557.24 601328 12 30.211 30.715 30.088 222036.87 601328 13 29.911 30.308 29.823 136738.32 601328 14 30.026 30.273 29.969 167519.84 601328 15 29.938 30.101 29.739 139820.14 601328 # map做用域Series df4=df2["close"] df4.map(lambda x:x+100) 0 120.657 1 120.489 2 119.593 3 119.977 4 120.520 5 120.273 6 120.101 7 119.739 8 119.818 9 119.841 10 119.849 11 119.960 12 120.211 13 119.911 14 120.026 15 119.938 Name: close, dtype: float64 #apply 將操做應用到每一列上 df2.apply(lambda x:x.sum()+1) close 321.903 high 328.752 low 317.416 volume 5166066.460 code 9621089.000 dtype: float64 #apply 將操做應用到每一行上 df2.apply(lambda x:x.sum()+1,axis=1) 0 2579015.890 1 1026429.009 2 1020574.935 3 899106.480 4 
888843.833 5 732363.724 6 762266.979 7 746731.444 8 703698.179 9 774684.645 10 753901.053 11 828926.088 12 823406.884 13 738107.362 14 768889.108 15 741188.918 dtype: float64
# 層次索引 內容更新中....
# 從文件讀取 - read_csv:默認分隔符是逗號 - read_table:默認分隔符是/t(tab鍵) 參數: - sep 指定分隔符 - header=None 指定文件無列名 - names 指定列名 - index_col 指定某列做爲索引 - skiprows 指定跳過哪一行 - na_values 指定某些字符串爲缺失值 - parse_dates 指定某些列是否被解析爲日期,布爾值或列表 - nrows 指定讀取幾行文件 - chunksize 分塊讀取文件,指定塊大小
# read_table 默認是以/t(tab)爲分割 pd.read_table("d:/new.csv")
pd.read_table("d:/new.csv",sep=",")
sep 還能夠是正則表達式,好比 sep="\s+",表示任意長度的空白字符
# 在讀取數據的時候,會默認將第一列指定爲列名,能夠經過修改header=None,指定第一行不是列名
pd.read_table("d:/new.csv",sep=",",header=None)
當設置header=None時,會自動取一個列名0,1,2,3,4,5,6,7
# 若是想本身取一個列名,能夠修改names pd.read_table("d:/new.csv",sep=",",header=None,names=["id","date","open","close","high","low","volumw","code"])
# 還能夠設置跳過哪些行 #完整的 pd.read_table("d:/new.csv",sep=",")
pd.read_table("d:/new.csv",sep=",",skiprows=[0])
從上邊能夠看出。它跳是從表格的第一行開始,索引爲0(在這裏第一行列名就是索引0的位置)
pd.read_table("d:/new.csv",sep=",",skiprows=[1])
# 在導入的時候,默認會生成行索引,若是咱們想使用某一列做爲行索引,可使用index_col,可使用多列["id","close"] df=pd.read_table("d:/new2.csv",sep=",",index_col=["id"]) df
df.loc[4:7,"close":"low"]
# 通常在實際場景中,咱們常常用用date做爲行索引 df=pd.read_table("d:/new2.csv",sep=",",index_col="date") df
type(df.index[0])
str
# 這裏的date是一個字符串,咱們能夠將這個date轉化爲一個時間類型:設置parse_dates df=pd.read_table("d:/new2.csv",sep=",",index_col="date",parse_dates=["date"]) type(df.index[0]) pandas._libs.tslib.Timestamp
在文件裏若是有nan這個字符(咱們以前講的是內存裏邊nan),如何去識別?
# 設置na_values # 凡是"nan","None","null","xxx"這樣的字符串都解析爲nan,不然整列都被解析爲字符串(記住,是整列,由於一列的數據類型必須一致) df=pd.read_table("d:/new3.csv",sep=",") df
df["id"][0] 'None' type(df["id"].iloc[1]) str df=pd.read_table("d:/new3.csv",sep=",",na_values=["nan","None","null","xxx"]) df
type(df["id"].iloc[1]) numpy.float64
# 寫入到文件 to_csv 主要參數: - sep 指定分隔符 - na_rep 指定缺失值轉換的字符串,默認爲空字符串 - header=False 不輸出第一行的列名 - index=False 不輸出行的索引一列 - columns 輸出指定列
# 默認是行名和列名都輸出,缺失值轉換的字符串轉換爲空 df.to_csv("d:/ceshi.csv",header=False,index=False,na_rep="DD",columns=["close"])
還能夠導出成其它的文件類型:json,xml,Html,數據庫
# 時間序列
# to_datetime 能夠將字符串轉換爲一種特定的時間類型 pd.to_datetime(df["date"])
0 2007-03-01
1 2007-03-02
2 2007-03-05
3 2007-03-06
4 2007-03-07
5 2007-03-08
6 2007-03-12
7 2007-03-13
8 2007-03-14
9 2007-03-15
10 2007-03-16
11 2007-03-20
12 2007-03-21
13 2007-03-22
Name: date, dtype: datetime64[ns]
時間處理對象:date_range
參數: - start 開始時間 - end 結束時間 - periods 時間長度 - freq 時間頻率,默認爲"D",可選H(our),W(eek),B(usiness),M(onth),S(econd),A(year),T
# date_range 產生一組時間 pd.date_range("2017-06-01","2017-07-01")
DatetimeIndex(['2017-06-01', '2017-06-02', '2017-06-03', '2017-06-04', '2017-06-05', '2017-06-06', '2017-06-07', '2017-06-08', '2017-06-09', '2017-06-10', '2017-06-11', '2017-06-12', '2017-06-13', '2017-06-14', '2017-06-15', '2017-06-16', '2017-06-17', '2017-06-18', '2017-06-19', '2017-06-20', '2017-06-21', '2017-06-22', '2017-06-23', '2017-06-24', '2017-06-25', '2017-06-26', '2017-06-27', '2017-06-28', '2017-06-29', '2017-06-30', '2017-07-01'], dtype='datetime64[ns]', freq='D')
# 假如要每一週出一天(默認是每一天出一個) # 這裏是星期日爲標準 pd.date_range("2017-06-01","2017-08-01",freq="W")
DatetimeIndex(['2017-06-04', '2017-06-11', '2017-06-18', '2017-06-25', '2017-07-02', '2017-07-09', '2017-07-16', '2017-07-23', '2017-07-30'], dtype='datetime64[ns]', freq='W-SUN')
# 假如要只出工做日 pd.date_range("2017-06-01","2017-08-01",freq="B")
DatetimeIndex(['2017-06-01', '2017-06-02', '2017-06-05', '2017-06-06', '2017-06-07', '2017-06-08', '2017-06-09', '2017-06-12', '2017-06-13', '2017-06-14', '2017-06-15', '2017-06-16', '2017-06-19', '2017-06-20', '2017-06-21', '2017-06-22', '2017-06-23', '2017-06-26', '2017-06-27', '2017-06-28', '2017-06-29', '2017-06-30', '2017-07-03', '2017-07-04', '2017-07-05', '2017-07-06', '2017-07-07', '2017-07-10', '2017-07-11', '2017-07-12', '2017-07-13', '2017-07-14', '2017-07-17', '2017-07-18', '2017-07-19', '2017-07-20', '2017-07-21', '2017-07-24', '2017-07-25', '2017-07-26', '2017-07-27', '2017-07-28', '2017-07-31', '2017-08-01'], dtype='datetime64[ns]', freq='B')
# 半個月 pd.date_range("2017-06-01","2017-08-01",freq="SM") DatetimeIndex(['2017-06-15', '2017-06-30', '2017-07-15', '2017-07-31'], dtype='datetime64[ns]', freq='SM-15') # 一個月 pd.date_range("2017-06-01","2017-08-01",freq="M") DatetimeIndex(['2017-06-30', '2017-07-31'], dtype='datetime64[ns]', freq='M') # 分鐘 pd.date_range("2017-06-01","2017-08-01",freq="T")
DatetimeIndex(['2017-06-01 00:00:00', '2017-06-01 00:01:00', '2017-06-01 00:02:00', '2017-06-01 00:03:00', '2017-06-01 00:04:00', '2017-06-01 00:05:00', '2017-06-01 00:06:00', '2017-06-01 00:07:00', '2017-06-01 00:08:00', '2017-06-01 00:09:00', ... '2017-07-31 23:51:00', '2017-07-31 23:52:00', '2017-07-31 23:53:00', '2017-07-31 23:54:00', '2017-07-31 23:55:00', '2017-07-31 23:56:00', '2017-07-31 23:57:00', '2017-07-31 23:58:00', '2017-07-31 23:59:00', '2017-08-01 00:00:00'], dtype='datetime64[ns]', length=87841, freq='T')
# 年 pd.date_range("2017-06-01","2019-08-01",freq="A") DatetimeIndex(['2017-12-31', '2018-12-31'], dtype='datetime64[ns]', freq='A-DEC') # 星期一 pd.date_range("2017-06-01","2017-08-01",freq="W-MON") DatetimeIndex(['2017-06-05', '2017-06-12', '2017-06-19', '2017-06-26', '2017-07-03', '2017-07-10', '2017-07-17', '2017-07-24', '2017-07-31'], dtype='datetime64[ns]', freq='W-MON')
periods 指定時間長度
# 從2017-06-01開始,產生20天 pd.date_range("2017-06-01",periods=20)
DatetimeIndex(['2017-06-01', '2017-06-02', '2017-06-03', '2017-06-04', '2017-06-05', '2017-06-06', '2017-06-07', '2017-06-08', '2017-06-09', '2017-06-10', '2017-06-11', '2017-06-12', '2017-06-13', '2017-06-14', '2017-06-15', '2017-06-16', '2017-06-17', '2017-06-18', '2017-06-19', '2017-06-20'], dtype='datetime64[ns]', freq='D')
# 從2017-06-01開始,產生20個周 pd.date_range("2017-06-01",periods=20,freq="W")
DatetimeIndex(['2017-06-04', '2017-06-11', '2017-06-18', '2017-06-25', '2017-07-02', '2017-07-09', '2017-07-16', '2017-07-23', '2017-07-30', '2017-08-06', '2017-08-13', '2017-08-20', '2017-08-27', '2017-09-03', '2017-09-10', '2017-09-17', '2017-09-24', '2017-10-01', '2017-10-08', '2017-10-15'], dtype='datetime64[ns]', freq='W-SUN')
df=pd.read_csv("d:/601318.csv",index_col="date",parse_dates=["date"]) df
2470 rows × 7 columns
type(df.index)
pandas.core.indexes.datetimes.DatetimeIndex
能夠看到df.index的類型就是pd.date_range以後的類型:DatetimeIndex DatetimeIndex這個類型能夠在查找時很是方便
# 查找 2017年的數據 df["2017"]
141 rows × 7 columns
# 查找 2017年8月的數據 df["2017-8"]
# 查找 2017年6月到9月的數據 df["2017-06":"2017-09"]
這裏是按照時間對象索引(相似於標籤索引),顧前也顧尾
df[:10]
求出股票行情的前5日和前10日的平均值(這裏是close列的平均值)
import numpy as np import pandas as pd
df=pd.read_csv("d:/ceshi.csv",index_col="date",parse_dates=["date"]) df
2470 rows × 7 columns
方案1:手動計算
# 思路:拿出每一行前5行的"close"列的數據,再mean()求出平均值,賦值給列"ma5" df2=df[:10].copy() df2.loc["2007-03-07","ma5"]=df2["close"][:6].mean() df2.loc["2007-03"]
# 建立兩列,並初始化爲nan df["ma5"]=np.nan df["ma10"]=np.nan
df
2470 rows × 9 columns
# 使用for循環一個一個的去賦值 for i in range(4,len(df)): df.loc[df.index[i],"ma5"]=df["close"][i-4:i+1].mean() for i in range(9,len(df)): df.loc[df.index[i],"ma10"]=df["close"][i-9:i+1].mean() df
2470 rows × 9 columns