《利用python進行數據分析》之數據處理

In [1]:

import pandas as pd
import numpy as np

刪除缺失值

In [2]:

# Build a 6x3 frame of standard-normal draws, then blank out part of it so the
# missing-value examples that follow have something to operate on.
df = pd.DataFrame(np.random.randn(6, 3))
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
df.iloc[:4, 1] = np.nan  # first four rows of column 1
df.iloc[:2, 2] = np.nan  # first two rows of column 2
df

Out[2]:

	0	1	2
0	0.230148	NaN	NaN
1	0.967690	NaN	NaN
2	0.296245	NaN	-0.905187
3	1.204742	NaN	1.765566
4	0.463110	1.423042	1.858276
5	1.272618	-0.033120	-1.269821

In [3]:

# thresh counts NON-missing values: a row is kept only if it has at least 2 of them.
# With three columns this means rows holding two or more NaN are dropped.
df.dropna(axis=0, thresh=2)

Out[3]:

	0	1	2
2	0.296245	NaN	-0.905187
3	1.204742	NaN	1.765566
4	0.463110	1.423042	1.858276
5	1.272618	-0.033120	-1.269821

In [4]:

# Same idea column-wise: keep only the columns with at least 3 non-missing values.
df.dropna(axis='columns', thresh=3)

Out[4]:

	0	2
0	0.230148	NaN
1	0.967690	NaN
2	0.296245	-0.905187
3	1.204742	1.765566
4	0.463110	1.858276
5	1.272618	-1.269821

補全缺失值

In [5]:

# Back-fill: each NaN takes the next valid value below it in the same column,
# filling at most 2 consecutive NaN per gap (longer gaps stay partly missing).
# DataFrame.bfill replaces the deprecated fillna(method='bfill') spelling.
df.bfill(limit=2)

Out[5]:

	0	1	2
0	0.230148	NaN	-0.905187
1	0.967690	NaN	-0.905187
2	0.296245	1.423042	-0.905187
3	1.204742	1.423042	1.765566
4	0.463110	1.423042	1.858276
5	1.272618	-0.033120	-1.269821

刪除重複值

In [6]:

# Small frame whose last row repeats the one before it, used by the
# duplicate-handling examples below.
df = pd.DataFrame(
    {
        'k1': ['one', 'two', 'one', 'two', 'one', 'two', 'two'],
        'k2': [1, 1, 2, 3, 3, 4, 4],
    }
)
df

Out[6]:

	k1	k2
0	one	1
1	two	1
2	one	2
3	two	3
4	one	3
5	two	4
6	two	4

In [7]:

# Boolean mask: True for every row that repeats an earlier row (first occurrence stays False).
df.duplicated(keep='first')

Out[7]:

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [8]:

# Remove repeated rows, retaining the first occurrence of each.
df.drop_duplicates(keep='first')

Out[8]:

	k1	k2
0	one	1
1	two	1
2	one	2
3	two	3
4	one	3
5	two	4

In [9]:

# Remove repeated rows, but retain the LAST occurrence (row 6 survives instead of row 5).
df.drop_duplicates(subset=None, keep='last')

Out[9]:

	k1	k2
0	one	1
1	two	1
2	one	2
3	two	3
4	one	3
6	two	4

In [10]:

# Judge duplicates on column k2 only; other columns are ignored for the comparison.
df.drop_duplicates(subset=['k2'])

Out[10]:

	k1	k2
0	one	1
2	one	2
3	two	3
5	two	4

替代值

In [11]:

# Swap the string labels for integers; values that are not keys of the mapping are untouched.
word_to_int = {'one': 1, 'two': 2}
df.replace(word_to_int)

Out[11]:

	k1	k2
0	1	1
1	2	1
2	1	2
3	2	3
4	1	3
5	2	4
6	2	4

重命名索引

In [12]:

# Relabel one row (6 -> 'repeat') and both columns; returns a new frame, df itself is unchanged.
df.rename(
    index={6: 'repeat'},
    columns={'k1': 'one', 'k2': 'two'},
)

Out[12]:

	one	two
0	one	1
1	two	1
2	one	2
3	two	3
4	one	3
5	two	4
repeat	two	4

離散化和分箱

In [13]:

# Sample ages and the bin edges used to group them.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
# Assign each age to its interval; intervals are right-closed by default, e.g. (18, 25].
cats = pd.cut(x=ages, bins=bins)
cats

Out[13]:

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [14]:

# Integer bin label of each entry in ages (position of its interval within cats.categories).
cats.codes

Out[14]:

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [15]:

# The intervals themselves, in order; cats.codes indexes into this.
cats.categories

Out[15]:

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

In [16]:

# Number of ages per bin, most frequent first.
# The top-level pd.value_counts helper is deprecated; wrapping the Categorical in a
# Series and calling .value_counts() gives the same counts in the same order.
pd.Series(cats).value_counts()

Out[16]:

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [17]:

# Switch to left-closed, right-open intervals: [18, 25) instead of (18, 25].
pd.cut(x=ages, bins=bins, right=False)

Out[17]:

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

In [18]:

# Give the bins custom names instead of interval labels (one name per bin, in bin order).
age_group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=age_group_names)

Out[18]:

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

In [19]:

# Pass a bin COUNT instead of explicit edges: the data range is split into 4
# equal-width bins, with the displayed edges rounded to 2 decimal digits.
data = np.random.rand(20)
pd.cut(data, bins=4, precision=2)

Out[19]:

[(0.26, 0.5], (0.018, 0.26], (0.26, 0.5], (0.26, 0.5], (0.26, 0.5], ..., (0.5, 0.74], (0.5, 0.74], (0.74, 0.98], (0.018, 0.26], (0.74, 0.98]]
Length: 20
Categories (4, interval[float64]): [(0.018, 0.26] < (0.26, 0.5] < (0.5, 0.74] < (0.74, 0.98]]

In [20]:

# Bin by sample quantiles rather than by value range: q=4 yields quartiles,
# so every bin receives the same number of observations.
data = np.random.randn(1000)
cats = pd.qcut(data, q=4)
cats

Out[20]:

[(-3.302, -0.702], (-3.302, -0.702], (-0.702, 0.00318], (0.713, 3.516], (-0.702, 0.00318], ..., (-0.702, 0.00318], (0.713, 3.516], (0.00318, 0.713], (0.00318, 0.713], (0.00318, 0.713]]
Length: 1000
Categories (4, interval[float64]): [(-3.302, -0.702] < (-0.702, 0.00318] < (0.00318, 0.713] < (0.713, 3.516]]

In [21]:

# Confirm the quartile bins are equally populated.
# pd.value_counts (top-level) is deprecated; Series.value_counts is the supported equivalent.
pd.Series(cats).value_counts()

Out[21]:

(0.713, 3.516]       250
(0.00318, 0.713]     250
(-0.702, 0.00318]    250
(-3.302, -0.702]     250
dtype: int64

In [22]:

# Custom quantile edges (fractions between 0 and 1) instead of an equal split.
quantile_edges = [0, 0.1, 0.5, 0.9, 1.]
pd.qcut(data, quantile_edges)

Out[22]:

[(-1.379, 0.00318], (-3.302, -1.379], (-1.379, 0.00318], (0.00318, 1.264], (-1.379, 0.00318], ..., (-1.379, 0.00318], (0.00318, 1.264], (0.00318, 1.264], (0.00318, 1.264], (0.00318, 1.264]]
Length: 1000
Categories (4, interval[float64]): [(-3.302, -1.379] < (-1.379, 0.00318] < (0.00318, 1.264] < (1.264, 3.516]]

檢測和過濾異常值

In [23]:

data = pd.DataFrame(np.random.randn(1000, 4))
# Rows in which at least one value has absolute value greater than 3.
# axis must be passed by keyword: pandas 2.x no longer accepts it positionally.
data[(np.abs(data) > 3).any(axis=1)]

Out[23]:

	0	1	2	3
55	-0.378616	0.023351	-0.390768	3.044924
73	2.223267	0.135439	-2.165250	3.190073
96	-0.442352	-0.241786	-3.061948	0.596391
199	-1.548635	-3.035343	0.090680	0.204924
351	3.095982	-1.234948	-1.707739	2.320149
376	-0.202145	-3.108550	0.066009	-0.571813
377	3.428047	-0.972084	-0.128787	0.898637
547	-2.163697	0.421286	-1.506587	-3.123146
638	-4.669923	0.938394	-0.719577	-1.615580
809	0.404150	-0.106034	-0.022086	3.192218
922	0.045469	0.224181	3.349797	-1.413839
972	0.438964	-0.577106	1.240444	3.074443

In [24]:

# Plant one row whose every value is extreme so the filter below has a hit.
# Note this modifies `data` in place.
data.iloc[500] = [4, 5, 6, -7]
# Rows in which ALL values have absolute value greater than 3.
# axis must be passed by keyword: pandas 2.x no longer accepts it positionally.
data[(np.abs(data) > 3).all(axis=1)]

Out[24]:

	0	1	2	3
500	4.0	5.0	6.0	-7.0

In [25]:

# Element-wise sign of each value (+1.0 / -1.0, or 0.0 for an exact zero); preview the first five rows.
np.sign(data).head(5)

Out[25]:

	0	1	2	3
0	1.0	-1.0	1.0	1.0
1	-1.0	1.0	1.0	1.0
2	1.0	-1.0	1.0	1.0
3	1.0	-1.0	1.0	-1.0
4	-1.0	-1.0	1.0	1.0

隨機抽樣

In [26]:

# Draw 5 rows at random; no random_state is given, so the selection differs on every run.
data.sample(n=5)

Out[26]:

	0	1	2	3
508	0.945668	-1.054974	-0.485197	0.400421
68	1.679209	-0.545147	-0.329424	-0.140010
356	1.756743	0.583158	0.201458	-0.622671
760	1.532128	-0.677541	-0.095587	-0.489332
194	0.960248	-0.016510	0.116253	0.070162

類別統計

In [27]:

# Toy frame with a categorical 'key' column for the indicator-variable example.
df = pd.DataFrame(
    {
        'key': list('bbacab'),
        'data': range(6),
    }
)
df

Out[27]:

	key	data
0	b	0
1	b	1
2	a	2
3	c	3
4	a	4
5	b	5

In [28]:

# One indicator column per distinct key value; each row is flagged in the column matching its key.
pd.get_dummies(df['key'])

Out[28]:

	a	b	c
0	0	1	0
1	0	1	0
2	1	0	0
3	0	0	1
4	1	0	0
5	0	1	0

In [29]:

# A single row may belong to several categories at once, which makes things more involved.
# engine='python' is requested explicitly: the multi-character '::' separator is not
# supported by the C parser, and omitting it makes pandas emit a ParserWarning before
# falling back to the python engine anyway.
movies = pd.read_csv('datasets/movielens/movies.dat', sep='::', header=None,
                     names=['movie_id', 'title', 'genres'], engine='python')
movies.head()

/Users/finnchan/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:3: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  This is separate from the ipykernel package so we can avoid doing imports until

Out[29]:

	movie_id	title	genres
0	1	Toy Story (1995)	Animation\|Children's\|Comedy
1	2	Jumanji (1995)	Adventure\|Children's\|Fantasy
2	3	Grumpier Old Men (1995)	Comedy\|Romance
3	4	Waiting to Exhale (1995)	Comedy\|Drama
4	5	Father of the Bride Part II (1995)	Comedy

In [30]:

# Gather every genre label found in the pipe-separated `genres` column.
all_genres = []
# The loop variable is deliberately NOT called `genres`, so it cannot be confused
# with the result assigned below.
for genre_field in movies['genres']:
    all_genres.extend(genre_field.split('|'))
# Distinct labels in first-seen order. The list is wrapped in a Series because
# calling pd.unique on a plain Python list is deprecated.
genres = pd.unique(pd.Series(all_genres))

In [31]:

# Display the distinct genre labels collected above.
genres

Out[31]:

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)

In [32]:

# All-zero indicator matrix: one row per movie, one column per genre label.
indicator_shape = (len(movies), len(genres))
count_df = pd.DataFrame(np.zeros(shape=indicator_shape), columns=genres)
count_df.head()

Out[32]:

	Animation	Children's	Comedy	Adventure	Fantasy	Romance	Drama	Action	Crime	Thriller	Horror	Sci-Fi	Documentary	War	Musical	Mystery	Film-Noir	Western
0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
1	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
2	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
3	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
4	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0

In [33]:

# Looping over the genre labels (rather than over every row of the much larger
# `movies` frame) keeps this fast.
for genre in genres:
    # regex=False: match the label as literal text instead of interpreting it as a
    # regular expression. NOTE: this is still a substring test, so a label that is
    # contained inside another label would match that one too.
    count_df.loc[movies['genres'].str.contains(genre, regex=False), genre] = 1
count_df.head()

Out[33]:

	Animation	Children's	Comedy	Adventure	Fantasy	Romance	Drama
0	1.0	1.0	1.0	0.0	0.0	0.0	0.0
1	0.0	1.0	0.0	1.0	1.0	0.0	0.0
2	0.0	0.0	1.0	0.0	0.0	1.0	0.0
3	0.0	0.0	1.0	0.0	0.0	0.0	1.0
4	0.0	0.0	1.0	0.0	0.0	0.0	0.0

正則表達式

In [34]:

val = 'a,b,  guido'
# Position of the first comma in the string.
val.index(',')

Out[34]:

1

In [35]:

# str.index raises an error when the substring is absent, whereas str.find returns -1.
val.find(':')

Out[35]:

-1

In [36]:

import re
text = 'foo  bar\t baz  \tqux'
# Compiled regular-expression object matching runs of one or more whitespace characters.
# The pattern is a raw string: in a normal string literal '\s' is an invalid escape
# sequence, which newer Python versions warn about.
regex = re.compile(r'\s+')
regex.findall(text)

Out[36]:

['  ', '\t ', '  \t']

In [37]:

text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
# Three capture groups: user name, domain name, domain suffix.
# Because the pattern contains groups, findall yields one tuple per match.
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, re.IGNORECASE)  # case-insensitive, so lowercase addresses match
regex.findall(text)

Out[37]:

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [38]:

# \1, \2 and \3 in the replacement refer back to the three captured groups of each match.
labelled_template = r'Username:\1,Domain:\2,Suffix:\3'
print(regex.sub(labelled_template, text))

Dave Username:dave,Domain:google,Suffix:com
Steve Username:steve,Domain:gmail,Suffix:com
Rob Username:rob,Domain:gmail,Suffix:com
Ryan Username:ryan,Domain:yahoo,Suffix:com

數據合併

In [39]:

# Left table: a repeated 'key' column plus a running integer value.
left = pd.DataFrame(
    {
        'key': list('abaabc'),
        'value': range(6),
    }
)
left

Out[39]:

	key	value
0	a	0
1	b	1
2	a	2
3	a	3
4	b	4
5	c	5

In [40]:

# Right table: one value per group, with the group label stored in the row index.
right = pd.DataFrame(
    {'group_val': [3.5, 7]},
    index=list('ab'),
)
right

Out[40]:

	group_val
a	3.5
b	7.0

In [41]:

# Join left's 'key' column against right's row index (the index acts as right's join key).
# Default inner join: key 'c' has no partner in right and is dropped.
left.merge(right, left_on='key', right_index=True)

Out[41]:

	key	value	group_val
0	a	0	3.5
2	a	2	3.5
3	a	3	3.5
1	b	1	7.0
4	b	4	7.0

In [42]:

# When the tables share no common key to merge on, concat can be considered instead.

df1 = pd.DataFrame(
    np.arange(6).reshape(3, 2),
    index=['a', 'b', 'c'],
    columns=['one', 'two'],
)
df1

Out[42]:

	one	two
a	0	1
b	2	3
c	4	5

In [43]:

# Second frame: different columns, and only rows 'a' and 'c'.
df2 = pd.DataFrame(
    np.arange(5, 9).reshape(2, 2),
    index=['a', 'c'],
    columns=['three', 'four'],
)
df2

Out[43]:

	three	four
a	5	6
c	7	8

In [44]:

# Stack the frames vertically; `keys` adds an outer index level recording the source frame,
# and sort=True orders the combined columns alphabetically.
pd.concat(
    [df1, df2],
    axis=0,
    keys=['df1', 'df2'],
    sort=True,
)

Out[44]:

		four	one	three	two
df1	a	NaN	0.0	NaN	1.0
	b	NaN	2.0	NaN	3.0
	c	NaN	4.0	NaN	5.0
df2	a	6.0	NaN	5.0	NaN
df2	c	8.0	NaN	7.0	NaN

In [45]:

# Place the frames side by side; `keys` becomes the outer level of the column labels
# and rows are aligned on the index (row 'b' has no counterpart in df2, hence NaN).
pd.concat(
    [df1, df2],
    axis='columns',
    keys=['df1', 'df2'],
    sort=True,
)

Out[45]:

	df1		df2
	one	two	three	four
a	0	1	5.0	6.0
b	2	3	NaN	NaN
c	4	5	7.0	8.0

In [46]:

# Discard the original row labels and number the result afresh from 0.
pd.concat(
    [df1, df2],
    ignore_index=True,
    sort=True,
)

Out[46]:

	four	one	three	two
0	NaN	0.0	NaN	1.0
1	NaN	2.0	NaN	3.0
2	NaN	4.0	NaN	5.0
3	6.0	NaN	5.0	NaN
4	8.0	NaN	7.0	NaN

In [47]:

# Frame with gaps in 'a' and 'b' that the combine_first example will patch.
df1 = pd.DataFrame(
    {
        'a': [1.0, np.nan, 5.0, np.nan],
        'b': [np.nan, 2.0, np.nan, 6.0],
        'c': range(2, 18, 4),
    }
)
df1

Out[47]:

	a	b	c
0	1.0	NaN	2
1	NaN	2.0	6
2	5.0	NaN	10
3	NaN	6.0	14

In [48]:

# Donor frame: one row longer than df1 and without a 'c' column.
df2 = pd.DataFrame(
    {
        'a': [5.0, 4.0, np.nan, 3.0, 7.0],
        'b': [np.nan, 3.0, 4.0, 6.0, 8.0],
    }
)
df2

Out[48]:

	a	b
0	5.0	NaN
1	4.0	3.0
2	NaN	4.0
3	3.0	6.0
4	7.0	8.0

In [49]:

# Equivalent to np.where(pd.isnull(a), b, a) applied per aligned cell:
# keep df1's value where present, otherwise take df2's. The result covers
# the union of both indexes, so df2's extra row 4 appears as well.
df1.combine_first(df2)

Out[49]:

	a	b	c
0	1.0	NaN	2.0
1	4.0	2.0	6.0
2	5.0	4.0	10.0
3	3.0	6.0	14.0
4	7.0	8.0	NaN

數據透視

In [50]:

# Load the macroeconomic sample data (path is relative to the notebook's working directory).
# Note: this rebinds `data`, which earlier cells used for unrelated random arrays/frames.
data = pd.read_csv('examples/macrodata.csv')
data.head()

Out[50]:

	year	quarter	realgdp	realcons	realinv	realgovt	realdpi	cpi	m1	tbilrate	unemp	pop	infl	realint
0	1959.0	1.0	2710.349	1707.4	286.898	470.045	1886.9	28.98	139.7	2.82	5.8	177.146	0.00	0.00
1	1959.0	2.0	2778.801	1733.7	310.859	481.301	1919.7	29.15	141.7	3.08	5.1	177.830	2.34	0.74
2	1959.0	3.0	2775.488	1751.8	289.226	491.260	1916.4	29.35	140.5	3.82	5.3	178.657	2.74	1.09
3	1959.0	4.0	2785.204	1753.7	299.356	484.052	1931.3	29.37	140.0	4.33	5.6	179.386	0.27	4.06
4	1960.0	1.0	2847.699	1770.5	331.722	462.199	1955.5	29.54	139.6	3.50	5.2	180.007	2.31	1.19

In [51]:

# Combine the year and quarter columns into a quarterly PeriodIndex named 'date'.
# The periods are built from "1959Q1"-style labels because the
# PeriodIndex(year=..., quarter=...) constructor form is deprecated in pandas 2.2;
# parsing labels works on both old and new pandas versions.
# The columns are stored as floats (1959.0, 1.0), hence the cast to int first.
quarter_labels = (
    data['year'].astype(int).astype(str)
    + 'Q'
    + data['quarter'].astype(int).astype(str)
)
periods = pd.PeriodIndex(quarter_labels.tolist(), freq='Q-DEC', name='date')
periods

Out[51]:

PeriodIndex(['1959Q1', '1959Q2', '1959Q3', '1959Q4', '1960Q1', '1960Q2',
             '1960Q3', '1960Q4', '1961Q1', '1961Q2',
             ...
             '2007Q2', '2007Q3', '2007Q4', '2008Q1', '2008Q2', '2008Q3',
             '2008Q4', '2009Q1', '2009Q2', '2009Q3'],
            dtype='period[Q-DEC]', name='date', length=203, freq='Q-DEC')

In [52]:

# Columns to keep, wrapped in a named Index so that the column axis carries the name 'item'.
selected_items = ['realgdp', 'infl', 'unemp']
columns = pd.Index(selected_items, name='item')
columns

Out[52]:

Index(['realgdp', 'infl', 'unemp'], dtype='object', name='item')

In [53]:

# Keep only the three selected columns (in that order); the column axis takes the name 'item'.
# This rebinds `data`, so the other macro columns are no longer available afterwards.
data = data.reindex(columns=columns)
data.head()

Out[53]:

item	realgdp	infl	unemp
0	2710.349	0.00	5.8
1	2778.801	2.34	5.1
2	2775.488	2.74	5.3
3	2785.204	0.27	5.6
4	2847.699	2.31	5.2

In [54]:

# Replace the integer row index with timestamps marking the END of each quarter
# (daily frequency), which is why the labels read 23:59:59.999999999.
data.index = periods.to_timestamp(freq='D', how='end')

In [55]:

# Preview the frame now that it is indexed by quarter-end timestamp.
data.head()

Out[55]:

item	realgdp	infl	unemp
date
1959-03-31 23:59:59.999999999	2710.349	0.00	5.8
1959-06-30 23:59:59.999999999	2778.801	2.34	5.1
1959-09-30 23:59:59.999999999	2775.488	2.74	5.3
1959-12-31 23:59:59.999999999	2785.204	0.27	5.6
1960-03-31 23:59:59.999999999	2847.699	2.31	5.2

In [56]:

# Wide -> long: stack moves the column labels into an inner index level, reset_index
# turns both index levels into ordinary columns, and the unnamed value column
# (labelled 0) is renamed to 'value'.
long_form = data.stack().reset_index()
data = long_form.rename(columns={0: 'value'})
data.head()

Out[56]:

	date	item	value
0	1959-03-31 23:59:59.999999999	realgdp	2710.349
1	1959-03-31 23:59:59.999999999	infl	0.000
2	1959-03-31 23:59:59.999999999	unemp	5.800
3	1959-06-30 23:59:59.999999999	realgdp	2778.801
4	1959-06-30 23:59:59.999999999	infl	2.340

In [57]:

# Long -> wide again: 'date' supplies the rows, 'item' the columns, 'value' the cells.
# The arguments are passed by keyword because pandas 2.x no longer accepts them positionally.
data.pivot(index='date', columns='item', values='value').head()

Out[57]:

item	infl	realgdp	unemp
date
1959-03-31 23:59:59.999999999	0.00	2710.349	5.8
1959-06-30 23:59:59.999999999	2.34	2778.801	5.1
1959-09-30 23:59:59.999999999	2.74	2775.488	5.3
1959-12-31 23:59:59.999999999	0.27	2785.204	5.6
1960-03-31 23:59:59.999999999	2.31	2847.699	5.2

In [58]:

# Small wide-format frame: an identifier column plus three measurement columns.
wide_columns = {
    'key': ['foo', 'bar', 'baz'],
    'A': [1, 2, 3],
    'B': [4, 5, 6],
    'C': [7, 8, 9],
}
df = pd.DataFrame(wide_columns)
df

Out[58]:

	key	A	B	C
0	foo	1	4	7
1	bar	2	5	8
2	baz	3	6	9

In [59]:

# The inverse of pivot: 'key' is kept as the identifier column and every other
# column is unpivoted into (variable, value) pairs.
pd.melt(df, id_vars=['key'])

Out[59]:

	key	variable	value
0	foo	A	1
1	bar	A	2
2	baz	A	3
3	foo	B	4
4	bar	B	5
5	baz	B	6
6	foo	C	7
7	bar	C	8
8	baz	C	9

	Animation	Children's	Comedy	Adventure	Fantasy	Romance	Drama	Action	Crime	Thriller	Horror	Sci-Fi	Documentary	War	Musical	Mystery	Film-Noir	Western
0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
1	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
2	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
3	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
4	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0

	Animation	Children's	Comedy	Adventure	Fantasy	Romance	Drama
0	1.0	1.0	1.0	0.0	0.0	0.0	0.0
1	0.0	1.0	0.0	1.0	1.0	0.0	0.0
2	0.0	0.0	1.0	0.0	0.0	1.0	0.0
3	0.0	0.0	1.0	0.0	0.0	0.0	1.0
4	0.0	0.0	1.0	0.0	0.0	0.0	0.0

		four	one	three	two
df1	a	NaN	0.0	NaN	1.0
	b	NaN	2.0	NaN	3.0
	c	NaN	4.0	NaN	5.0
df2	a	6.0	NaN	5.0	NaN
df2	c	8.0	NaN	7.0	NaN

	four	one	three	two
0	NaN	0.0	NaN	1.0
1	NaN	2.0	NaN	3.0
2	NaN	4.0	NaN	5.0
3	6.0	NaN	5.0	NaN
4	8.0	NaN	7.0	NaN

	Animation	Children's	Comedy	Adventure	Fantasy	Romance	Drama	Action	Crime	Thriller	Horror	Sci-Fi	Documentary	War	Musical	Mystery	Film-Noir	Western
0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
1	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
2	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
3	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
4	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0

	Animation	Children's	Comedy	Adventure	Fantasy	Romance	Drama
0	1.0	1.0	1.0	0.0	0.0	0.0	0.0
1	0.0	1.0	0.0	1.0	1.0	0.0	0.0
2	0.0	0.0	1.0	0.0	0.0	1.0	0.0
3	0.0	0.0	1.0	0.0	0.0	0.0	1.0
4	0.0	0.0	1.0	0.0	0.0	0.0	0.0

		four	one	three	two
df1	a	NaN	0.0	NaN	1.0
	b	NaN	2.0	NaN	3.0
	c	NaN	4.0	NaN	5.0
df2	a	6.0	NaN	5.0	NaN
df2	c	8.0	NaN	7.0	NaN

	four	one	three	two
0	NaN	0.0	NaN	1.0
1	NaN	2.0	NaN	3.0
2	NaN	4.0	NaN	5.0
3	6.0	NaN	5.0	NaN
4	8.0	NaN	7.0	NaN

	Animation	Children's	Comedy	Adventure	Fantasy	Romance	Drama	Action	Crime	Thriller	Horror	Sci-Fi	Documentary	War	Musical	Mystery	Film-Noir	Western
0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
1	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
2	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
3	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
4	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0

	Animation	Children's	Comedy	Adventure	Fantasy	Romance	Drama
0	1.0	1.0	1.0	0.0	0.0	0.0	0.0
1	0.0	1.0	0.0	1.0	1.0	0.0	0.0
2	0.0	0.0	1.0	0.0	0.0	1.0	0.0
3	0.0	0.0	1.0	0.0	0.0	0.0	1.0
4	0.0	0.0	1.0	0.0	0.0	0.0	0.0

		four	one	three	two
df1	a	NaN	0.0	NaN	1.0
	b	NaN	2.0	NaN	3.0
	c	NaN	4.0	NaN	5.0
df2	a	6.0	NaN	5.0	NaN
df2	c	8.0	NaN	7.0	NaN

	four	one	three	two
0	NaN	0.0	NaN	1.0
1	NaN	2.0	NaN	3.0
2	NaN	4.0	NaN	5.0
3	6.0	NaN	5.0	NaN
4	8.0	NaN	7.0	NaN