Python
數據分析:讓你像寫 Sql
語句同樣,使用 Pandas
作數據分析import pandas as pd import numpy as np url = ('https://raw.github.com/pandas-dev/pandas/master/pandas/tests/data/tips.csv') tips = pd.read_csv(url) output = tips.head()
Output:html
total_bill tip sex smoker day time size 0 16.99 1.01 Female No Sun Dinner 2 1 10.34 1.66 Male No Sun Dinner 3 2 21.01 3.50 Male No Sun Dinner 3 3 23.68 3.31 Male No Sun Dinner 2 4 24.59 3.61 Female No Sun Dinner 4
sql 語句: SELECT total_bill, tip, smoker, time FROM tips LIMIT 5;
。python
output = tips[['total_bill', 'tip', 'smoker', 'time']].head(5)
Output:git
total_bill tip smoker time 0 16.99 1.01 No Dinner 1 10.34 1.66 No Dinner 2 21.01 3.50 No Dinner 3 23.68 3.31 No Dinner 4 24.59 3.61 No Dinner
sql 語句: SELECT * FROM tips WHERE time = 'Dinner' LIMIT 5;
github
output = tips[tips['time'] == 'Dinner'].head(5) # 或者 output = tips.query("time == 'Dinner'").head(5)
Output:sql
total_bill tip sex smoker day time size 0 16.99 1.01 Female No Sun Dinner 2 1 10.34 1.66 Male No Sun Dinner 3 2 21.01 3.50 Male No Sun Dinner 3 3 23.68 3.31 Male No Sun Dinner 2 4 24.59 3.61 Female No Sun Dinner 4
==
、 大於 >
、 大於等於 >=
、小於等於 <=
、不等於 !=
==
sql 語句:SELECT * FROM tips WHERE time = 'Dinner';
。dom
output = tips[(tips['time'] == 'Dinner')]
>
sql 語句:SELECT * FROM tips WHERE tip > 5.00;
。url
output = tips[(tips['tip'] > 5.00)]
>=
sql 語句:SELECT * FROM tips WHERE tip >= 5.00;
。code
output = tips[(tips['size'] >= 5)]
<=
sql 語句:SELECT * FROM tips WHERE tip <= 5.00;
。htm
output = tips[(tips['size'] <= 5)]
!=
sql 語句:SELECT * FROM tips WHERE tip <> 5.00;
。blog
output = tips[(tips['size'] != 5)]
&
、或 |
、非 -
&
sql 語句:SELECT * FROM tips WHERE time = 'Dinner' AND tip > 5.00;
output = tips[(tips['time'] == 'Dinner') & (tips['tip'] > 5.00)]
|
sql 語句:SELECT * FROM tips WHERE size >= 5 OR total_bill > 45;
。
output = tips[(tips['size'] >= 5) | (tips['total_bill'] > 45)]
-
sql 語句:SELECT * FROM tips WHERE not (size <> 5 AND size > 4);
output = df[-((df['size'] != 5) & (df['size'] > 4))]
這裏從新定義一個包含 NaN
數據的 DataFrame
。
frame = pd.DataFrame({ 'col1': ['A', 'B', np.NaN, 'C', 'D'], 'col2': ['F', np.NaN, 'G', 'H', 'I'] }) output = frame
Output:
col1 col2 0 A F 1 B NaN 2 NaN G 3 C H 4 D I
sql 語句:SELECT * FROM frame WHERE col2 IS NULL;
。
output = frame[frame['col2'].isna()]
Output:
col1 col2 1 B NaN
sql 語句:SELECT * FROM frame WHERE col1 IS NOT NULL;
。
output = frame[frame['col1'].notna()]
Output:
col1 col2 0 A F 1 B NaN 3 C H 4 D I
sql 語句:SELECT * FROM tips WHERE siez in (5, 6);
。
output = tips[tips['size'].isin([2, 5])]
sql 語句:SELECT * FROM tips WHERE time like 'Din%';
。
output = tips[tips.time.str.contains('Din*')]
sql 語句:SELECT sex, count(*) FROM tips GROUP BY sex;
output = tips.groupby('sex').size() # 獲取相應的結果 output['Male'] output['Female']
output = tips.groupby('sex').count() # 獲取相應的結果 output['tip']['Female']
output = tips.groupby('sex')['total_bill'].count() # 獲取相應的結果 output['Male'] output['Female']
sql 語句:SELECT day, AVG(tip), COUNT(*) FROM tips GROUP BY day;
output = tips.groupby('day').agg({'tip': np.mean, 'day': np.size}) # 獲取相應的結果 output['day']['Fri'] output['tip']['Fri']
sql 語句:SELECT smoker, day, COUNT(*), AVG(tip) FROM tips GROUP BY smoker, day;
output = tips.groupby(['smoker', 'day']).agg({'tip': [np.size, np.mean]}) # 獲取相應的結果 output['tip']['size']['No']['Fri']
sql 語句:SELECT tip, count(distinct sex) FROM tips GROUP BY tip;
output = tips.groupby('tip').agg({'sex': pd.Series.nunique})
定義兩個 DataFrame。
df1 = pd.DataFrame({'key': ['A', 'B', 'C', 'D'], 'value': np.random.randn(4)}) df2 = pd.DataFrame({'key': ['B', 'D', 'D', 'E'], 'value': np.random.randn(4)})
sql 語句:SELECT * FROM df1 INNER JOIN df2 ON df1.key = df2.key;
output = pd.merge(df1, df2, on='key') # 或 indexed_df2 = df2.set_index('key') pd.merge(df1, indexed_df2, left_on='key', right_index=True)
sql 語句:SELECT * FROM df1 LEFT OUTER JOIN df2 ON df1.key = df2.key;
output = pd.merge(df1, df2, on='key', how='left') # 或 output = df1.join(df2, on='key', how='left')
sql 語句:SELECT * FROM df1 RIGHT OUTER JOIN df2 ON df1.key = df2.key;
output = pd.merge(df1, df2, on='key', how='right')
sql 語句:SELECT * FROM df1 FULL OUTER JOIN df2 ON df1.key = df2.key;
output = pd.merge(df1, df2, on='key', how='outer')
df1 = pd.DataFrame({'city': ['Chicago', 'San Francisco', 'New York City'], 'rank': range(1, 4)}) df2 = pd.DataFrame({'city': ['Chicago', 'Boston', 'Los Angeles'], 'rank': [1, 4, 5]})
sql 語句:SELECT city, rank FROM df1 UNION ALL SELECT city, rank FROM df2;
output = pd.concat([df1, df2])
sql 語句:SELECT city, rank FROM df1 UNION SELECT city, rank FROM df2;
output = pd.concat([df1, df2]).drop_duplicates()
sql 語句:SELECT DISTINCT sex FROM tips;
output = tips.drop_duplicates(subset=['sex'], keep='first', inplace=False)
sql 語句:SELECT total_bill AS total, sex AS xes FROM tips;
output = tips.rename(columns={'total_bill': 'total', 'sex': 'xes'}, inplace=False)
sql 語句:SELECT * FROM tips ORDER BY tip DESC LIMIT 10 OFFSET 5;
output = tips.nlargest(10 + 5, columns='tip').tail(10)
sql 語句:
SELECT * FROM ( SELECT t.*, ROW_NUMBER() OVER(PARTITION BY day ORDER BY total_bill DESC) AS rn FROM tips t ) WHERE rn < 3 ORDER BY day, rn;
output = tips.assign(rn=tips.sort_values(['total_bill'], ascending=False).\ groupby(['day']).cumcount() + 1).\ query('rn < 3').\ sort_values(['day', 'rn'])
sql 語句:UPDATE tips SET tip = tip*2 WHERE tip < 2;
output = tips.loc[tips['tip'] < 2, 'tip'] *= 2
sql 語句:DELETE FROM tips WHERE tip > 9;
output = tips = tips.loc[tips['tip'] <= 9]