import pandas as pd import string #建立Series的兩種方式 #方式一 t = pd.Series([1,2,3,4,43],index=list('asdfg')) print(t) #方式二 temp_dict = {'name':'xiaohong','age':30,'tel':10086} t2 = pd.Series(temp_dict) print(t2) #字典推導式 a = {string.ascii_uppercase[i]:i for i in range(10)} print(a) print(pd.Series(a)) print(pd.Series(a,index=list(string.ascii_uppercase[5:15])))
import pandas as pd from pymongo import MongoClient #pandas讀取csv文件 # df = pd.read_csv('dogNames2.csv') # print(df) client = MongoClient() collection = client['meipai']['meipai_video'] data = collection.find() data_list = [] for i in data: temp = {} temp['cut_url'] = i['cut_url'] temp['create_time'] = i['create_time'] temp['title'] = i['title'] temp['video_url'] = i['video_url'] data_list.append(temp) # print(data) # t1 = data[0] # t1 = pd.Series(t1) # print(t1) df = pd.DataFrame(data_list) print(df.info()) print(df.describe()) # print(df.head()) # print('*'*100) # print(df.tail())
import pandas as pd temp_dict = {'name':['xiaohong','xiaozhang'],'age':[30,23],'tel':[10086,10010]} t1 = pd.DataFrame(temp_dict) print(t1) temp_dict1 = [{'name':'xiaohong','age':23,'tel':10086},{'name':'xiaogang','age':12},{'name':'xiaozhang','tel':10010}] t2 = pd.DataFrame(temp_dict1) print(t2)
import pandas as pd #pandas讀取csv文件 df = pd.read_csv('dogNames2.csv') # print(df.head()) # print(df.info()) #DataFrame中排序的方法 df = df.sort_values(by='Count_AnimalName',ascending=False) # print(df.head()) #pandas取行和列的注意事項 # - 方括號寫數組,表示取行,對行進行操做 # - 寫字符串,表示取列索引,對列進行操做 print(df[:20]) print(df[:20]['Row_Labels']) print(type(df['Row_Labels'])) #bool索引 print(df[(df['Row_Labels'].str.len()>4)&(df['Count_AnimalName']>800)])
import pandas as pd from matplotlib import pyplot as plt file_path = './IMDB-Movie-data.csv' df = pd.read_csv(file_path) # print(df.head(1)) # print(df.info()) # rating,runtime分佈狀況 # 選擇圖形:直方圖 # 準備數據 runtime_data = df['Runtime (Minutes)'].values max_runtime = runtime_data.max() min_runtime = runtime_data.min() #計算組距 num_bin = (max_runtime-min_runtime)//5 #設置圖行大小 plt.figure(figsize=(13,6),dpi=80) #畫直方圖 plt.hist(runtime_data,num_bin) plt.xticks(range(min_runtime,max_runtime+5,5)) #顯示 plt.show()
電影案例二python
import pandas as pd from matplotlib import pyplot as plt from functools import reduce file_path = './IMDB-Movie-data.csv' df = pd.read_csv(file_path) # print(df.head(1)) # print(df.info()) # rating,runtime分佈狀況 # 選擇圖形:直方圖 # 準備數據 # runtime_data = df['Runtime (Minutes)'].values rate_data = df['Rating'].values max_rate = rate_data.max() min_rate = rate_data.min() #設置不等寬組距,hist方法中取到的會是一個左閉右開的區間[1,9,3.5) num_bin_list = [1.9,3.5] i = 3.5 while i<=max_rate: i += 0.5 num_bin_list.append(i) print(num_bin_list) #設置圖形大小 plt.figure(figsize=(13,6),dpi=80) #畫直方圖 plt.hist(rate_data,num_bin_list) #xticks讓以前的組距可以對上 plt.xticks(num_bin_list) #顯示 plt.show()
[1.9, 3.5, 4.0, 4.5, 5.0, 5.5, 6.0, 6.5, 7.0, 7.5, 8.0, 8.5, 9.0, 9.5]
import numpy import pandas as pd df = pd.read_csv('IMDB-Movie-Data.csv') print(df.info()) print(df.describe()) #獲取評分的均分 rate_mean = df.Rating.mean() print(rate_mean) #獲取導演的人數 print(df.Director.value_counts().count()) print(len(set(df.Director.tolist()))) print(len(df.Director.unique())) #獲取演員的人數 temp_actors_list = df.Actors.str.split(',').tolist() actors_list = [i for j in temp_actors_list for i in j] # numpy.array(temp_actors_list).flatten() actors_num = len(set(actors_list)) print(actors_num) <class 'pandas.core.frame.DataFrame'> RangeIndex: 1000 entries, 0 to 999 Data columns (total 12 columns): Rank 1000 non-null int64 Title 1000 non-null object Genre 1000 non-null object Description 1000 non-null object Director 1000 non-null object Actors 1000 non-null object Year 1000 non-null int64 Runtime (Minutes) 1000 non-null int64 Rating 1000 non-null float64 Votes 1000 non-null int64 Revenue (Millions) 872 non-null float64 Metascore 936 non-null float64 dtypes: float64(3), int64(4), object(5) memory usage: 93.8+ KB None Rank Year ... Revenue (Millions) Metascore count 1000.000000 1000.000000 ... 872.000000 936.000000 mean 500.500000 2012.783000 ... 82.956376 58.985043 std 288.819436 3.205962 ... 103.253540 17.194757 min 1.000000 2006.000000 ... 0.000000 11.000000 25% 250.750000 2010.000000 ... 13.270000 47.000000 50% 500.500000 2014.000000 ... 47.985000 59.500000 75% 750.250000 2016.000000 ... 113.715000 72.000000 max 1000.000000 2016.000000 ... 936.630000 100.000000 [8 rows x 7 columns]
# -*- coding: utf-8 -*- """ @Datetime: 2018/11/19 @Author: Zhang Yafei """ """ 對於這一組電影數據,若是咱們但願統計電影分類(genre)的狀況,應該如何處理數據? 思路:從新構造一個全爲0的數組,列名爲分類,若是某一條數據中分類出現過,就讓0變爲1 """ import numpy as np import pandas as pd from matplotlib import pyplot as plt from matplotlib import font_manager #中文字體 my_font = font_manager.FontProperties(family='SimHei') #顯示完整的列 pd.set_option('display.max_columns', None) df = pd.read_csv('IMDB-Movie-Data.csv') #統計分類列表 temp_list = df.Genre.str.split(',').tolist() genre_list = list(set([i for j in temp_list for i in j])) #構造全爲0的數組 zero_df = pd.DataFrame(np.zeros((df.shape[0],len(genre_list))),columns=genre_list) # print(zero_df) #給每一個電影出現分類的位置賦值1 for i in range(df.shape[0]): zero_df.loc[i,temp_list[i]] = 1 # print(zero_df.head(1)) genre_count = zero_df.sum(axis=0) print(genre_count) #排序 genre_count = genre_count.sort_values() _x = genre_count.index _y = genre_count.values #畫圖 plt.figure(figsize=(15,6),dpi=80) plt.bar(range(len(_x)),_y,width=0.4,color="orange") plt.xticks(range(len(_x)),_x) plt.title('電影分類統計圖',fontproperties=my_font) plt.show()
# -*- coding: utf-8 -*- """ @Datetime: 2018/11/19 @Author: Zhang Yafei """ """ 如今咱們有一組關於全球星巴克店鋪的統計數據,若是我想知道美國的星巴克數量和中國的哪一個多,或者我想知道中國每一個省份星巴克的數量的狀況,那麼應該怎麼辦? 思路:遍歷一遍,每次加1 ??? """ import pandas as pd pd.set_option('display.max_columns', None) df = pd.read_csv('starbucks_store_worldwide.csv') # print(df.head(1)) # print(df.info()) grouped = df.groupby(by='Country') # print(grouped) # DataFrameGroupBy # 能夠進行遍歷 # for i,j in grouped: # print(i) # print('-'*100) # print(j) # print('*'*100) country_count = grouped['Brand'].count() # print(country_count['US']) # print(country_count['CN']) #統計中國每一個省份店鋪的數量 china_data = df[df.Country == 'CN'] china_grouped = china_data.groupby(by='State/Province').count()['Brand'] # print(china_grouped) #數據按照多個條件進行分組 brand_grouped = df['Brand'].groupby(by=[df['Country'],df['State/Province']]).count() # print(brand_grouped) # print(type(brand_grouped)) #數據按照多個條件進行分組,返回dataframe brand_grouped1 = df[['Brand']].groupby(by=[df['Country'],df['State/Province']]).count() brand_grouped2 = df.groupby(by=[df['Country'],df['State/Province']])[['Brand']].count() brand_grouped3 = df.groupby(by=[df['Country'],df['State/Province']]).count()[['Brand']] # print(brand_grouped1) # print(brand_grouped2) # print(brand_grouped3) #索引的方法和屬性 print(brand_grouped1) print(brand_grouped1.index)
import pandas as pd from matplotlib import pyplot as plt pd.set_option('display.max_columns', None) df = pd.read_csv('starbucks_store_worldwide.csv') df = df.groupby(by='Country').count()['Brand'].sort_values(ascending=False)[:10] _x = df.index _y = df.values #畫圖 plt.figure(figsize=(13,6),dpi=80) plt.bar(_x,_y) plt.show()
分組聚合二數組
import pandas as pd from matplotlib import pyplot as plt from matplotlib import font_manager my_font = font_manager.FontProperties(family='SimHei') pd.set_option('display.max_columns', None) df = pd.read_csv('starbucks_store_worldwide.csv') df = df[df['Country']=='CN'] print(df.head(1)) df = df.groupby(by='City').count()['Brand'].sort_values(ascending=False)[:25] _x = df.index _y = df.values #畫圖 plt.figure(figsize=(13,6),dpi=80) # plt.bar(_x,_y,width=0.3,color='orange') plt.barh(_x,_y,height=0.3,color='orange') # plt.xticks(_x,fontproperties=my_font) plt.yticks(_x,fontproperties=my_font) plt.show()
import pandas as pd from matplotlib import pyplot as plt pd.set_option('display.max_columns', None) df = pd.read_csv('books.csv') # print(df.info()) data = df[pd.notnull(df['original_publication_year'])] grouped = data.groupby(by='original_publication_year').count()['title'] # print(grouped) grouped1 = data.average_rating.groupby(by=data['original_publication_year']).mean() # print(grouped1) _x = grouped1.index _y = grouped1.values plt.figure(figsize=(15,6),dpi=80) plt.plot(range(len(_x)),_y) plt.xticks(range(len(_x))[::10],_x[::10].astype(int),rotation=45) plt.show()
import pandas as pd from matplotlib import pyplot as plt import numpy as np pd.set_option('display.max_columns',None) df = pd.read_csv('911.csv') # print(df.head(1)) # print(df.info()) #獲取分類 temp_list = df.title.str.split(':').tolist() cate_list = list(set([i[0] for i in temp_list])) # print(cate_list) #構造全爲0的數組 zeros_df = pd.DataFrame(np.zeros((df.shape[0],len(cate_list))),columns=cate_list) #賦值 for cate in cate_list: zeros_df[cate][df.title.str.contains(cate)] = 1 print(zeros_df) sum_ret = zeros_df.sum(axis=0) print(sum_ret)
示例二app
import pandas as pd from matplotlib import pyplot as plt import numpy as np pd.set_option('display.max_columns',None) df = pd.read_csv('911.csv') # print(df.head(1)) # print(df.info()) #獲取分類 temp_list = df.title.str.split(':').tolist() cate_list = [i[0] for i in temp_list] df['cate'] = pd.DataFrame(np.array(cate_list).reshape(df.shape[0],1)) print(df.groupby(by='cate').count()['title'])
實例一ide
# -*- coding: utf-8 -*- """ @Datetime: 2018/11/19 @Author: Zhang Yafei """ """ 統計出911數據中不一樣月份電話次數的變化狀況 """ import pandas as pd from matplotlib import pyplot as plt import numpy as np pd.set_option('display.max_columns',None) df = pd.read_csv('911.csv') df.drop_duplicates() df.timeStamp = pd.to_datetime(df.timeStamp) #時間字符串轉時間格式 df.set_index('timeStamp',inplace=True) #設置時間格式爲索引 # print(df.head()) #統計出911數據中不一樣月份電話次數 count_by_month = df.resample('M').count()['title'] print(count_by_month) #畫圖 _x = count_by_month.index _y = count_by_month.values plt.figure(figsize=(15,8),dpi=80) plt.plot(range(len(_x)),_y) plt.xticks(range(len(_x)),_x.strftime('%Y-%m-%d'),rotation=45) plt.show()
實例二字體
# -*- coding: utf-8 -*- """ @Datetime: 2018/11/19 @Author: Zhang Yafei """ """ 統計出911數據中不一樣月份不一樣類型的電話的次數的變化狀況 """ import pandas as pd from matplotlib import pyplot as plt import numpy as np pd.set_option('display.max_columns',None) df = pd.read_csv('911.csv') #把時間字符串轉化爲時間類型設置爲索引 df.timeStamp = pd.to_datetime(df.timeStamp) #添加列,表示分類 temp_list = df.title.str.split(':').tolist() cate_list = [i[0] for i in temp_list] df['cate'] = pd.DataFrame(np.array(cate_list).reshape(df.shape[0],1)) df.set_index('timeStamp',inplace=True) plt.figure(figsize=(15, 8), dpi=80) #分組 for group_name,group_data in df.groupby(by='cate'): #對不一樣的分類都進行繪圖 count_by_month = group_data.resample('M').count()['title'] # 畫圖 _x = count_by_month.index _y = count_by_month.values plt.plot(range(len(_x)),_y,label=group_name) plt.xticks(range(len(_x)), _x.strftime('%Y-%m-%d'), rotation=45) plt.legend(loc='best') plt.show()
實例三:pm2.5url
# -*- coding: utf-8 -*- """ @Datetime: 2018/11/19 @Author: Zhang Yafei """ """ 繪製美國和中國PM2.5隨時間的變化狀況 """ import pandas as pd from matplotlib import pyplot as plt pd.set_option('display.max_columns',None) df = pd.read_csv('PM2.5/BeijingPM20100101_20151231.csv') # print(df.head()) #把分開的時間字符串經過periodIndex的方法轉化爲pandas的時間類型 period = pd.PeriodIndex(year=df.year,month=df.month,day=df.day,hour=df.hour,freq='H') df['datetime'] = period print(df.head(10)) #把datetime設置爲索引 df.set_index('datetime',inplace=True) #進行降採樣 df = df.resample('7D').mean() #處理缺失值,刪除缺失數據 # data = df['PM_US Post'].dropna() # china_data = df['PM_Nongzhanguan'].dropna() data = df['PM_US Post'] china_data = df['PM_Nongzhanguan'] #畫圖 _x = data.index _y = data.values _x_china = china_data.index _y_china = china_data.values plt.figure(figsize=(13,8),dpi=80) plt.plot(range(len(_x)),_y,label='US_POST',alpha=0.7) plt.plot(range(len(_x_china)),_y_china,label='CN_POST',alpha=0.7) plt.xticks(range(0,len(_x_china),10),list(_x_china.strftime('%Y%m%d'))[::10],rotation=45) plt.show()