import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time
# Load the IMDB movie table and pull the runtimes out as a NumPy array.
temp_data = pd.read_csv("./IMDB-Movie-Data.csv")
data = temp_data["Runtime (Minutes)"].values

# Histogram of the runtimes in 5-minute bins.
# Explicit bin edges are passed instead of a bin count: a count of
# (max - min) // 5 only produces 5-minute-wide bins when the range is an
# exact multiple of 5; otherwise the bars drift away from the x ticks.
# Using max + 5 as the stop value guarantees the last edge is >= the maximum.
x_ticks = range(int(data.min()), int(data.max()) + 5, 5)
num_bins = len(x_ticks) - 1  # number of bins actually drawn
plt.figure(figsize=(20, 8), dpi=80)
plt.hist(data, bins=list(x_ticks))
plt.xticks(x_ticks)
plt.grid(alpha=0.4, linestyle="--")
plt.show()
# Try merging the sparse head and tail of the distribution into single bins.
# The list holds bin edges (think of them as axis ticks): one wide bin from
# 66 to 81, 5-minute bins up to 166, then one wide bin ending at 192.
# Per the original note, 192 was chosen so that the maximum value 191 falls
# inside the last bin.
num_bins_2 = [66, *range(81, 171, 5), 192]
plt.figure(dpi=80, figsize=(20, 8))
plt.hist(data, bins=num_bins_2)
plt.xticks(num_bins_2)
plt.grid(linestyle="--", alpha=0.6)
plt.show()
# Split each movie's comma-separated genre string into a list of genre names
# and collect the distinct genre names.
temp_list = temp_data["Genre"].str.split(",")
genre_set = {genre for genres in temp_list for genre in genres}

# Build one indicator column per genre (1.0 if the movie has it, else 0.0),
# with one row per movie.
# str.get_dummies matches whole comma-separated tokens, so "Music" is no
# longer also counted for "Musical" the way a substring test with
# str.contains(genre) did. It also builds its own column index, so no
# unordered set is passed as DataFrame columns (newer pandas versions
# reject a set there).
count_frame = temp_data["Genre"].str.get_dummies(sep=",").astype(float)

# Number of movies per genre, sorted ascending, drawn as a bar chart.
sum_frame = count_frame.sum()
sum_frame = sum_frame.sort_values()
plt.figure(figsize=(20, 8), dpi=80)
plt.bar(sum_frame.index, sum_frame.values, width=0.6, color="orange")
plt.show()
# Load the worldwide Starbucks store table and group its rows by country.
read_data = pd.read_csv("./starbucks_store_worldwide.csv")
group_data = read_data.groupby(by="Country")
# group_data is a DataFrameGroupBy; iterating over it yields
# (group name, DataFrame) pairs, one per country.

# Stores per country: count the non-null "Brand" entries of every group.
# After grouping, Country is the row index, so the result is a Series.
# "Brand" is simply picked as a column assumed to have no missing values.
count_group_data = group_data["Brand"].count()
print(count_group_data.head())
# Grouping on several keys: number of stores in each province of China.
# The order of the keys matters: Country becomes the outer index level and
# State/Province the inner one.
group_data_1 = read_data.groupby(by=["Country", "State/Province"])
count_group_data_1 = group_data_1.count()  # DataFrame with a two-level row index
brand_counts = count_group_data_1["Brand"]
# Selecting the outer level "CN" leaves a Series indexed by State/Province.
data = brand_counts.loc["CN"]
plt.figure(dpi=80, figsize=(20, 8))
plt.bar(data.index, data.values, color="orange")
plt.show()
# The ten countries with the most stores.
stores_per_country = read_data.groupby(by="Country").count()["Brand"]
ranked = stores_per_country.sort_values(ascending=False)
data = ranked.iloc[:10]  # keep the first ten entries of the descending ranking
plt.figure(dpi=80, figsize=(20, 8))
plt.bar(data.index, data.values, width=0.5, color="orange")
plt.show()