數據分析——numpy

時間 2019-12-05

標籤數據分析 numpy 简体版

原文原文鏈接

DIKW

DATA-->INFORMATION-->KNOWLEDGE-->WISDOM

數據-->信息-->知識-->智慧

爬蟲-->數據庫-->數據分析-->機器學習

信息：經過某種方式組織和處理數據，分析數據間的關係，數據就有了意義
知識：若是說數據是一個事實的集合，從中能夠得出關於事實的結論。那麼知識（Knowledge）就是信息的集合，它使信息變得有用。知識是對信息的應用，是一個對信息判斷和確認的過程，這個過程結合了經驗、上下文、詮釋和檢討。知識能夠回答「如何？」的問題，能夠幫助咱們建模和仿真
智慧：智慧可以簡單地概括爲做出正確判斷和決定的能力，包括對知識的最佳使用。智慧能夠回答「爲何」的問題。例如，根據故障對客戶業務的影響，能夠識別出改進點

數學

微積分

1 # import math
2 # s = 0
3 # for i in range(1, 1001):
4 #     x = (math.pi / 1000) * i
5 #     y = math.sin((math.pi / 1000) * i)
6 #     s = (math.pi / 1000) * y + s
7 # print(s)

 1 # import numpy as np
 2 # def sin_integral(l,r,p):
 3 #     sum_result = 0
 4 #     delta = (r - l) / p
 5 #     for i in range(p):
 6 #         left = l + i * delta
 7 #         delta_area = delta * np.sin(left)
 8 #         sum_result += delta_area
 9 #     return sum_result
10 # print(sin_integral(0.0,np.pi,100000))

numpy

 1 # coding=utf-8
 2 import numpy as np
 3 import matplotlib.pyplot as pt
 4 
 5 # x的3次方
 6 # X = np.linspace(-100, 100, 100)
 7 # Y = X * X * X
 8 
 9 # tan
10 # X = np.linspace(-np.pi / 2, np.pi / 2, 1000)
11 # Y = np.tan(X)
12 
13 # log
14 # X = np.linspace(0.1, 10, 100)    # log 只對正數有定義
15 # Y = np.log(X)
16 #
17 # pt.plot(X, Y)
18 # pt.show()
19 
20 # -----------------------------------------------------------------------
21 
22 # 雞兔同籠
23 # for x in range(36):
24 #     y = 35 - x
25 #     if x+2*y == 47:
26 #         print(x,y)
27 
28 # sinx面積0-pi
29 # import math
30 # s = 0
31 # for i in range(1, 1001):
32 #     x = (math.pi / 1000) * i
33 #     y = math.sin((math.pi / 1000) * i)
34 #     s = (math.pi / 1000) * y + s
35 # print(s)
36 # 承上封裝爲函數
37 # import numpy as np
38 # def sin_integral(l,r,p):
39 #     sum_result = 0
40 #     delta = (r - l) / p
41 #     for i in range(p):
42 #         left = l + i * delta
43 #         delta_area = delta * np.sin(left)
44 #         sum_result += delta_area
45 #     return sum_result
46 # print(sin_integral(0.0,np.pi,100000))
47 
48 # --------------------------------------------------------
# --- ndarray creation and basic attributes ---------------------------------
# The commented-out print calls are left as ready-to-run examples; the
# single-argument parenthesised form parses on both Python 2 and Python 3.

# a = np.arange(18).reshape(3, 6)      # 2-D array (matrix)
a = np.arange(24).reshape(2, 3, 4)     # 3-D array: 2 layers of 3 rows x 4 columns
# print(a)
# print(a.ndim)                        # number of dimensions
# print(np.ndim([[1, 1], [2, 2]]))     # number of dimensions of a nested list
# print(a.dtype.name)                  # element type name (platform dependent, e.g. int32 or int64)
# print(a.size)                        # total number of elements
# print(a.itemsize)                    # size in bytes of a single element
# print(type(a))                       # type of the array object itself

b = np.array([[1.2, 2, 3], [4, 5, 6]])
# print(b.dtype)                       # float64: a single float promotes the whole array

c = np.array([[1, 1], [2, 2]], dtype=complex)
# print(c)
# print(c.dtype)                       # complex128

z = np.zeros((3, 4))
# print(z)                             # all zeros; float64 by default

o = np.ones((2, 3, 4), dtype=np.int16)
# print(o)
# A 3-D array of ones with an explicit dtype: two layers, each layer a
# 3-row by 4-column 2-D array.

e = np.empty((2, 3))
# print(e)                             # uninitialised memory: contents are arbitrary and vary per run

# f = np.arange(1, 9, 2)               # [1 3 5 7]             (step 2)
f = np.arange(0, 3, 0.5)
# print(f)                             # [0.  0.5 1.  1.5 2.  2.5]  (step 0.5)

# print(np.arange(10000))              # large arrays are abbreviated: only the head and tail are shown
# To print every element, raise the summarisation threshold. It must be a
# number (the string 'nan' is rejected by current NumPy):
# import sys
# np.set_printoptions(threshold=sys.maxsize)
# print(np.arange(10000).reshape(100, 100))

# NOTE: `a` and `b` are deliberately rebound here for the arithmetic examples.
a = np.array([20, 30, 40, 50])
b = np.arange(4)
# print(a - b)                         # element-wise subtraction
# print(b ** 2)                        # element-wise square
# print(10 * np.sin(a))                # sin is applied element-wise first, then scaled by 10

運算

階乘

math.factorial(100)    # 需先 import math；np.math 只是標準庫 math 的別名，已在 NumPy 2.0 移除

對數

np.log()

開方

1.準備每個條件的數據表示 2.準備程序的邏輯 3.將你的數據應用到邏輯 4.優化結構

1 # np.sqrt(3)    
2 
3 # A = (2, 7)
4 # B = (8, 3)  # 歐幾里得距離
5 # AB = np.sqrt((A[0] - B[0]) ** 2 + (A[1] - B[1]) ** 2)
6 # print AB

三角函數

np.arctan()
np.cos()
np.sin()
np.rad2deg()——弧度轉角度
np.deg2rad()——角度轉弧度
……

1 # x = np.array([3, 0]) + np.array([0, 3])
2 # x = np.array([3,3])
3 # l = np.linalg.norm(x)   #矢量x的範數(長度)
4 # h = np.arctan(3.0/3.0)  #計算弧度 π/4
5 # j = np.rad2deg(h)       #弧度轉角度 45度
6 # np.deg2rad()          #角度轉弧度
7 # print j

點乘

numpy數組 (矢量) 默認的 +-*/ 操作都是對應位置的元素逐一運算（element-wise），不是矩陣乘法或點乘

1 array1.dot(array2)
2 
3 # d1 = np.array([2, 7])
4 # d2 = np.array([8, 3])
5 # print d1.dot(d2)        #點乘(內積) 2*8+7*3 結果：實數

 1 # 餘弦類似度，向量內積，對應元素相乘再相加
 2 '''
 3 設兩個向量分別爲 a=(x1, y1)、b=(x2, y2)，
 4 其夾角爲 α，由於 a·b = |a||b|cosα，
 5 因此 cosα = a·b / (|a||b|) = (x1*x2 + y1*y2) / (sqrt(x1^2 + y1^2) * sqrt(x2^2 + y2^2))
 6 '''
 7 # d12 = d1.dot(d2)                    #d1·d2
 8 # d1_len = np.linalg.norm(d1)         #|d1|
 9 # d2_len = np.linalg.norm(d2)         #|d2|
10 # cosa = d12 / (d1_len * d2_len)      #餘弦值cosa
11 # a = np.rad2deg(np.arccos(cosa))     #角度a
12 # print a

複數

1 # a = 1 + 2j              #複數  complex
2 # b = 2 + 3j             #泰勒級數，傅里葉級數
3 # print a,type(a),a*b,a-b

1 # np.nan   #not a number 當數據讀取缺失或計算異常時會出現，本質是一個浮點數
2 # np.exp(10)  #以e爲底的指數
3 # np.log(10)    #以e爲底的對數，即ln
4 # np.e          #e,2.71828182
5 # np.inf          #無窮大

函數

空數組

np.empty 不會初始化元素，內容是內存中的任意殘值（常見爲 0 或極大、極小的數），使用前必須先賦值

實數在計算機裏只能用浮點數近似表示，不能確切表示，因此在比較是否爲 0 或是否相等時要格外當心：當 abs(a - b) < 1e-10（差的絕對值小於一個極小的數）時就認爲二者相等

np.empty((3, 3))

數組

矢量是有方向和長度的變量，能夠用numpy的多維數組來表示，二維矢量就是平面的一個點

1 np.array([[1,2,3],[4,5,6]])

範數

矢量的範數(長度)

np.linalg.norm(np.array([3,3]))

類型轉換

1 array.astype(int)    # np.int 已在 NumPy 1.24 移除，直接使用內建 int

數組信息

1 array.shape
2 array.shape[0]
3 array.shape[1]

# Subtracting two vectors gives their displacement; the norm of that
# displacement is the distance between the two points.
d1 = np.array([2, 7])
# d2 = np.array([8, 3])
# np.linalg.norm(d1 - d2)   # Euclidean distance between d1 and d2
# d1.astype(int)            # cast to int (the np.int alias was removed in NumPy 1.24)
# d1.shape                  # tuple of axis lengths; (2,) for this 1-D array
# d1.shape[0]               # length of the first axis (row count for a 2-D array)
# d1.shape[1]               # length of the second axis (column count); only valid for
#                           # arrays with >= 2 dimensions - raises IndexError for 1-D d1

均分

# np.linspace()

 1 # xs = np.linspace(-1000, 1000, 10000)
 2 # idx = []
 3 # max_result = []
 4 # for x in xs:
 5 #     y = -3 * (x ** 2) + 5 * x - 6
 6 #     idx.append(x)
 7 #     max_result.append(y)
 8 # print max(max_result),idx[max_result.index(max(max_result))]
 9 
10 # def poly_test(l,r):
11 #     r_len = r - l
12 #     max_num = l
13 #     m_idx = l
14 #     for i in range(r_len):
15 #         r_num = l + i
16 #         result = (r_num ** 2) * -3 + (5 * r_num) - 6
17 #         if result > max_num:
18 #             max_num = result
19 #             m_idx = i
20 #     return max_num,m_idx
21 # print poly_test(-10000,10000)

1 # 在X軸上生成20000個從-10000到10000的等距離散點
2 # 使用矢量計算直接生成對應上述多項式的全部結果，這裏沒有使用循環，一次計算了20000個結果
3 # X = np.linspace(-10000, 10000, 20000)
4 # Y = (X ** 2) * -3 + 5 * X - 6  # 矢量運算，計算機會加速此類運算
5 # Y.max()  # 獲取當前矢量的最大值
6 # Y.argmax()  # 獲取最大值所在的位置索引（不是 x 值；對應的 x 值爲 X[Y.argmax()]）

數組切片

二維數組

# 2-D slicing examples on a 5x5 array holding 0..24 in row-major order.
n_array = np.arange(25).reshape(5, 5)
# print(n_array)              # first index selects rows, second selects columns
# print(n_array[:, :2])       # first two columns
# print(n_array[:3, :])       # first three rows
# print(n_array[1:4, 1:4])    # rows 1-3 and columns 1-3 (0-based, end exclusive)
# print(n_array[2, 2])        # 3rd element of the 3rd row
# print(n_array[2][2])        # same element via chained indexing
# print(n_array[::-2])        # every other row, walking backwards from the last row
# print(n_array[::2])         # every other row, starting from the first row

三維數組

# 3-D slicing examples: 3 layers, each a 2-row by 3-column matrix.
n3_array = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [3, 2, 1]], [[6, 5, 4], [9, 8, 7]]])
# print(n3_array)                     # indices select layer, then row, then column
# print(n3_array[:, :, 2])            # last column of every layer
# print(n3_array[:, :1, :].sum())     # sum over the first row of every layer
# print(n3_array[:1, :, :].mean())    # mean of the first layer only
# print(n3_array[:, 1, :2].std())     # standard deviation (not variance) of the first two
#                                     # columns of each layer's second row; a smaller
#                                     # spread means more consistent values

數組元素選取

1 # d1 < 3                           #返回知足條件的布爾類型矩陣
2 # np.count_nonzero(d1 < 3)        #統計數組中小於3的元素個數
3 # d1[d1<3]                         #選出指定範圍的元素

學生成績案例

數據準備

# Load the score table from score.csv as integers. The analysis functions in
# this file index it as score_array[student, course]: one row per student and
# one column per course, in the order given by `courses`.
# Equivalent loader (pass the file name so NumPy opens and closes the file):
# score_array = np.loadtxt('score.csv', delimiter=',', dtype=int)
score_array = np.genfromtxt('score.csv', delimiter=',', dtype=int)
students = []
# Course names, indexed by column of score_array.
courses = ['數學', '語文', '化學', '地理', '音樂', '體育']

課程成績最好

 1 def course_score():
 2     course_score_max = 0
 3     cid_max = -1
 4     for c in range(6):
 5         course_score = score_array[:, c].sum()
 6         print course_score
 7         if course_score_max < course_score:
 8             course_score_max = course_score
 9             cid_max = c
10     return courses[cid_max], course_score_max

學生成績最好

 1 def student_score():
 2     student_score_max = 0
 3     sid_max = -1
 4     for s in range(6):
 5         student_score = score_array[s, :].sum()
 6         print '{}號學生成績：{}分'.format(s, student_score)
 7         if student_score_max < student_score:
 8             student_score_max = student_score
 9             sid_max = s
10     return '{}號學生成績最好，總分爲{}分'.format(sid_max, student_score_max)

學生偏科

 1 def pian():
 2     pian_max = 0
 3     pid_max = -1
 4     for p in range(6):
 5         student_score_std = score_array[p, :].std()
 6         print '{}號學生成績方差爲：{}'.format(p, student_score_std)
 7         if pian_max < student_score_std:
 8             pian_max = student_score_std
 9             pid_max = p
10     return '{}號學生偏科，方差爲：{}'.format(pid_max, pian_max)

主課成績最好

 1 def main_course_score():
 2     main_course_score_max = 0
 3     cid_max = -1
 4     for c in range(3):
 5         main_course_score = score_array[:, c].sum()
 6         print main_course_score
 7         if main_course_score_max < main_course_score:
 8             main_course_score_max = main_course_score
 9             cid_max = c
10     return cid_max, main_course_score_max

該班主課副課對比哪一個成績好

 1 def than():
 2     main_course_std = 0
 3     side_course_std = 0
 4     for t in range(3):
 5         main_course_std += score_array[:, t].std()
 6     main_course_std /= 3
 7     for t in range(3, 6):
 8         side_course_std += score_array[:, t].std()
 9     side_course_std /= 3
10     if main_course_std > side_course_std:
11         return '該班主課成績更好'
12     else:
13         return '該班副課成績更好'

這個班有多少學生出現了不及格

def bad(scores=None, passing=60):
    """List the students who failed at least one course.

    Args:
        scores: 2-D table indexed as [student, course]. Defaults to the
            module-level ``score_array``.
        passing: Minimum passing score; anything strictly below it fails.

    Returns:
        A formatted string containing the list of 0-based row indices of
        students with at least one failing score.
    """
    if scores is None:
        scores = score_array

    # A student fails when any score in the row is below the pass mark,
    # which is the same condition as "row minimum < passing". Every row is
    # checked, not only the first six.
    failed_rows = (np.asarray(scores) < passing).any(axis=1)
    # tolist() yields plain Python ints so the list formats as "[1, 2]".
    badstudent = np.flatnonzero(failed_rows).tolist()
    return '不及格學生：{}'.format(badstudent)

封裝成類

 1 name_dic = {0: '數學', 1: '語文', 2: '化學', 3: '地理', 4: '音樂', 5: '體育'}
 2 
 3 
 4 class CoursaDesc(object):
 5     def __init__(self):
 6         self.name = ''
 7         self.std = 0
 8         self.max = 0
 9         self.min = 0
10         self.mean = 0
11         self.num = 0
12 
13 
14 class ComputerDesc(object):
15     def __init__(self, n_array):
16         self.score_array = n_array
17         self.result = []
18 
19     def counter_all_coursa(self):
20         for i in range(6):
21             c_desc = CoursaDesc()
22             c_desc.name = name_dic[i]
23             c_desc.std = self.score_array[:, i].std()
24             c_desc.mean = self.score_array[:, i].mean()
25             c_desc.max = self.score_array[:, i].max()
26             c_desc.min = self.score_array[:, i].min()
27             c_desc.sum = self.score_array[:, i].sum()
28             self.result.append(c_desc)
29 
30     def best_coursa(self):
31         # std_list = [coursa.std for coursa in self.result]
32         # sum_list = [coursa.sum for coursa in self.result]
33         std_list = []
34         sum_list = []
35         for coursa in self.result:
36             std_list.append(coursa.std)
37             sum_list.append(coursa.sum)
38         std_array = np.array(std_list)
39         sum_array = np.array(sum_list)
40 
41         max_sum_coursa = sum_array.max()
42         max_sum_index = sum_array.argmax()
43 
44         min_std_coursa = std_array.min()
45         min_std_index = std_array.argmin()
46 
47         if max_sum_index == min_std_index:
48             return name_dic[max_sum_index]
49         else:
50             # 方差最小的課程的成績總和
51             min_std_coursa_sum = sum_array[min_std_index]
52             # 總和成績最大的課程的方差
53             max_sum_coursa_std = std_array[max_sum_index]
54 
55             sum_delta = max_sum_coursa - min_std_coursa_sum
56             std_delta = max_sum_coursa_std - min_std_coursa
57             sum_percent = sum_delta / max_sum_coursa
58             std_percent = std_delta / min_std_coursa
59             if sum_percent < 0.05 and std_percent > 0.2:
60                 return name_dic[min_std_index]
61 
62 if __name__ == '__main__':
63     c = ComputerDesc(score_array)
64     c.counter_all_coursa()
65     print c.best_coursa()

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。