# The Optimization of the Adaboost

### 1. 對於 Adaboost error function 的推導

再回到我們上篇文章講到的 Adaboost 算法,我們要從 Adaboost 算法推導出 GBDT。首先回顧一下上篇文章的 Adaboost:主要思想就是把多個弱分類器組合起來,得到一個強分類器。第一次建樹的時候,每個樣本的權值都是相同的;之後的每一次訓練中,被分錯的樣本的權值會被放大,而被分對的樣本的權值會被縮小。訓練結束後會得到每個模型的 α,再根據每棵樹的 α 把各自的結果加權結合起來,就得到最終需要的結果。
def loadDataSet(filename):
    """Load a CSV data set and recode zeros in the third column as -1.

    :param filename: path of the file to read with ``pandas.read_csv``
    :return: the loaded DataFrame, where every 0 in the column at position 2
        (presumably the class label -- confirm against the data file) has
        been replaced by -1
    """
    dataMat = pd.read_csv(filename)
    # One vectorised assignment instead of a Python-level loop over the rows.
    zero_rows = (dataMat.iloc[:, 2] == 0).to_numpy()
    dataMat.iloc[zero_rows, 2] = -1
    return dataMat
def split_data(data_array, col, value):
    """Split a DataFrame into two partitions on one feature column.

    :param data_array: DataFrame to split
    :param col: positional index of the column to compare
    :param value: threshold the column is compared against
    :return: ``(array_1, array_2)`` -- the rows whose column value is
        ``>= value`` followed by the rows whose column value is ``< value``
    """
    feature = data_array.iloc[:, col]
    array_1 = data_array.loc[feature >= value, :]
    array_2 = data_array.loc[feature < value, :]
    return array_1, array_2
def getErr(data_array):
    """Return the total squared error of the last column.

    The value is the variance of the last column multiplied by the number
    of rows.

    :param data_array: DataFrame whose last column is evaluated
    :return: ``np.var(last column) * number of rows``
    """
    return np.var(data_array.iloc[:, -1]) * data_array.shape[0]
def regLeaf(data_array):
    """Return the leaf value for a partition: the mean of its last column.

    :param data_array: DataFrame whose last column is averaged
    :return: ``np.mean`` of the last column
    """
    return np.mean(data_array.iloc[:, -1])
以上四個函數分別負責加載數據、分割數據、計算方差(誤差)以及計算葉子節點的平均值;葉子的平均值其實就是該葉子擬合出來的輸出值。
def get_best_split(data_array, ops=(1, 4)):
    """Find the feature column and threshold that best split the data.

    Every distinct value of every column except the last is tried as a
    threshold. A candidate is kept only if both partitions returned by
    ``split_data`` have at least ``toln`` rows, and the candidate with the
    smallest summed ``getErr`` wins.

    :param data_array: DataFrame whose last column is the value being fitted
    :param ops: ``(tols, toln)`` -- the minimum error reduction a split must
        achieve and the minimum number of rows allowed in each partition
    :return: ``(col, value)`` for the best split, or ``(None, leaf_value)``
        when the node should become a leaf (``leaf_value`` comes from
        ``regLeaf``)
    """
    tols, toln = ops[0], ops[1]

    # A single distinct target value cannot be improved by splitting.
    if len(set(data_array.iloc[:, -1])) == 1:
        return None, regLeaf(data_array)

    n_cols = data_array.shape[1]
    S = getErr(data_array)
    best_S = np.inf
    best_col = None
    best_value = None

    for col in range(n_cols - 1):
        for value in set(data_array.iloc[:, col]):
            array_1, array_2 = split_data(data_array, col, value)
            if (array_1.shape[0] < toln) or (array_2.shape[0] < toln):
                continue
            totalError = getErr(array_1) + getErr(array_2)
            if totalError < best_S:
                best_col = col
                best_value = value
                best_S = totalError

    # No candidate met the size constraint: say so explicitly instead of
    # re-splitting on placeholder column/value defaults.
    if best_col is None:
        return None, regLeaf(data_array)
    # The best split does not reduce the error enough to be worth making.
    if (S - best_S) < tols:
        return None, regLeaf(data_array)
    return best_col, best_value
複製代碼
def treeCast(tree, inData):
    """Predict the value for one sample by walking the tree down to a leaf.

    :param tree: node exposing ``results``, ``col``, ``value``, ``gb`` and
        ``lb``; a node whose ``results`` is not None is treated as a leaf
    :param inData: a single sample supporting positional access via ``.iloc``
    :return: the ``results`` of the leaf that the sample reaches
    """
    node = tree
    while node.results is None:
        # `>=` mirrors the partition rule in split_data, so a sample equal to
        # the threshold follows the same branch it was trained in.
        # NOTE(review): assumes buildTree stores the `>=` partition as `gb`
        # -- confirm against buildTree.
        if inData.iloc[node.col] >= node.value:
            node = node.gb
        else:
            node = node.lb
    return node.results
def createForeCast(tree, testData):
    """Predict every row of ``testData`` with a single tree.

    :param tree: tree node passed through to ``treeCast``
    :param testData: DataFrame of samples, one per row
    :return: an ``(m, 1)`` ``numpy.matrix`` holding one prediction per row
    """
    m = len(testData)
    # np.asmatrix rather than np.mat: the np.mat alias was removed in NumPy 2.0.
    yHat = np.asmatrix(np.zeros((m, 1)))
    for i in range(m):
        yHat[i, 0] = treeCast(tree, testData.iloc[i])
    return yHat
以上兩個函數用建好的樹來做預測(分類):treeCast 對單個樣本求預測值,createForeCast 則對整個數據集逐行預測。
def GBDT_model(data_array, num_iter, ops=(1, 4)):
    """Fit a sequence of trees in which each tree fits the previous residual.

    The first tree is built on ``data_array`` itself. Every later tree is
    built on the same features paired with ``y - yHat``, and its predictions
    are added to ``yHat``. One progress line is printed per tree.

    :param data_array: DataFrame whose last column is the target
    :param num_iter: number of trees to build
    :param ops: options forwarded unchanged to ``buildTree``
    :return: ``(list_trees, yHat)`` -- the trees in build order and the
        accumulated ``(m, 1)`` prediction matrix (``None`` when ``num_iter``
        is 0)
    """
    x = data_array.iloc[:, 0:-1]
    # np.asmatrix rather than np.mat: the np.mat alias was removed in NumPy 2.0.
    y = np.asmatrix(data_array.iloc[:, -1]).T

    list_trees = []
    yHat = None
    for i in range(num_iter):
        print('the ', i, ' tree')
        if yHat is None:
            # First round: fit the original targets.
            train = data_array
        else:
            # Later rounds: fit what the ensemble still gets wrong.
            residual = y - yHat
            train = pd.DataFrame(np.hstack((np.asarray(x), np.asarray(residual))))
        tree = buildTree(train, ops)
        list_trees.append(tree)
        step = createForeCast(tree, x)
        yHat = step if yHat is None else yHat + step
    return list_trees, yHat
這裏只使用了迴歸問題的迴歸樹:每一輪用 x 對殘差 (y - s) 做擬合(s 為目前累積的預測值),再把擬合結果加到累積預測值上即可。接下來就是畫圖了:
def getwidth(tree):
    """Return the number of leaves under ``tree``.

    A node with neither a ``gb`` nor an ``lb`` child counts as one leaf.

    :param tree: node exposing ``gb`` and ``lb`` children
    :return: leaf count of the subtree rooted at ``tree``
    """
    if tree.gb is None and tree.lb is None:
        return 1
    return getwidth(tree.gb) + getwidth(tree.lb)
def getdepth(tree):
    """Return the depth of ``tree``, counting a lone leaf as depth 0.

    :param tree: node exposing ``gb`` and ``lb`` children
    :return: number of edges on the longest path from ``tree`` to a leaf
    """
    if tree.gb is None and tree.lb is None:
        return 0
    return max(getdepth(tree.gb), getdepth(tree.lb)) + 1
def drawtree(tree, jpeg='tree.jpg'):
    """Render ``tree`` onto a white canvas and save it as a JPEG file.

    The canvas is 100 pixels wide per leaf (``getwidth``) and 100 pixels
    high per level (``getdepth``) plus a 120-pixel margin.

    :param tree: root node of the tree to draw
    :param jpeg: path of the image file to write
    """
    w = getwidth(tree) * 100
    h = getdepth(tree) * 100 + 120

    img = Image.new('RGB', (w, h), (255, 255, 255))
    draw = ImageDraw.Draw(img)

    # Start at the horizontal centre, 20 pixels below the top edge.
    drawnode(draw, tree, w / 2, 20)
    img.save(jpeg, 'JPEG')
def drawnode(draw, tree, x, y):
    """Draw ``tree`` with its root at ``(x, y)``, then recurse into children.

    A node whose ``results`` is None is drawn as a ``col:value`` label with
    a red link to each child, 100 pixels further down. Any other node is
    drawn as the text of its ``results``.

    :param draw: drawing object providing ``text`` and ``line``
    :param tree: node exposing ``results``, ``col``, ``value``, ``lb``, ``gb``
    :param x: horizontal position of this node
    :param y: vertical position of this node
    """
    if tree.results is not None:
        draw.text((x - 20, y), str(tree.results), (0, 0, 0))
        return

    # Horizontal space needed by each branch (100 pixels per leaf).
    w1 = getwidth(tree.lb) * 100
    w2 = getwidth(tree.gb) * 100

    # Total span of this node, centred on x.
    left = x - (w1 + w2) / 2
    right = x + (w1 + w2) / 2

    # Each child sits in the middle of its own share of the span.
    lb_x = left + w1 / 2
    gb_x = right - w2 / 2
    child_y = y + 100

    # Condition label for this node.
    draw.text((x - 20, y - 10), str(tree.col) + ':' + str(tree.value), (0, 0, 0))

    # Links from this node down to both children.
    draw.line((x, y, lb_x, child_y), fill=(255, 0, 0))
    draw.line((x, y, gb_x, child_y), fill=(255, 0, 0))

    # `lb` is drawn on the left and `gb` on the right.
    drawnode(draw, tree.lb, lb_x, child_y)
    drawnode(draw, tree.gb, gb_x, child_y)
最後就是運行主函數了:
if __name__ == '__main__':
    # Build and render one regression tree, then fit a 10-tree GBDT model on
    # the same data and print its predictions.
    data = loadDataSet('../Data/LogiReg_data.txt')
    tree = buildTree(data)
    drawtree(tree, jpeg='treeview_cart.jpg')

    gbdt_results, y = GBDT_model(data, 10)
    print(y)

    # Flatten the (m, 1) prediction matrix so each score is a plain scalar.
    # The sign of the score picks the class; a score of exactly 0 prints
    # nothing.
    for score in np.asarray(y).ravel():
        if score > 0:
            print('1')
        elif score < 0:
            print('0')
複製代碼