def getTestWordClassId(wordJson, words, text):
    """Return the class id that best matches a tokenised test text.

    For every class in ``wordJson`` the ``'pab'`` values of the words that
    appear both in ``words`` and in that class's table are added up; the
    class with the strictly largest total wins.

    :param wordJson: mapping ``class id -> {word: {'pab': number}}`` holding
        the probability of each word occurring in that class.
    :param words: iterable of words of the test text.
    :param text: raw test text; not used by the computation, kept so that
        existing callers keep working.
    :return: the winning class id converted with ``int``. Falls back to ``1``
        when no class reaches a total greater than 0 (e.g. no word matches).
    """
    best_score = 0
    classId = 1  # default answer when nothing scores above zero
    for candidate, word_table in wordJson.items():
        score = sum(word_table[word]['pab'] for word in words if word in word_table)
        # Strict comparison: on a tie the class seen first is kept.
        if float(score) > float(best_score):
            best_score = score
            classId = candidate
    return int(classId)
if __name__ == "__main__":
    # Pre-process every raw document with segment_and_stop_word
    # (presumably word segmentation plus stop-word removal -- confirm
    # against that helper's definition).
    newDataSet = [segment_and_stop_word(doc) for doc in dataSet]
    vocabList = createVocabList(newDataSet)

    # 3. Probability of each word occurring in each class.
    classListSet = set(listClasses)
    wordJson = {
        label: setOfWords2Vec(label, vocabList, newDataSet, listClasses)
        for label in classListSet
    }

    testDataSet = newDataSet  # test set (the training documents are reused)

    # Pair every raw document with its positional id.
    trainTextAndIds = [
        {"id": idx, "text": raw_text} for idx, raw_text in enumerate(dataSet)
    ]

    # Classify each test document and keep the labelled class ("train_id")
    # next to the predicted one ("test_id").
    classResult = {}
    for idx, words in enumerate(testDataSet):
        entry = trainTextAndIds[idx]
        classResult[entry['id']] = {
            "train_id": listClasses[idx],
            "test_id": getTestWordClassId(wordJson, words, entry['text']),
            "id": entry['id'],
            "text": entry['text'],
        }
# 複製代碼 (web-page "copy code" button text; not part of the program)
# 計算召回率正確率 (computing recall and accuracy)
# 計算準確率召回率
def getRate(dataSet, classVec):
    """Compute per-class recall and accuracy from classification results.

    :param dataSet: mapping ``id -> record`` where every record has a
        ``'train_id'`` (labelled class) and a ``'test_id'`` (predicted class).
    :param classVec: iterable of all class ids to report on.
    :return: list with one dict per class, in ``classVec`` iteration order::

            {"tag_id": class id,
             "recall": TP / (TP + FN), 0.0 if the class has no samples,
             "accuracy": (TP + TN) / total, 0.0 if ``dataSet`` is empty,
             "row": {"TP": .., "FN": .., "FP": .., "TN": ..}}

        Ratios are rounded to 4 decimal places.
    :raises KeyError: if a record's ``train_id`` or mismatching ``test_id``
        is not contained in ``classVec``.
    """
    rates = {label: {'TP': 0, 'FN': 0, 'FP': 0, 'TN': 0} for label in classVec}

    for record in dataSet.values():
        actual = record['train_id']
        predicted = record['test_id']
        if actual == predicted:
            rates[actual]['TP'] += 1  # TP: positive predicted as positive
        else:
            rates[actual]['FN'] += 1  # FN: positive predicted as negative
            rates[predicted]['FP'] += 1  # FP: negative predicted as positive

    sample_count = len(dataSet)
    for row in rates.values():
        # TN: negative predicted as negative, i.e. every remaining sample.
        row['TN'] = sample_count - row['TP'] - row['FP'] - row['FN']

    accuracy_recall_list = []
    for label, row in rates.items():
        positives = row['TP'] + row['FN']
        total = row['TP'] + row['FP'] + row['TN'] + row['FN']
        accuracy_recall_list.append({
            "tag_id": label,
            "recall": 0.0 if positives == 0 else round(row['TP'] / positives, 4),
            # Guarded like recall so an empty dataSet does not divide by zero.
            "accuracy": 0.0 if total == 0 else round((row['TP'] + row['TN']) / total, 4),
            "row": row,
        })
    return accuracy_recall_list
if __name__ == "__main__":
    # Pre-process every raw document with segment_and_stop_word
    # (presumably word segmentation plus stop-word removal -- confirm
    # against that helper's definition).
    newDataSet = [segment_and_stop_word(doc) for doc in dataSet]
    vocabList = createVocabList(newDataSet)

    # 3. Probability of each word occurring in each class.
    classListSet = set(listClasses)
    wordJson = {
        label: setOfWords2Vec(label, vocabList, newDataSet, listClasses)
        for label in classListSet
    }

    testDataSet = newDataSet  # test set (the training documents are reused)

    # Pair every raw document with its positional id.
    trainTextAndIds = [
        {"id": idx, "text": raw_text} for idx, raw_text in enumerate(dataSet)
    ]

    # Classify each test document and keep the labelled class ("train_id")
    # next to the predicted one ("test_id").
    classResult = {}
    for idx, words in enumerate(testDataSet):
        entry = trainTextAndIds[idx]
        classResult[entry['id']] = {
            "train_id": listClasses[idx],
            "test_id": getTestWordClassId(wordJson, words, entry['text']),
            "id": entry['id'],
            "text": entry['text'],
        }

    # 4. Compute recall and accuracy for every class.
    accuracy_recall_list = getRate(classResult, set(listClasses))
# 複製代碼 (web-page "copy code" button text; not part of the program)