# -*- coding: utf-8 -*-
import pymongo
import codecs,sys
from pymongo import MongoClient
import jieba
from gensim import corpora, models, similarities
import nltk
import jieba.analyse
from nltk.tokenize import word_tokenize
from pprint import pprint # pretty-printer
from flask import Flask,request
import trainkick
# Announce start-up on stdout before the Flask application object is built.
print("Flask server...")

# The WSGI application; its configuration is loaded from this very module.
app = Flask(__name__)
app.config.from_object(__name__)
# flatpages = FlatPages(app)

# Module-level placeholder for a question id, initialised to the empty string.
questid = ''
@app.route('/queryill')
def index():
    """Handle ``/queryill?uuid=<question id>``.

    Reads the ``uuid`` query parameter and hands it to ``queryit``.

    Returns:
        ``'success'`` once ``queryit`` has been called, or the response
        ``('missing uuid', 400)`` when the request carries no ``uuid``.
    """
    # A distinct local name is used on purpose: assigning to ``questid`` here
    # would only create a function-local variable shadowing the module-level one.
    question_id = request.args.get("uuid", '')
    if not question_id:
        # Nothing to look up; report the bad request instead of claiming success.
        return 'missing uuid', 400
    queryit(question_id)
    return 'success'
#reload(sys)
#sys.setdefaultencoding('utf-8')
#look2 = codecs.lookup("utf-8")
kickpath="/root/python/"
conn = MongoClient("xxx",27017)
db = conn.health
db.authenticate("xxx","xxx3")
dics=[]
dits={}
labels={}
count=1
mydoclist =[]
courses=[]
uuids=[]
content = db.kickchufang.find({})
for i in content:
line=str(i['chufang'].encode("gb2312"))
uuid=i['uuid']
uuids.append(uuid)
print uuid,line
courses.append(line)
courses_name = courses
#lib_texts =trainkick. pre_process_cn(courses)
#庫創建完成 -- 這部分可能數據很大,能夠預先處理好,存儲起來
dictionary = corpora.Dictionary.load(kickpath+'kick.dict')
print dictionary
lsi=models.LsiModel.load(kickpath+"kick.lsi")
index=similarities.MatrixSimilarity.load(kickpath+"kick.index")
def queryit(questionid):
# 要處理的對象登場
record = db.kickasking.find_one({'uuid': questionid})
if (record):
line = str(record['desc'].encode("gb2312"))
print line
questions = [line]
target_courses = questions # [u'石膏']
# print "target_courses",target_courses
target_text = trainkick.pre_process_cn(target_courses, low_freq_filter=False)
"""
test similary
"""
# 選擇一個基準數據
ml_course = target_text[0]
print "ml_course", ml_course
# 詞袋處理
ml_bow = dictionary.doc2bow(ml_course)
print "ml_bow", ml_bow
# 在上面選擇的模型數據 lsi 中,計算其餘數據與其的類似度
ml_lsi = lsi[ml_bow] # ml_lsi 形式如 (topic_id, topic_value)
sims = index[ml_lsi] # sims 是最終結果了, index[xxx] 調用內置方法 __getitem__() 來計算ml_lsi
# 排序,爲輸出方便
sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])
# 查看結果
print sort_sims[0:10] # 看下前10個最類似的,第一個是基準數據自身
print uuids[sort_sims[0][0]], sort_sims[0][1]
print uuids[sort_sims[1][0]], sort_sims[1][1]
print uuids[sort_sims[2][0]], sort_sims[2][1]
doc = [{"questid": questionid, "chufangid": uuids[sort_sims[0][0]], "score": float(sort_sims[0][1])}]
db.kickanswer.insert(doc)
# print sort_sims[1][1] #看下實際最類似的數據叫什麼
# print courses_name[sort_sims[2][0]] #看下實際最類似的數據叫什麼
# print courses_name[sort_sims[3][0]] #看下實際最類似的數據叫什麼
if __name__ == '__main__':
    # Start Flask's built-in server on all network interfaces, port 9000.
    app.run(host='0.0.0.0', port=9000)