Data: a large collection of usernames, containing both normal and illegal ones. It comes as two txt files, legal_name.txt and ilegal_name.txt, as shown below.
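The original post shows the files in a screenshot, which is not reproduced here. Since the preprocessing code below reads each file with `pd.read_csv` and indexes a `username` column, each file is presumably a one-column CSV with a `username` header; a hypothetical sample:

```
username
alice2018
zhang_wei
aaa$$$111
```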
Extract the relevant features from the raw usernames and write them out in CSV format, with the feature columns plus a `label` column:

```python
# Data sets
import os
import pandas as pd

DATAPATH = "../dataset"

POS = os.path.join(DATAPATH, "legal_name.txt")
POS_OUTPUT = os.path.join(DATAPATH, "legal_name.csv")
NEG = os.path.join(DATAPATH, "ilegal_name.txt")
NEG_OUTPUT = os.path.join(DATAPATH, "ilegal_name.csv")


def process_org_data(input_data, output_data, label):
    # Read the raw file in 10,000-row chunks so large files fit in memory.
    reader = pd.read_csv(input_data, iterator=True)
    while True:
        try:
            train = reader.get_chunk(10000)
            train['username'] = train['username'].astype(str)
            train['username'] = train['username'].map(lambda x: x.strip())
            train['length'] = train['username'].apply(len)
            # ... further feature columns, elided in the original ...
            train['label'] = label
            train.to_csv(output_data, encoding='utf-8', mode='a',
                         index=False, header=False)
        except StopIteration:
            print("Iteration is stopped.")
            break


if __name__ == '__main__':
    process_org_data(POS, POS_OUTPUT, 1)   # legal usernames -> label 1
    process_org_data(NEG, NEG_OUTPUT, 0)   # illegal usernames -> label 0
```
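The feature computations themselves are elided above. As a rough sketch of what character-level username features might look like (the feature names and choices here are illustrative assumptions, not the post's actual feature set):

```python
import re

def extract_features(username):
    # Hypothetical character-level features for a username string.
    length = len(username)
    digits = sum(c.isdigit() for c in username)
    letters = sum(c.isalpha() for c in username)
    specials = length - digits - letters
    return {
        'length': length,
        'digit_ratio': digits / float(length) if length else 0.0,
        'letter_ratio': letters / float(length) if length else 0.0,
        'special_ratio': specials / float(length) if length else 0.0,
        # 1 if any character repeats three or more times in a row
        'has_long_repeat': int(bool(re.search(r'(.)\1{2,}', username))),
    }
```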
Merge the legal-user and illegal-user datasets, shuffle the result, and split it into train.csv and test.csv:
```python
pos_dataset = read_dataset(POS)
neg_dataset = read_dataset(NEG)

# Concatenate both classes, then shuffle the rows.
dataset = pd.concat([pos_dataset, neg_dataset])
dataset = dataset.sample(frac=1).reset_index(drop=True)

# First 200,000 shuffled rows form the training set, the rest the test set.
train_data = dataset.iloc[:200000, :]
test_data = dataset.iloc[200000:, :]

train_data.to_csv(os.path.join(DataPath, "train.csv"), index=False)
test_data.to_csv(os.path.join(DataPath, "test.csv"), index=False)
```
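`read_dataset` is not defined in the snippet; presumably it loads the feature CSVs written by `process_org_data`. A minimal sketch under that assumption (the column list is a placeholder, since the real feature set is elided):

```python
import pandas as pd

FEATURE_COLUMNS = ["username", "length", "label"]  # placeholder column list

def read_dataset(path):
    # process_org_data wrote the features next to the raw .txt files,
    # without a header row, so supply the column names explicitly.
    csv_path = path.replace(".txt", ".csv")
    return pd.read_csv(csv_path, names=FEATURE_COLUMNS)
```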
Next, train a small fully connected network on the extracted features:

```python
import pandas as pd
import os
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import numpy as np

tf.logging.set_verbosity(tf.logging.INFO)

DataPath = "../dataset"
TRAIN = os.path.join(DataPath, "train.csv")
TEST = os.path.join(DataPath, "test.csv")

COLUMNS = ["username", ..., "label"]  # full feature column list elided in the original

train_dataset = pd.read_csv(TRAIN, skipinitialspace=True, skiprows=1, names=COLUMNS)
test_dataset = pd.read_csv(TEST, skipinitialspace=True, skiprows=1, names=COLUMNS)

# Coerce every column except `username` to numeric.
for col in train_dataset.columns[1:]:
    train_dataset[col] = pd.to_numeric(train_dataset[col], errors='coerce')
for col in test_dataset.columns[1:]:
    test_dataset[col] = pd.to_numeric(test_dataset[col], errors='coerce')

# Columns 1-18 are the 18 features; column 19 is the label.
X_train = train_dataset.iloc[:, 1:19].values
y_train = train_dataset.iloc[:, 19].values
X_test = test_dataset.iloc[:, 1:19].values
y_test = test_dataset.iloc[:, 19].values


def build_model():
    # Simple MLP: 18 input features -> 64 -> 32 -> 16 -> 1 sigmoid unit.
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Dense(64, input_dim=18))
    # model.add(tf.keras.layers.BatchNormalization())
    model.add(tf.keras.layers.Activation('relu'))
    model.add(tf.keras.layers.Dense(32))
    # model.add(tf.keras.layers.BatchNormalization())
    model.add(tf.keras.layers.Activation('relu'))
    model.add(tf.keras.layers.Dense(16))
    # model.add(tf.keras.layers.BatchNormalization())
    model.add(tf.keras.layers.Activation('relu'))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    return model


if __name__ == '__main__':
    model_file = './my_model.h5'
    if os.path.isfile(model_file):
        print('model file detected. Loading.')
        model = tf.keras.models.load_model(model_file)
    else:
        print('No model file detected. Starting from scratch.')
        model = build_model()
        model.compile(loss='binary_crossentropy', optimizer="adam",
                      metrics=['accuracy'])
    model.fit(X_train, y_train, batch_size=100, epochs=1,
              validation_data=(X_test, y_test))
    model.save(model_file)
```

The trained model is saved as my_model.h5 and reaches roughly 90% accuracy.
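The imports pull in `confusion_matrix` and `classification_report`, but the snippet never uses them; an evaluation step along these lines presumably followed (a sketch, not the post's exact code):

```python
# Threshold the sigmoid outputs at 0.5 to obtain hard class predictions.
y_pred = (model.predict(X_test) > 0.5).astype(int).ravel()

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=['illegal', 'legal']))
```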
The helper below wraps the trained Keras model in a SavedModel with a `predict` signature that TensorFlow Serving can expose:

```python
def save_model_for_production(model, version, path='prod_models'):
    # Export the graph in inference mode (learning phase 0).
    tf.keras.backend.set_learning_phase(0)
    if not os.path.exists(path):
        os.mkdir(path)
    export_path = os.path.join(
        tf.compat.as_bytes(path),
        tf.compat.as_bytes(version))
    builder = tf.saved_model.builder.SavedModelBuilder(export_path)

    model_input = tf.saved_model.utils.build_tensor_info(model.input)
    model_output = tf.saved_model.utils.build_tensor_info(model.output)

    prediction_signature = (
        tf.saved_model.signature_def_utils.build_signature_def(
            inputs={'inputs': model_input},
            outputs={'output': model_output},
            method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME))

    with tf.keras.backend.get_session() as sess:
        builder.add_meta_graph_and_variables(
            sess=sess,
            tags=[tf.saved_model.tag_constants.SERVING],
            signature_def_map={
                'predict': prediction_signature,
            })
        builder.save()
```
Export the model for TensorFlow Serving:
export_path = "tf-model" save_model_for_production(model, "7", export_path)
Start tensorflow_model_server and point it at the exported model directory:

```bash
/serving/bazel-bin/tensorflow_serving/model_servers/tensorflow_model_server \
    --port=9000 --model_name=username --model_base_path=/data/model/tf-model
```
A gRPC client that converts a username into the same 18 features and queries the served model:

```python
#!/usr/bin/env python
# encoding: utf-8

"""
@version: v1.0
@author: zwqjoy
@contact: zwqjoy@163.com
@site: https://blog.csdn.net/zwqjoy
@file: client
@time: 2018/6/29 15:02
"""

from grpc.beta import implementations
import tensorflow as tf
from tensorflow_serving.apis import predict_pb2
from tensorflow_serving.apis import prediction_service_pb2
import numpy as np

tf.app.flags.DEFINE_string('server', '172.xxx.xxx.xxx:9000', 'PredictionService host:port')
tf.app.flags.DEFINE_string('username', 'demo_user', 'the username to classify')
FLAGS = tf.app.flags.FLAGS


def nametovec(username):
    # Recompute the same features used at training time.
    username = str(username)
    length = len(username)
    # ... further features, elided in the original ...
    return np.array([length, ...])


def main(_):
    host, port = FLAGS.server.split(':')
    channel = implementations.insecure_channel(host, int(port))
    stub = prediction_service_pb2.beta_create_PredictionService_stub(channel)

    # Send request.
    # See prediction_service.proto for gRPC request/response details.
    data = nametovec(FLAGS.username)
    data = data.astype(np.float32)

    request = predict_pb2.PredictRequest()
    request.model_spec.name = 'username'           # matches --model_name=username
    request.model_spec.signature_name = 'predict'  # matches the signature_def_map key
    request.inputs['inputs'].CopyFrom(
        tf.contrib.util.make_tensor_proto(data, shape=(1, 18)))
    result = stub.Predict(request, 10.0)  # 10 secs timeout
    print(result)


if __name__ == '__main__':
    tf.app.run()
```
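The printed `result` is a `PredictResponse` protobuf; the sigmoid score sits in its `outputs` map under the `'output'` key registered at export time. Since legal names were labeled 1, a score near 1 means the name looks legitimate (a sketch):

```python
# Pull the scalar sigmoid score out of the PredictResponse.
score = result.outputs['output'].float_val[0]
if score > 0.5:
    print('legal username (score=%.4f)' % score)
else:
    print('illegal username (score=%.4f)' % score)
```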