# Prepared for use with Zabbix (Python).
# Parses an nginx access log to count per-interface hits, average
# response time, and the number of 4XX / 5XX responses.
# The algorithm can still be improved later.
#!/usr/bin/env python
# coding: utf-8
###################################
# User:chengang                   #
# Email:aguncn@163.com            #
# Date:2016-02-25                 #
###################################
import time
import datetime
import sys
import os
import os.path
import re
import json


class NginxLog(object):
    """Incremental nginx access-log analyzer emitting Zabbix-style JSON.

    Only the log lines appended since the previous run are processed (the
    read offset is persisted in ``seek_file``).  For every URL listed in
    ``interface_list`` five metrics are produced: hit count, average
    request time, and the number of 2xx / 4xx / 5xx responses.
    """

    def __init__(self, log_file, interface_list, seek_file):
        # Path of the nginx access log to analyze.
        self.log_file = log_file
        # URLs (last path component, query string stripped) to report on.
        self.interface_list = interface_list
        # Temp file that stores the file offset between runs.
        self.seek_file = seek_file

    def jsonFormat(self, python_data):
        """Serialize *python_data* to pretty-printed JSON."""
        return json.dumps(python_data, indent=2)

    def hostname(self):
        """Return the local host name on Windows ('nt') or POSIX systems."""
        # BUGFIX: the original bound this value to a local named `sys`,
        # shadowing the imported `sys` module.
        os_family = os.name
        if os_family == 'nt':
            return os.getenv('computername')
        elif os_family == 'posix':
            pipe = os.popen('echo $HOSTNAME')
            try:
                # BUGFIX: strip the trailing newline echo appends; it used
                # to leak into the generated JSON 'host' field.
                return pipe.read().strip()
            finally:
                pipe.close()
        else:
            return 'Unknown hostname'  # BUGFIX: was misspelled 'Unkwon'

    def writeSeek(self, seek):
        """Persist a timestamp and the given read offset to the seek file."""
        with open(self.seek_file, 'w') as f:
            f.write(time.strftime("%Y-%m-%d %H:%M:%S",
                                  time.localtime(time.time())) + "\n")
            f.write(str(seek) + "\n")

    def LogRead(self):
        """Return the log text appended since the last run.

        On the first run (or after the seek file was deleted) the whole
        file is read.  If the stored offset is beyond the current end of
        file, the log has been rotated and reading restarts from byte 0.
        The new offset is persisted before returning.
        """
        if os.path.exists(self.seek_file):
            with open(self.seek_file) as f:
                # Line 0 is a human-readable timestamp, line 1 the offset.
                seek_old = int(f.readlines()[1].strip())
        else:
            seek_old = 0
        with open(self.log_file) as f:
            f.seek(0, 2)  # whence=2: jump to EOF to learn the current size
            seek_now = f.tell()
            if seek_now >= seek_old:
                # Read only what was appended since the stored offset.
                f.seek(seek_old, 0)
                chunk = f.read(seek_now - seek_old)
            else:
                # Offset went backwards: the log was rotated, start over.
                f.seek(0, 0)
                chunk = f.read(seek_now)
        # BUGFIX: this call was commented out in the original, so the
        # stored offset never advanced and every run re-read the whole log.
        self.writeSeek(seek_now)
        return chunk

    def LogStatistics(self):
        """Parse the fresh log lines and return the Zabbix JSON payload.

        Returns a JSON array; each element looks like
        ``{"ns": ..., "clock": ..., "host": ..., "key": "<url>_<metric>",
        "value": ...}`` and five elements (count, avg_request_time,
        2xx, 4xx, 5xx) are emitted per monitored URL.
        """
        # Regex for one nginx access-log line; update these fields if the
        # log format changes.  Sample line:
        # 10.25.162.22 - - [24/Feb/2016:14:09:25 +0800] "GET / HTTP/1.0"
        #   "200" 612 "-" "-" "-" "-" - 846 54 0.000
        field_remote_addr = r"?P<l_remote_addr>.*"
        field_remote_user = r"?P<l_remote_user>-"
        field_time_local = r"?P<l_time_local>\[.*\]"
        field_request = r"?P<l_request>\"[^\"]*\""
        field_status = r"?P<l_status>\"[^\"]*\""
        field_body_bytes_sent = r"?P<l_body_bytes_sent>\d+"
        field_http_refere = r"?P<l_http_refere>\"[^\"]*\""
        field_http_user_agent = r"?P<l_http_user_agent>\"[^\"]*\""
        # field_http_x_fowarded_for is absent from this log format sample.
        field_all_cookie = r"?P<l_all_cookie>\"[^\"]*\""
        field_gzip_ratio = r"?P<l_gzip_ratio>\"[^\"]*\""
        field_upstream_addr = r"?P<l_upstream_addr>.*"
        field_bytes_sent = r"?P<l_bytes_sent>\d+"
        field_request_length = r"?P<l_request_length>\d+"
        field_request_time = r"?P<l_request_time>.*"
        nginxlog_pattern = re.compile(
            r"(%s)\s-\s(%s)\s(%s)\s(%s)\s(%s)\s(%s)\s(%s)\s(%s)\s(%s)\s(%s)\s(%s)\s(%s)\s(%s)\s(%s)"
            % (field_remote_addr, field_remote_user, field_time_local,
               field_request, field_status, field_body_bytes_sent,
               field_http_refere, field_http_user_agent, field_all_cookie,
               field_gzip_ratio, field_upstream_addr, field_bytes_sent,
               field_request_length, field_request_time),
            re.VERBOSE)

        result_list = []
        # 'ns' is not used by the statistics themselves; it is a
        # placeholder field required by the Zabbix sender payload.
        time_ns = datetime.datetime.now().microsecond
        # BUGFIX: was int(str(time.time())[0:10]) -- fragile string slicing.
        time_stamp = int(time.time())
        host_name = self.hostname()
        # BUGFIX: read the log exactly once.  The original called
        # LogRead() once per URL, re-scanning the file N times (and it
        # would return nothing after the first call now that the seek
        # offset is actually persisted).
        log_lines = self.LogRead().split('\n')
        metrics = ('count', 'avg_request_time', '2xx', '4xx', '5xx')
        for interface_item in self.interface_list:
            values = dict.fromkeys(metrics, 0)
            hit_url_count = 0
            for line in log_lines:
                line_match = nginxlog_pattern.match(line)
                if line_match is None:
                    continue
                groups = line_match.groups()
                # Keep only the last path component of the request URL,
                # with the query string stripped.
                request_url = groups[3].split()[1].split('?')[0].split('/')[-1]
                if request_url != interface_item:
                    continue
                status_code = groups[4].strip('"')
                hit_url_count += 1
                values['count'] += 1
                values['avg_request_time'] += float(groups[13])
                if status_code.startswith('2'):
                    values['2xx'] += 1
                if status_code.startswith('4'):
                    values['4xx'] += 1
                if status_code.startswith('5'):
                    values['5xx'] += 1
            # Turn the accumulated total into an average request time.
            if values['avg_request_time'] != 0:
                values['avg_request_time'] /= hit_url_count
            # One Zabbix item per metric, e.g.
            # {"ns": ..., "clock": ..., "host": ..., "key": "<url>_count",
            #  "value": 1}
            for metric in metrics:
                result_list.append({
                    'ns': time_ns,
                    'clock': time_stamp,
                    'host': host_name,
                    'key': '%s_%s' % (interface_item, metric),
                    'value': values[metric],
                })
        return self.jsonFormat(result_list)

    def resultOutput(self):
        # Placeholder: delivery of the result is not implemented yet.
        pass


def main():
    """Collect nginx statistics for the configured URLs and print the JSON."""
    # URLs to collect statistics for.
    interface_list = ['getIndexData', 'getRealTimeDatas', 'hehel']
    # nginx access-log location.
    log_file = 'd:\\demo\\sample.log'
    # Temp file holding the read offset between runs.
    seek_file = 'd:\\demo\\log_check_seek.tmp'
    nginx_log = NginxLog(log_file, interface_list, seek_file)
    # BUGFIX: `print x` is Python-2-only; print(x) behaves identically
    # for a single argument on both Python 2 and 3.
    print(nginx_log.LogStatistics())


if __name__ == "__main__":
    main()