Simple log statistics do not require a heavyweight tool like Hadoop, so I implemented log statistics in Python. The idea is to use fabric to log in to the remote Linux hosts and combine grep, uniq, sort and awk to process the logs: entries matching a given regular expression are extracted, and can then be queried, counted, or grouped into per-category statistics.
Note: the fabric library must be installed.
Main file: LogQuery.py
#encoding=utf-8
from fabric.api import run,env,local,cd
from fabric.tasks import execute,abort
from fabric.contrib.console import confirm
import logging

logging.basicConfig(format='[%(levelname)s]: %(message)s', level=logging.DEBUG)
logger = logging.getLogger(__name__)
logging.getLogger('paramiko.transport').setLevel(logging.ERROR)
logger.setLevel(logging.DEBUG)

EXECUTE_RESULT = {}


def hosts(hostarr):
    '''
    set hosts
    hostarr: [(hostname,password),(hostname,password)...]
    '''
    env.hosts = [x[0] for x in hostarr]
    env.passwords = dict(x for x in hostarr)


def query(expression,hostname,logfile,unique=True,sort=None,output=None,pattern=None,path=None):
    '''
    expression: regex rule
    hostname: a hostname as specified in hosts()
    logfile: log file name, wildcard supported, eg: *.log
    unique: whether results are de-duplicated
    sort: 1 (ASC) or -1 (DESC), default None
    output: None or a file name; default None means print to stream
    pattern: group pattern, default None means group '\1'
    path: cd to path before execution
    '''
    if not path:
        path = r'.'
    cmd_str = generate_cmd(expression,logfile,unique,sort,output,pattern)
    execute(executor,hostname,cmd_str,path,host=hostname)
    result = EXECUTE_RESULT[hostname]
    return result


def aggregate(expression,hostname,logfile,output=None,pattern=None,path=None):
    '''
    expression: regex rule
    hostname: a hostname as specified in hosts()
    logfile: log file name, wildcard supported, eg: *.log
    output: None or a file name; default None means print to stream
    pattern: group pattern, default None means group '\1'
    path: cd to path before execution
    '''
    if not path:
        path = r'.'
    cmd_str = generate_cmd(expression,logfile,False,None,output,pattern,True,True)
    execute(executor,hostname,cmd_str,path,host=hostname)
    result = EXECUTE_RESULT[hostname]
    return result


def count(expression,hostname,logfile,unique=True,sort=None,output=None,pattern=None,path=None):
    '''
    expression: regex rule
    hostname: a hostname as specified in hosts()
    logfile: log file name, wildcard supported, eg: *.log
    unique: whether results are de-duplicated
    sort: 1 (ASC) or -1 (DESC), default None
    output: None or a file name; default None means print to stream
    pattern: group pattern, default None means group '\1'
    path: cd to path before execution
    '''
    if not path:
        path = r'.'
    cmd_str = generate_cmd(expression,logfile,unique,sort,output,pattern,True)
    execute(executor,hostname,cmd_str,path,host=hostname)
    result = EXECUTE_RESULT[hostname]
    if result:
        result = int(result[0])
    return result


def executor(hostname,cmd_str,path=None):
    '''
    executor, called by execute()
    '''
    if not path:
        path = r'.'
    with cd(path):
        res = run(cmd_str,quiet=True)
        logger.debug('Command: %s:%s > %s'%(hostname,path,cmd_str))
        logger.debug('Command Execute Successful:%s, Failure:%s'%(res.succeeded,res.failed))
        EXECUTE_RESULT[hostname] = res.splitlines()


def generate_cmd(expression,logfile,unique=True,sort=None,output=None,pattern=None,count=False,aggregate=False):
    '''
    generate the shell command from the given options
    '''
    if not pattern:
        pattern = r'\1'
    if aggregate:
        aggregate = '''| awk '{a[$1]++}END{for (j in a) print j","a[j]}' '''
        unique = False
        sort = False
        count = False
    else:
        aggregate = ''
    if not unique:
        unique = ''
    else:
        unique = '| uniq'
    if sort:
        if sort == 1:
            sort = '| sort'
        elif sort == -1:
            sort = '| sort -r'
        else:
            sort = ''
    else:
        sort = ''
    if count:
        count = '| wc -l'
    else:
        count = ''
    if output:
        output = '>%s'%output
    else:
        output = ''
    cmd_str = '''cat %s | grep "%s" | sed 's/%s/%s/g' %s %s %s %s %s'''%(logfile,expression,expression,pattern,unique,sort,count,output,aggregate)
    return cmd_str
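To make the generated pipeline concrete, here is a minimal sketch (it assumes LogQuery.py is on the import path and that fabric is installed, since importing LogQuery pulls in fabric.api) that only calls generate_cmd() and prints the resulting shell command, without logging in to any host:

#encoding=utf-8
import LogQuery

# The same extraction rule used in the examples below; group 2 captures the domain.
expr = '\(.*crawled http:\/\/\)\([^\/]*\)\(\/.*\)'
cmd = LogQuery.generate_cmd(expr, 'gcrawler.*.log', unique=True, sort=None,
                            output=None, pattern=r'\2')
print cmd
# Prints roughly:
# cat gcrawler.*.log | grep "<expr>" | sed 's/<expr>/\2/g' | uniq

Calling it with aggregate=True instead appends the awk associative-array step, which is what produces the per-domain counts in the aggregate() method.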
Suppose your log looks like this:
spider.A crawled http://www.163.com/abc.html
spider.A crawled http://www.yahoo.com/xyz.html
spider.B crawled http://www.baidu.com/mnq.html
other log no crawing infomation involved
spider.C crawled http://www.sina.com.cn/yyy.html
Usage example: test.py
#encoding=utf-8
import LogQuery

# Define multiple hosts: username@host plus the login password
myhosts = [('rootman@192.168.2.228','123'),('rootman@192.168.2.229','123'),('rootman@192.168.2.219','123')]
LogQuery.hosts(myhosts)

'''
Case 1:
Query which domains have been crawled. The query method returns every entry that matches the rule.
Expected result:
www.163.com
www.yahoo.com
...
'''
res = LogQuery.query('\(.*crawled http:\/\/\)\([^\/]*\)\(\/.*\)',myhosts[0][0],'gcrawler.*.log',unique=True,sort=None,output=None,pattern=r'\2',path='/home/workspace/Case/trunk/src/gcrawler/log')
'''
How to read the line above:
The first argument is the regular expression describing a crawl log entry, with groups (delimited by \( and \)) so the domain can be extracted; the second group captures the domain.
The second argument is the host whose logs should be queried.
The third argument is the log file name to analyse; * matches any characters.
The fourth argument, unique, controls whether returned entries are de-duplicated, e.g. several www.163.com entries in the log count only once.
The fifth argument, sort, controls whether the extracted entries are sorted: 1 ascending, -1 descending; None here means no sorting.
The sixth argument, output, can redirect the result to a file; None here means no output file.
The seventh argument, pattern, selects which regex group to extract; the default is the first group, and r'\2' here selects the second.
The eighth argument, path, is the directory on the remote system where the logs live.
The count and aggregate methods below take arguments with the same meanings as query.
'''
'''
Case 2:
Count how many distinct domains have been crawled. The count method returns the total number of matching entries.
Expected result: 4
...
'''
res = LogQuery.count('\(.*crawled http:\/\/\)\([^\/]*\)\(\/.*\)',myhosts[1][0],'gcrawler.*.log',unique=True,sort=None,output=None,pattern=r'\2',path='/home/workspace/Case/trunk/src/gcrawler/log')
'''
Case 3:
Count how many entries were crawled for each domain.
Returned result:
domain1,count
domain2,count
...
'''
res = LogQuery.aggregate('\(.*crawled http:\/\/\)\([^\/]*\)\(\/.*\)',myhosts[2][0],'gcrawler.*.log',output=None,pattern=r'\2',path='/home/workspace/Case/trunk/src/gcrawler/log')  # for grouped statistics there is no need to specify unique and sort

# Print the grouped statistics
total = 0
for i in res:
    domain,count = i.split(',')
    total += int(count)
    print domain,'=>',count
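If a figure across all three hosts is needed, the same count() call can simply be looped over the host list. A small follow-up sketch under the same assumptions as test.py (same hosts, regex, and log path; appended after the code above):

# Sum the per-host counts of distinct crawled domains returned by LogQuery.count().
grand_total = 0
for host, password in myhosts:
    n = LogQuery.count('\(.*crawled http:\/\/\)\([^\/]*\)\(\/.*\)',
                       host, 'gcrawler.*.log', unique=True, pattern=r'\2',
                       path='/home/workspace/Case/trunk/src/gcrawler/log')
    print host, 'distinct domains:', n
    grand_total += n or 0  # count() returns an empty list when nothing matched on that host
print 'all hosts combined:', grand_total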