shell分析apache日誌

最近公司要做系統整體監控,因此我被分派寫關於apache日誌的分析,聽說公司每天的access_log最大高達10G【約8千萬行】,也不知道這個程序的處理效果如何。比較了PERL、sed、awk的速度後發現,基本上從資源消耗上講,perl對RSS\SHARE\MEM的消耗大於sed、awk,但速度與CPU消耗明顯小於二者。awk就不多說了,什麼都佔用很大,並且一個字「慢」。最後發現sed的處理速度與CPU消耗基本上與perl差不多,如處理1KW行數據時,perl耗時28秒,sed耗時42秒。因此最終選擇了sed暫時處理該日誌分析程序。
 
apache原始日誌格式如下:
 
2008.china.com.cn 172.16.20.73 - - [31/Jul/2008:16:52:05 +0800] "GET / HTTP/1.0" 304 - "-" "Wget/1.10.1 (Red Hat modified)"
==================================================================================
 
輸出如下:
 
start_time
client_request
client_kbyte_out
sys_http200   
sys_http304   
sys_http403   
sys_http404   
sys_http500   
sys_http503
end_time
取5分鐘內的平均值
==================================================================================
 
程序如下:
 
#!/bin/bash
#Program log.sh
#Version v1.2_3
#By IORI
#Create Date 2008-12-19 13:40
#Last Modify 2008-12-23 16:55
 
#################################################
interval=5    #####間隔時間
#interval=300 #####理論使用300秒較合適
MINUTE_LOG='./apache.tmp.log' ###截取5分鐘內臨時日誌文件
FINAL_LOG='./apache.final.log'###生成分析日誌文件
#################################################
##################HELP FUNCTION###################定義幫助函數
# Print the usage line for this script.
# BUG FIX: "Usage::" double colon corrected to "Usage:"; the message now goes
# to stderr, since help is only invoked on a usage error just before `exit 1`.
help()
{
 echo "Usage: $0  <Input_apache_log>" >&2
}
#################HELP FUNCTION END###############
 
#################TEST $1 VALUE################### require exactly one command-line argument
# Guard clause: anything other than one argument prints usage and exits 1;
# otherwise the single argument is the apache log to analyse.
if [ $# -ne 1 ]; then
    help
    exit 1
fi
APACHE_LOG=$1
#################################################
#################GET FILE STATUS#################################################
#FILE_STATUS_TIME=`stat -c %y $APACHE_LOG |awk -F '.' '{print $1}'`
#TIME_STRING=`date -d "$FILE_STATUS_TIME" +%s`
#################################################################################
#################PROGRAM START#####################################################
#################程序 開始##########################################################
 
if [ ! -f "$APACHE_LOG" ]; then # make sure the apache log actually exists
    # BUG FIX: the original message said "is exist" although this branch means
    # the file is MISSING.
    echo "Input file $APACHE_LOG does not exist" && exit 2
else
    while :; do
        rm -f "$MINUTE_LOG"

        # Seed timestamp: on the first pass take it from the log's first line,
        # turning "[31/Jul/2008:16:52:05" into "31 Jul 2008 16:52:05" so that
        # `date -d` can parse it.  Later iterations reuse the GET_TIME that was
        # set at the end of the previous window.
        GET_TIME=${GET_TIME:-$(head -n 1 "$APACHE_LOG" | awk -F '[' '{print $2}' | awk '{print $1}' | sed 's#/# #g' | sed 's#:# #')}

        AWK_MINUTE=5   # window length in minutes, divisor for the averages below

        # Try a 5-minute window first; if its end timestamp does not occur in
        # the log (the log can have gaps), shrink the window 60s at a time.
        for ((i = 300; i >= 0; i = i - 60)); do
            # "31 Jul 2008 16:52:05" -> "2008:16:52": grep pattern for window start
            DATE_MINUTE=$(date -d "$GET_TIME" +%s | awk '{print strftime("%Y:%H:%M", $0)}')
            # same, shifted i seconds forward: grep pattern for window end
            DATE_END_MINUTE=$(date -d "$GET_TIME" +%s | awk -v second="$i" '{print strftime("%Y:%H:%M", $0 + second)}')

            # Is the end-of-window minute present in the log at all?
            # (grep -q replaces the original `grep > /dev/null; [ $? == 0 ]`.)
            if grep -qi "$DATE_END_MINUTE" "$APACHE_LOG"; then
                if [ "$DATE_MINUTE" != "$DATE_END_MINUTE" ]; then
                    # Normal case: start and end fall in different minutes.
                    # Locate the first line of each minute and slice that range.
                    START_LINE=$(sed -n "/$DATE_MINUTE/=" "$APACHE_LOG" | head -n1)
                    END_LINE=$(sed -n "/$DATE_END_MINUTE/=" "$APACHE_LOG" | head -n1)
                    sed -n "${START_LINE},${END_LINE}p" "$APACHE_LOG" > "$MINUTE_LOG"

                    # Re-read the boundary lines to get their exact timestamps.
                    GET_START_TIME=$(sed -n "${START_LINE}p" "$APACHE_LOG" | awk -F '[' '{print $2}' | awk '{print $1}' | sed 's#/# #g' | sed 's#:# #')
                    GET_END_TIME=$(sed -n "${END_LINE}p" "$APACHE_LOG" | awk -F '[' '{print $2}' | awk '{print $1}' | sed 's#/# #g' | sed 's#:# #')
                    START_TIME=$(date -d "$GET_START_TIME" +%s | awk '{print strftime("%Y-%m-%d %H:%M:%S", $0)}')
                    END_TIME=$(date -d "$GET_END_TIME" +%s | awk '{print strftime("%Y-%m-%d %H:%M:%S", $0)}')

                    # BUG FIX: the original glued its comment onto the command
                    # ("...`#comment"), which appended the comment text to the
                    # variable's value.  `$((...))` also replaces `expr`.
                    NEXT_LINE=$(sed -n "$((END_LINE + 1))p" "$APACHE_LOG")   # first line after this window
                    GET_TIME=$(echo "$NEXT_LINE" | awk -F '[' '{print $2}' | awk '{print $1}' | sed 's#/# #g' | sed 's#:# #')
                    break   # window extracted, leave the shrink loop
                else
                    # Degenerate case: start minute == end minute, so the
                    # window covers a single minute's worth of log lines.
                    sed -n "/$DATE_END_MINUTE/p" "$APACHE_LOG" > "$MINUTE_LOG"
                    START_LINE=$(sed -n "/$DATE_END_MINUTE/=" "$APACHE_LOG" | head -n1)
                    END_LINE=$(sed -n "/$DATE_END_MINUTE/=" "$APACHE_LOG" | tail -n1)
                    GET_START_TIME=$(sed -n "${START_LINE}p" "$APACHE_LOG" | awk -F '[' '{print $2}' | awk '{print $1}' | sed 's#/# #g' | sed 's#:# #')
                    GET_END_TIME=$(sed -n "${END_LINE}p" "$APACHE_LOG" | awk -F '[' '{print $2}' | awk '{print $1}' | sed 's#/# #g' | sed 's#:# #')
                    START_TIME=$(date -d "$GET_START_TIME" +%s | awk '{print strftime("%Y-%m-%d %H:%M:%S", $0)}')
                    END_TIME=$(date -d "$GET_END_TIME" +%s | awk '{print strftime("%Y-%m-%d %H:%M:%S", $0)}')
                    NEXT_LINE=$(sed -n "$((END_LINE + 1))p" "$APACHE_LOG")
                    GET_TIME=$(echo "$NEXT_LINE" | awk -F '[' '{print $2}' | awk '{print $1}' | sed 's#/# #g' | sed 's#:# #')
                    AWK_MINUTE=1   # only one minute of data in this window
                    # BUG FIX: the original wrote "break#comment" with no space;
                    # bash parses that as an unknown command "break#...", so the
                    # loop never actually broke out of this branch.
                    break
                fi
            else
                # End minute absent (gap in the log): shrink the window.
                AWK_MINUTE=$((AWK_MINUTE - 1))
                continue
            fi
        done

        # One analysis file per calendar day (computed once per round instead
        # of once per statistics line).
        OUT_LOG="${FINAL_LOG}_$(date +%Y_%m_%d)"

        #################### analyse the extracted window ####################
        echo "start_time=$START_TIME" | tee -a "$OUT_LOG"
        # requests per second ($7 is the request method inside the quoted request line)
        awk -v minute="$AWK_MINUTE" '{if($7~/GET|HEAD|POST|PUT/) count++}END{printf "client_request=%.2f ",count/minute/60}' "$MINUTE_LOG" | tee -a "$OUT_LOG"
        # KB sent per second for 2xx/3xx responses ($10 status, $11 byte count)
        awk -v minute="$AWK_MINUTE" '{if($10~/20[0-6]|30[0-5]/)BYTE+=$11}END{printf "client_kbyte_out=%.4f KB ",BYTE/1024/minute/60}' "$MINUTE_LOG" | tee -a "$OUT_LOG"
        # per-second rates of the interesting HTTP status codes
        awk -v minute="$AWK_MINUTE" '{if($10~/200/) count++}END{printf "sys_http200=%.2f ",count/minute/60}' "$MINUTE_LOG" | tee -a "$OUT_LOG"
        awk -v minute="$AWK_MINUTE" '{if($10~/304/) count++}END{printf "sys_http304=%.2f ",count/minute/60}' "$MINUTE_LOG" | tee -a "$OUT_LOG"
        awk -v minute="$AWK_MINUTE" '{if($10~/403/) count++}END{printf "sys_http403=%.2f ",count/minute/60}' "$MINUTE_LOG" | tee -a "$OUT_LOG"
        awk -v minute="$AWK_MINUTE" '{if($10~/404/) count++}END{printf "sys_http404=%.2f ",count/minute/60}' "$MINUTE_LOG" | tee -a "$OUT_LOG"
        awk -v minute="$AWK_MINUTE" '{if($10~/500/) count++}END{printf "sys_http500=%.2f ",count/minute/60}' "$MINUTE_LOG" | tee -a "$OUT_LOG"
        awk -v minute="$AWK_MINUTE" '{if($10~/503/) count++}END{printf "sys_http503=%.2f ",count/minute/60}' "$MINUTE_LOG" | tee -a "$OUT_LOG"
        echo "end_time=$END_TIME" | tee -a "$OUT_LOG"
        echo -e ' ' | tee -a "$OUT_LOG"   # blank separator between windows
        ######################################################################

        # BUG FIX: the original glued a comment onto "$MINUTE_LOG#...", so rm
        # tried to delete a file literally named "apache.tmp.log#<comment>"
        # and the real temp file was never removed here.
        sleep "$interval" && rm -f "$MINUTE_LOG"
    done
fi
#################PROGRAM END###############################################

 
 
後記:在程序篩選過程中發覺,apache日誌有不連續的情況,因此按照一開始的 START=1,END=1+300,
然後 START=END,END=END+300 的思路無法使用了,折騰了半天寫了一個半殘廢的算法,大概實現了每5分鐘取一段日誌後統計分析。

相關文章
相關標籤/搜索