hadoop日誌分析

1、項目要求


  • 本文討論的日誌處理方法中的日誌,僅指Web日誌。事實上並無精確的定義,可能包含但不限於各類前端Webserver——apache、lighttpd、nginx、tomcat等產生的用戶訪問日誌,以及各類Web應用程序本身輸出的日誌。  


2、需求分析: KPI指標設計

 PV(PageView): 頁面訪問量統計
 IP: 頁面獨立IP的訪問量統計
 Time: 用戶每小時PV的統計
 Source: 用戶來源域名的統計
 Browser: 用戶的訪問設備統計

以下我着重分析瀏覽器統計

3、分析過程

一、 日誌的一條nginx記錄內容

222.68.172.190 - - [18/Sep/2013:06:49:57 +0000] "GET /images/my.jpg HTTP/1.1" 200 19939 
"http://www.angularjs.cn/A00n" 
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36"

二、對上面的日誌記錄進行分析

remote_addr: 記錄客戶端的ip地址, 222.68.172.190
remote_user: 記錄客戶端用戶名稱, -
time_local: 記錄訪問時間與時區, [18/Sep/2013:06:49:57 +0000]
request: 記錄請求的url與http協議, "GET /images/my.jpg HTTP/1.1"
status: 記錄請求狀態,成功是200, 200
body_bytes_sent: 記錄發送給客戶端的文件主體內容大小, 19939
http_referer: 用來記錄從哪個頁面鏈接訪問過來的, "http://www.angularjs.cn/A00n"
http_user_agent: 記錄客戶端瀏覽器的相關信息, "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36"

三、java語言分析上面一條日誌記錄(使用空格切分)

1 String line = "222.68.172.190 - - [18/Sep/2013:06:49:57 +0000] \"GET /images/my.jpg HTTP/1.1\" 200 19939 \"http://www.angularjs.cn/A00n\" \"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36\"";
2         String[] elementList = line.split(" ");
3         for(int i=0;i<elementList.length;i++){
4             System.out.println(i+" : "+elementList[i]);
5         }

測試結果:

01 0 : 222.68.172.190
02 1 : -
03 2 : -
04 3 : [18/Sep/2013:06:49:57
05 4 : +0000]
06 5 : "GET
07 6 : /images/my.jpg
08 7 : HTTP/1.1"
09 8 : 200
10 9 : 19939
11 10 : "http://www.angularjs.cn/A00n"
12 11 : "Mozilla/5.0
13 12 : (Windows
14 13 : NT
15 14 : 6.1)
16 15 : AppleWebKit/537.36
17 16 : (KHTML,
18 17 : like
19 18 : Gecko)
20 19 : Chrome/29.0.1547.66
21 20 : Safari/537.36"
四、實體Kpi類的代碼:
/**
 * Plain data holder for the fields of a single web access-log record.
 *
 * <p>All fields are kept as the raw strings taken from the log line; the
 * class performs no parsing or validation. Field and accessor names follow
 * the nginx log variable names (hence the underscores).
 */
public class Kpi {
    /** Client IP address. */
    private String remote_addr;
    /** Client user name; "-" in the log when absent. */
    private String remote_user;
    /** Access time (and time zone) as written in the log. */
    private String time_local;
    /** Requested URL. */
    private String request;
    /** HTTP response status; 200 means success. */
    private String status;
    /** Size of the response body sent to the client. */
    private String body_bytes_sent;
    /** Page the visitor came from. */
    private String http_referer;
    /** Browser information reported by the client. */
    private String http_user_agent;
    /** HTTP request method, e.g. GET or POST. */
    private String method;
    /** HTTP protocol version. */
    private String http_version;

    /** @return the HTTP request method */
    public String getMethod() {
        return method;
    }

    /** @param method the HTTP request method */
    public void setMethod(String method) {
        this.method = method;
    }

    /** @return the HTTP protocol version */
    public String getHttp_version() {
        return http_version;
    }

    /** @param http_version the HTTP protocol version */
    public void setHttp_version(String http_version) {
        this.http_version = http_version;
    }

    /** @return the client IP address */
    public String getRemote_addr() {
        return remote_addr;
    }

    /** @param remote_addr the client IP address */
    public void setRemote_addr(String remote_addr) {
        this.remote_addr = remote_addr;
    }

    /** @return the client user name */
    public String getRemote_user() {
        return remote_user;
    }

    /** @param remote_user the client user name */
    public void setRemote_user(String remote_user) {
        this.remote_user = remote_user;
    }

    /** @return the access time as written in the log */
    public String getTime_local() {
        return time_local;
    }

    /** @param time_local the access time as written in the log */
    public void setTime_local(String time_local) {
        this.time_local = time_local;
    }

    /** @return the requested URL */
    public String getRequest() {
        return request;
    }

    /** @param request the requested URL */
    public void setRequest(String request) {
        this.request = request;
    }

    /** @return the HTTP response status */
    public String getStatus() {
        return status;
    }

    /** @param status the HTTP response status */
    public void setStatus(String status) {
        this.status = status;
    }

    /** @return the size of the response body sent to the client */
    public String getBody_bytes_sent() {
        return body_bytes_sent;
    }

    /** @param body_bytes_sent the size of the response body sent to the client */
    public void setBody_bytes_sent(String body_bytes_sent) {
        this.body_bytes_sent = body_bytes_sent;
    }

    /** @return the referring page */
    public String getHttp_referer() {
        return http_referer;
    }

    /** @param http_referer the referring page */
    public void setHttp_referer(String http_referer) {
        this.http_referer = http_referer;
    }

    /** @return the client's user-agent string */
    public String getHttp_user_agent() {
        return http_user_agent;
    }

    /** @param http_user_agent the client's user-agent string */
    public void setHttp_user_agent(String http_user_agent) {
        this.http_user_agent = http_user_agent;
    }

    /** Renders every field as {@code name=value}, in declaration order. */
    @Override
    public String toString() {
        return "Kpi [remote_addr=" + remote_addr + ", remote_user="
                + remote_user + ", time_local=" + time_local + ", request="
                + request + ", status=" + status + ", body_bytes_sent="
                + body_bytes_sent + ", http_referer=" + http_referer
                + ", http_user_agent=" + http_user_agent + ", method=" + method
                + ", http_version=" + http_version + "]";
    }
}
五、kpi的工具類
01 package org.aaa.kpi;
02   
03 public class KpiUtil {
04     /***
05      * line記錄轉化成kpi對象
06      * @param line 日誌的一條記錄
07      * @author tianbx
08      * */
09     public static Kpi transformLineKpi(String line){
10         String[] elementList = line.split(" ");
11         Kpi kpi = new Kpi();
12         kpi.setRemote_addr(elementList[0]);
13         kpi.setRemote_user(elementList[1]);
14         kpi.setTime_local(elementList[3].substring(1));
15         kpi.setMethod(elementList[5].substring(1));
16         kpi.setRequest(elementList[6]);
17         kpi.setHttp_version(elementList[7]);
18         kpi.setStatus(elementList[8]);
19         kpi.setBody_bytes_sent(elementList[9]);
20         kpi.setHttp_referer(elementList[10]);
21         kpi.setHttp_user_agent(elementList[11] + " " + elementList[12]);
22         return kpi;
23     }
24 }

六、算法模型: 並行算法 

Browser: 用戶的訪問設備統計
– Map: {key:$http_user_agent,value:1}
– Reduce: {key:$http_user_agent,value:求和(sum)} 
七、map-reduce分析代碼


01 import java.io.IOException;
02 import java.util.Iterator;
03   
04 import org.apache.hadoop.fs.Path;
05 import org.apache.hadoop.io.IntWritable;
06 import org.apache.hadoop.io.Text;
07 import org.apache.hadoop.mapred.FileInputFormat;
08 import org.apache.hadoop.mapred.FileOutputFormat;
09 import org.apache.hadoop.mapred.JobClient;
10 import org.apache.hadoop.mapred.JobConf;
11 import org.apache.hadoop.mapred.MapReduceBase;
12 import org.apache.hadoop.mapred.Mapper;
13 import org.apache.hadoop.mapred.OutputCollector;
14 import org.apache.hadoop.mapred.Reducer;
15 import org.apache.hadoop.mapred.Reporter;
16 import org.apache.hadoop.mapred.TextInputFormat;
17 import org.apache.hadoop.mapred.TextOutputFormat;
18 import org.hmahout.kpi.entity.Kpi;
19 import org.hmahout.kpi.util.KpiUtil;
20   
21 import cz.mallat.uasparser.UASparser;
22 import cz.mallat.uasparser.UserAgentInfo;
23   
24 public class KpiBrowserSimpleV {
25   
26     public static class KpiBrowserSimpleMapper extends MapReduceBase 
27         implements Mapper<Object, Text, Text, IntWritable> {
28         UASparser parser = null;
29         @Override
30         public void map(Object key, Text value,
31                 OutputCollector<Text, IntWritable> out, Reporter reporter)
32                 throws IOException {
33             Kpi kpi = KpiUtil.transformLineKpi(value.toString());
34   
35             if(kpi!=null && kpi.getHttP_user_agent_info()!=null){
36                 if(parser==null){
37                     parser = new UASparser();
38                 }
39                 UserAgentInfo info = 
40                 parser.parseBrowserOnly(kpi.getHttP_user_agent_info());
41                 if("unknown".equals(info.getUaName())){
42                     out.collect(new Text(info.getUaName()), new IntWritable(1));
43                 }else{
44                     out.collect(new Text(info.getUaFamily()), new IntWritable(1));
45                 }
46   
47             }
48         }
49     }
50   
51     public static class KpiBrowserSimpleReducer extends MapReduceBase implements
52         Reducer<Text, IntWritable, Text, IntWritable>{
53   
54         @Override
55         public void reduce(Text key, Iterator<IntWritable> value,
56                 OutputCollector<Text, IntWritable> out, Reporter reporter)
57                 throws IOException {
58             IntWritable sum = new IntWritable(0);
59             while(value.hasNext()){
60                 sum.set(sum.get()+value.next().get());
61             }
62             out.collect(key, sum);
63         }
64     }
65     public static void main(String[] args) throws IOException {
66         String input = "hdfs://127.0.0.1:9000/user/tianbx/log_kpi/input";
67         String output ="hdfs://127.0.0.1:9000/user/tianbx/log_kpi/browerSimpleV";
68         JobConf conf = new JobConf(KpiBrowserSimpleV.class);
69         conf.setJobName("KpiBrowserSimpleV");
70         String url = "classpath:";
71         conf.addResource(url+"/hadoop/core-site.xml");
72         conf.addResource(url+"/hadoop/hdfs-site.xml");
73         conf.addResource(url+"/hadoop/mapred-site.xml");
74           
75         conf.setMapOutputKeyClass(Text.class);
76         conf.setMapOutputValueClass(IntWritable.class);
77           
78         conf.setOutputKeyClass(Text.class);
79         conf.setOutputValueClass(IntWritable.class);
80           
81         conf.setMapperClass(KpiBrowserSimpleMapper.class);
82         conf.setCombinerClass(KpiBrowserSimpleReducer.class);
83         conf.setReducerClass(KpiBrowserSimpleReducer.class);
84   
85         conf.setInputFormat(TextInputFormat.class);
86         conf.setOutputFormat(TextOutputFormat.class);
87   
88         FileInputFormat.setInputPaths(conf, new Path(input));
89         FileOutputFormat.setOutputPath(conf, new Path(output));
90   
91         JobClient.runJob(conf);
92         System.exit(0);
93     }
94   
95 }


八、輸出文件log_kpi/browerSimpleV內容

AOL Explorer 1
Android Webkit 123
Chrome 4867
CoolNovo 23
Firefox 1700
Google App Engine 5
IE 1521
Jakarta Commons-HttpClient 3
Maxthon 27
Mobile Safari 273
Mozilla 130
Openwave Mobile Browser 2
Opera 2
Pale Moon 1
Python-urllib 4
Safari 246
Sogou Explorer 157
unknown 4685

九、用R製作圖表


# Plot the per-browser counts as a bar chart.
library(ggplot2)

# Reads a headerless, comma-separated file with two columns.
# NOTE(review): Hadoop's TextOutputFormat separates key and value with a tab
# by default; if borwer.txt is the raw job output, use sep="\t" instead.
data <- read.table(file="borwer.txt", header=FALSE, sep=",")

names(data) <- c("borwer", "num")

# stat="identity" makes the bar height the value in `num` rather than a
# count of rows per browser.
ggplot(data, aes(x=borwer, y=num)) + geom_bar(stat="identity")



有待解決的問題

一、排除爬蟲和程序點擊,對抗做弊

解決的方法:在頁面上檢測鼠標是否移動。

二、瀏覽量 怎麼排除圖片

三、瀏覽量排除假點擊?

四、哪個搜索引擎訪問的?

五、點擊哪個keyword訪問的?

六、從哪個地方訪問的?

七、使用哪個瀏覽器訪問的?

相關文章
相關標籤/搜索