1. Project requirements
2. Requirements analysis: KPI metric design
PV (PageView): page-view statistics
IP: statistics on distinct visitor IPs per page
Time: per-hour PV statistics
Source: statistics on referring domains
Browser: statistics on visitors' browsers/devices
Below I focus on the browser statistic. (Roughly speaking, each KPI maps onto its own MapReduce key: the request URL for PV, $remote_addr for IP, the hour of $time_local for Time, the referrer domain for Source, and $http_user_agent for Browser.)
3. Analysis process
Step 1: One nginx log record
222.68.172.190 - - [18/Sep/2013:06:49:57 +0000] "GET /images/my.jpg HTTP/1.1" 200 19939 "http://www.angularjs.cn/A00n" "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36"
Step 2: Breaking the record down field by field
These are the fields of nginx's standard combined log format: $remote_addr - $remote_user [$time_local] "$request" $status $body_bytes_sent "$http_referer" "$http_user_agent".
remote_addr: the client IP address, e.g. 222.68.172.190
remote_user: the client user name (logged as "-" when absent), e.g. -
time_local: the access time and time zone, e.g. [18/Sep/2013:06:49:57 +0000]
request: the requested URL and HTTP protocol, e.g. "GET /images/my.jpg HTTP/1.1"
status: the request status; 200 means success, e.g. 200
body_bytes_sent: the size of the response body sent to the client, e.g. 19939
http_referer: the page the visit was linked from, e.g. "http://www.angularjs.cn/A00n"
http_user_agent: information about the client browser, e.g. "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36"
Step 3: Parsing the record in Java (splitting on spaces)

String line = "222.68.172.190 - - [18/Sep/2013:06:49:57 +0000] \"GET /images/my.jpg HTTP/1.1\" 200 19939 \"http://www.angularjs.cn/A00n\" \"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36\"";
String[] elementList = line.split(" ");
for (int i = 0; i < elementList.length; i++) {
    System.out.println(i + " : " + elementList[i]);
}
Test output:

0 : 222.68.172.190
1 : -
2 : -
3 : [18/Sep/2013:06:49:57
4 : +0000]
5 : "GET
6 : /images/my.jpg
7 : HTTP/1.1"
8 : 200
9 : 19939
10 : "http://www.angularjs.cn/A00n"
11 : "Mozilla/5.0
12 : (Windows
13 : NT
14 : 6.1)
15 : AppleWebKit/537.36
16 : (KHTML,
17 : like
18 : Gecko)
19 : Chrome/29.0.1547.66
20 : Safari/537.36"
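Notice how splitting on spaces fragments every quoted field: the timestamp spans tokens 3 and 4, the request line spans tokens 5 to 7, and the user agent spans tokens 11 to 20. This is why the KpiUtil class below trims leading brackets and quotes and stitches tokens back together. As an alternative, a single regular expression can capture each quoted field whole; the sketch below is mine (the class name LogRegexDemo and the pattern are illustrative, not part of the original project):

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class LogRegexDemo {
    // One capture group per field of the combined log format; quoted fields stay intact.
    private static final Pattern LOG_PATTERN = Pattern.compile(
            "^(\\S+) (\\S+) (\\S+) \\[([^\\]]+)\\] \"([^\"]*)\" (\\d{3}) (\\S+) \"([^\"]*)\" \"([^\"]*)\"$");

    public static void main(String[] args) {
        String line = "222.68.172.190 - - [18/Sep/2013:06:49:57 +0000] \"GET /images/my.jpg HTTP/1.1\" 200 19939 \"http://www.angularjs.cn/A00n\" \"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36\"";
        Matcher m = LOG_PATTERN.matcher(line);
        if (m.matches()) {
            System.out.println("remote_addr     : " + m.group(1)); // 222.68.172.190
            System.out.println("time_local      : " + m.group(4)); // 18/Sep/2013:06:49:57 +0000
            System.out.println("request         : " + m.group(5)); // GET /images/my.jpg HTTP/1.1
            System.out.println("status          : " + m.group(6)); // 200
            System.out.println("http_user_agent : " + m.group(9)); // Mozilla/5.0 (Windows NT 6.1) ...
        }
    }
}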
Step 4: The Kpi entity class, one field per log column plus the request method and HTTP version split out of $request

package org.hmahout.kpi.entity; // package chosen to match the imports used by the MapReduce job below

public class Kpi {
    private String remote_addr;     // client IP address
    private String remote_user;     // client user name; "-" when absent
    private String time_local;      // access time and time zone
    private String request;         // requested URL (method and protocol version are stored separately)
    private String status;          // request status; 200 means success
    private String body_bytes_sent; // size of the response body sent to the client
    private String http_referer;    // the page the visit was linked from
    private String http_user_agent; // client browser information
    private String method;          // request method: GET, POST, ...
    private String http_version;    // HTTP protocol version

    public String getMethod() {
        return method;
    }
    public void setMethod(String method) {
        this.method = method;
    }
    public String getHttp_version() {
        return http_version;
    }
    public void setHttp_version(String http_version) {
        this.http_version = http_version;
    }
    public String getRemote_addr() {
        return remote_addr;
    }
    public void setRemote_addr(String remote_addr) {
        this.remote_addr = remote_addr;
    }
    public String getRemote_user() {
        return remote_user;
    }
    public void setRemote_user(String remote_user) {
        this.remote_user = remote_user;
    }
    public String getTime_local() {
        return time_local;
    }
    public void setTime_local(String time_local) {
        this.time_local = time_local;
    }
    public String getRequest() {
        return request;
    }
    public void setRequest(String request) {
        this.request = request;
    }
    public String getStatus() {
        return status;
    }
    public void setStatus(String status) {
        this.status = status;
    }
    public String getBody_bytes_sent() {
        return body_bytes_sent;
    }
    public void setBody_bytes_sent(String body_bytes_sent) {
        this.body_bytes_sent = body_bytes_sent;
    }
    public String getHttp_referer() {
        return http_referer;
    }
    public void setHttp_referer(String http_referer) {
        this.http_referer = http_referer;
    }
    public String getHttp_user_agent() {
        return http_user_agent;
    }
    public void setHttp_user_agent(String http_user_agent) {
        this.http_user_agent = http_user_agent;
    }
    @Override
    public String toString() {
        return "Kpi [remote_addr=" + remote_addr + ", remote_user="
                + remote_user + ", time_local=" + time_local + ", request="
                + request + ", status=" + status + ", body_bytes_sent="
                + body_bytes_sent + ", http_referer=" + http_referer
                + ", http_user_agent=" + http_user_agent + ", method=" + method
                + ", http_version=" + http_version + "]";
    }
}
Step 5: The KpiUtil helper that turns one log line into a Kpi object

package org.hmahout.kpi.util; // renamed from org.aaa.kpi in the original listing so it matches the MapReduce job's imports

import org.hmahout.kpi.entity.Kpi;

public class KpiUtil {
    /**
     * Turn one log record into a Kpi object.
     * @param line one log record
     * @author tianbx
     */
    public static Kpi transformLineKpi(String line) {
        String[] elementList = line.split(" ");
        Kpi kpi = new Kpi();
        kpi.setRemote_addr(elementList[0]);
        kpi.setRemote_user(elementList[2]);              // token 1 is the literal "-"; token 2 is $remote_user
        kpi.setTime_local(elementList[3].substring(1));  // drop the leading "["; token 4 ("+0000]", the time zone) is not kept
        kpi.setMethod(elementList[5].substring(1));      // drop the leading quote of "GET
        kpi.setRequest(elementList[6]);
        kpi.setHttp_version(elementList[7]);
        kpi.setStatus(elementList[8]);
        kpi.setBody_bytes_sent(elementList[9]);
        kpi.setHttp_referer(elementList[10]);
        // The quoted user agent was fragmented by the split; stitch it back together from token 11 to the end.
        StringBuilder userAgent = new StringBuilder(elementList[11]);
        for (int i = 12; i < elementList.length; i++) {
            userAgent.append(" ").append(elementList[i]);
        }
        kpi.setHttp_user_agent(userAgent.toString());
        return kpi;
    }
}
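A quick sanity check of transformLineKpi against the sample record looks like this (KpiUtilDemo is a throwaway class of mine, not part of the original project):

import org.hmahout.kpi.entity.Kpi;
import org.hmahout.kpi.util.KpiUtil;

public class KpiUtilDemo {
    public static void main(String[] args) {
        String line = "222.68.172.190 - - [18/Sep/2013:06:49:57 +0000] \"GET /images/my.jpg HTTP/1.1\" 200 19939 \"http://www.angularjs.cn/A00n\" \"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36\"";
        // Prints every parsed field via Kpi.toString().
        Kpi kpi = KpiUtil.transformLineKpi(line);
        System.out.println(kpi);
    }
}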
Step 6: The algorithm model (a parallel algorithm)
Browser: statistics on visitors' browsers
- Map: {key: $http_user_agent, value: 1}
- Reduce: {key: $http_user_agent, value: sum}
For example, three records whose user agents resolve to Chrome, Chrome and Firefox produce the map output (Chrome,1), (Chrome,1), (Firefox,1), which the reducer collapses into (Chrome,2) and (Firefox,1); a plain in-memory sketch of the same idea follows.
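Before bringing in Hadoop, the model can be exercised with a minimal in-memory sketch (BrowserCountSketch is illustrative only): the "map" step emits a 1 per user agent and the "reduce" step sums the 1s per key.

import java.util.HashMap;
import java.util.Map;

public class BrowserCountSketch {
    public static void main(String[] args) {
        // Stand-ins for the browser names extracted from $http_user_agent.
        String[] userAgents = {"Chrome", "Chrome", "Firefox"};
        Map<String, Integer> counts = new HashMap<>();
        for (String ua : userAgents) {
            counts.merge(ua, 1, Integer::sum); // reduce step: sum the emitted 1s per key
        }
        System.out.println(counts); // {Chrome=2, Firefox=1}
    }
}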
Step 7: The MapReduce analysis code (old org.apache.hadoop.mapred API; browser names are extracted with the third-party UASparser library)

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.hmahout.kpi.entity.Kpi;
import org.hmahout.kpi.util.KpiUtil;

import cz.mallat.uasparser.UASparser;
import cz.mallat.uasparser.UserAgentInfo;

public class KpiBrowserSimpleV {

    // Mapper: parse each log line into a Kpi, look up the browser name, emit (browser, 1).
    public static class KpiBrowserSimpleMapper extends MapReduceBase
            implements Mapper<Object, Text, Text, IntWritable> {
        UASparser parser = null;

        @Override
        public void map(Object key, Text value,
                OutputCollector<Text, IntWritable> out, Reporter reporter)
                throws IOException {
            Kpi kpi = KpiUtil.transformLineKpi(value.toString());

            if (kpi != null && kpi.getHttp_user_agent() != null) {
                if (parser == null) {
                    parser = new UASparser();
                }
                UserAgentInfo info = parser.parseBrowserOnly(kpi.getHttp_user_agent());
                if ("unknown".equals(info.getUaName())) {
                    out.collect(new Text(info.getUaName()), new IntWritable(1));
                } else {
                    out.collect(new Text(info.getUaFamily()), new IntWritable(1));
                }
            }
        }
    }

    // Reducer (also used as the combiner): sum the 1s per browser name.
    public static class KpiBrowserSimpleReducer extends MapReduceBase implements
            Reducer<Text, IntWritable, Text, IntWritable> {

        @Override
        public void reduce(Text key, Iterator<IntWritable> value,
                OutputCollector<Text, IntWritable> out, Reporter reporter)
                throws IOException {
            IntWritable sum = new IntWritable(0);
            while (value.hasNext()) {
                sum.set(sum.get() + value.next().get());
            }
            out.collect(key, sum);
        }
    }

    public static void main(String[] args) throws IOException {
        String input = "hdfs://127.0.0.1:9000/user/tianbx/log_kpi/input";
        String output = "hdfs://127.0.0.1:9000/user/tianbx/log_kpi/browerSimpleV";
        JobConf conf = new JobConf(KpiBrowserSimpleV.class);
        conf.setJobName("KpiBrowserSimpleV");
        String url = "classpath:";
        conf.addResource(url + "/hadoop/core-site.xml");
        conf.addResource(url + "/hadoop/hdfs-site.xml");
        conf.addResource(url + "/hadoop/mapred-site.xml");

        conf.setMapOutputKeyClass(Text.class);
        conf.setMapOutputValueClass(IntWritable.class);

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        conf.setMapperClass(KpiBrowserSimpleMapper.class);
        conf.setCombinerClass(KpiBrowserSimpleReducer.class);
        conf.setReducerClass(KpiBrowserSimpleReducer.class);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path(input));
        FileOutputFormat.setOutputPath(conf, new Path(output));

        JobClient.runJob(conf);
        System.exit(0);
    }
}
Step 8: Contents of the output file log_kpi/browerSimpleV
AOL Explorer 1
Android Webkit 123
Chrome 4867
CoolNovo 23
Firefox 1700
Google App Engine 5
IE 1521
Jakarta Commons-HttpClient 3
Maxthon 27
Mobile Safari 273
Mozilla 130
Openwave Mobile Browser 2
Opera 2
Pale Moon 1
Python-urllib 4
Safari 246
Sogou Explorer 157
unknown 4685
Step 9: Plotting with R
The counts can be drawn as a bar chart with ggplot2; as written, the script expects the job output saved as a comma-separated file named borwer.txt:

library(ggplot2)  # qplot() comes from ggplot2
data <- read.table(file = "borwer.txt", header = FALSE, sep = ",")
names(data) <- c("borwer", "num")
# Note: with current ggplot2 releases, geom = "col" (or ggplot(data, aes(borwer, num)) + geom_col()) may be needed instead of geom = "bar".
qplot(borwer, num, data = data, geom = "bar")
Open questions
1. How to exclude crawlers and programmatic clicks, and guard against click fraud? One approach: have the page detect whether the mouse moves.
2. How to exclude image requests from the page-view count?
3. How to exclude fake clicks from the page-view count?
4. Which search engine did a visit come from?
5. Which keyword was searched to reach the page?
6. Where did the visit come from geographically?
7. Which browser was used for the visit?