1、項目要求
- 本文討論的日誌處理方法中的日誌,僅指Web日誌。Web日誌事實上並無精確的定義,可能包含但不限於各類前端Web服務器(apache、lighttpd、nginx、tomcat等)產生的用戶訪問日誌,以及各類Web應用程序本身輸出的日誌。
2、需求分析: KPI指標設計
PV(PageView): 頁面訪問量統計
IP: 頁面獨立IP的訪問量統計
Time: 用戶每小時PV的統計
Source: 用戶來源域名的統計
Browser: 用戶的訪問設備統計
下面我着重分析瀏覽器統計。
3、分析過程
一、一條nginx日誌記錄的內容
222.68.172.190 - - [18/Sep/2013:06:49:57 +0000] "GET /images/my.jpg HTTP/1.1" 200 19939
"http://www.angularjs.cn/A00n"
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36"
二、對上面的日誌記錄進行分析
remote_addr: 記錄客戶端的IP地址, 222.68.172.190
remote_user: 記錄客戶端用戶名稱, -
time_local: 記錄訪問時間與時區, [18/Sep/2013:06:49:57 +0000]
request: 記錄請求的URL與HTTP協議, "GET /images/my.jpg HTTP/1.1"
status: 記錄請求狀態,成功是200, 200
body_bytes_sent: 記錄發送給客戶端的文件主體內容大小, 19939
http_referer: 用來記錄從哪個頁面鏈接訪問過來的, "http://www.angularjs.cn/A00n"
http_user_agent: 記錄客戶端瀏覽器的相關信息, "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36"
三、用Java語言分析上面一條日誌記錄(使用空格切分)
1 |
String line = "222.68.172.190 - - [18/Sep/2013:06:49:57 +0000] \"GET /images/my.jpg HTTP/1.1\" 200 19939 \"http://www.angularjs.cn/A00n\" \"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36\"" ; |
2 |
String[] elementList = line.split( " " ); |
3 |
for ( int i= 0 ;i<elementList.length;i++){ |
4 |
System.out.println(i+ " : " +elementList[i]); |
測試結果:
04 |
3 : [18/Sep/2013:06:49:57 |
11 |
10 : "http://www.angularjs.cn/A00n" |
16 |
15 : AppleWebKit/537.36 |
20 |
19 : Chrome/29.0.1547.66 |
四、實體Kpi類的代碼:
02 |
private String remote_addr; |
03 |
private String remote_user; |
04 |
private String time_local; |
05 |
private String request; |
06 |
private String status; |
07 |
private String body_bytes_sent; |
08 |
private String http_referer; |
09 |
private String http_user_agent; |
10 |
private String method; |
11 |
private String http_version; |
13 |
public String getMethod() { |
16 |
public void setMethod(String method) { |
19 |
public String getHttp_version() { |
22 |
public void setHttp_version(String http_version) { |
23 |
this .http_version = http_version; |
25 |
public String getRemote_addr() { |
28 |
public void setRemote_addr(String remote_addr) { |
29 |
this .remote_addr = remote_addr; |
31 |
public String getRemote_user() { |
34 |
public void setRemote_user(String remote_user) { |
35 |
this .remote_user = remote_user; |
37 |
public String getTime_local() { |
40 |
public void setTime_local(String time_local) { |
41 |
this .time_local = time_local; |
43 |
public String getRequest() { |
46 |
public void setRequest(String request) { |
47 |
this .request = request; |
49 |
public String getStatus() { |
52 |
public void setStatus(String status) { |
55 |
public String getBody_bytes_sent() { |
56 |
return body_bytes_sent; |
58 |
public void setBody_bytes_sent(String body_bytes_sent) { |
59 |
this .body_bytes_sent = body_bytes_sent; |
61 |
public String getHttp_referer() { |
64 |
public void setHttp_referer(String http_referer) { |
65 |
this .http_referer = http_referer; |
67 |
public String getHttp_user_agent() { |
68 |
return http_user_agent; |
70 |
public void setHttp_user_agent(String http_user_agent) { |
71 |
this .http_user_agent = http_user_agent; |
74 |
public String toString() { |
75 |
return "Kpi [remote_addr=" + remote_addr + ", remote_user=" |
76 |
+ remote_user + ", time_local=" + time_local + ", request=" |
77 |
+ request + ", status=" + status + ", body_bytes_sent=" |
78 |
+ body_bytes_sent + ", http_referer=" + http_referer |
79 |
+ ", http_user_agent=" + http_user_agent + ", method=" + method |
80 |
+ ", http_version=" + http_version + "]" ; |
五、kpi的工具類
03 |
public class KpiUtil { |
09 |
public static Kpi transformLineKpi(String line){ |
10 |
String[] elementList = line.split( " " ); |
12 |
kpi.setRemote_addr(elementList[ 0 ]); |
13 |
kpi.setRemote_user(elementList[ 1 ]); |
14 |
kpi.setTime_local(elementList[ 3 ].substring( 1 )); |
15 |
kpi.setMethod(elementList[ 5 ].substring( 1 )); |
16 |
kpi.setRequest(elementList[ 6 ]); |
17 |
kpi.setHttp_version(elementList[ 7 ]); |
18 |
kpi.setStatus(elementList[ 8 ]); |
19 |
kpi.setBody_bytes_sent(elementList[ 9 ]); |
20 |
kpi.setHttp_referer(elementList[ 10 ]); |
21 |
kpi.setHttp_user_agent(elementList[ 11 ] + " " + elementList[ 12 ]); |
六、算法模型: 並行算法
Browser: 用戶的訪問設備統計
– Map: {key:$http_user_agent,value:1}
– Reduce: {key:$http_user_agent,value:求和(sum)}
七、map-reduce分析代碼
01 |
import java.io.IOException; |
02 |
import java.util.Iterator; |
04 |
import org.apache.hadoop.fs.Path; |
05 |
import org.apache.hadoop.io.IntWritable; |
06 |
import org.apache.hadoop.io.Text; |
07 |
import org.apache.hadoop.mapred.FileInputFormat; |
08 |
import org.apache.hadoop.mapred.FileOutputFormat; |
09 |
import org.apache.hadoop.mapred.JobClient; |
10 |
import org.apache.hadoop.mapred.JobConf; |
11 |
import org.apache.hadoop.mapred.MapReduceBase; |
12 |
import org.apache.hadoop.mapred.Mapper; |
13 |
import org.apache.hadoop.mapred.OutputCollector; |
14 |
import org.apache.hadoop.mapred.Reducer; |
15 |
import org.apache.hadoop.mapred.Reporter; |
16 |
import org.apache.hadoop.mapred.TextInputFormat; |
17 |
import org.apache.hadoop.mapred.TextOutputFormat; |
18 |
import org.hmahout.kpi.entity.Kpi; |
19 |
import org.hmahout.kpi.util.KpiUtil; |
21 |
import cz.mallat.uasparser.UASparser; |
22 |
import cz.mallat.uasparser.UserAgentInfo; |
24 |
public class KpiBrowserSimpleV { |
26 |
public static class KpiBrowserSimpleMapper extends MapReduceBase |
27 |
implements Mapper<Object, Text, Text, IntWritable> { |
28 |
UASparser parser = null ; |
30 |
public void map(Object key, Text value, |
31 |
OutputCollector<Text, IntWritable> out, Reporter reporter) |
33 |
Kpi kpi = KpiUtil.transformLineKpi(value.toString()); |
35 |
if (kpi!= null && kpi.getHttP_user_agent_info()!= null ){ |
37 |
parser = new UASparser(); |
40 |
parser.parseBrowserOnly(kpi.getHttP_user_agent_info()); |
41 |
if ( "unknown" .equals(info.getUaName())){ |
42 |
out.collect( new Text(info.getUaName()), new IntWritable( 1 )); |
44 |
out.collect( new Text(info.getUaFamily()), new IntWritable( 1 )); |
51 |
public static class KpiBrowserSimpleReducer extends MapReduceBase implements |
52 |
Reducer<Text, IntWritable, Text, IntWritable>{ |
55 |
public void reduce(Text key, Iterator<IntWritable> value, |
56 |
OutputCollector<Text, IntWritable> out, Reporter reporter) |
58 |
IntWritable sum = new IntWritable( 0 ); |
59 |
while (value.hasNext()){ |
60 |
sum.set(sum.get()+value.next().get()); |
62 |
out.collect(key, sum); |
65 |
public static void main(String[] args) throws IOException { |
66 |
String input = "hdfs://127.0.0.1:9000/user/tianbx/log_kpi/input" ; |
67 |
String output = "hdfs://127.0.0.1:9000/user/tianbx/log_kpi/browerSimpleV" ; |
68 |
JobConf conf = new JobConf(KpiBrowserSimpleV. class ); |
69 |
conf.setJobName( "KpiBrowserSimpleV" ); |
70 |
String url = "classpath:" ; |
71 |
conf.addResource(url+ "/hadoop/core-site.xml" ); |
72 |
conf.addResource(url+ "/hadoop/hdfs-site.xml" ); |
73 |
conf.addResource(url+ "/hadoop/mapred-site.xml" ); |
75 |
conf.setMapOutputKeyClass(Text. class ); |
76 |
conf.setMapOutputValueClass(IntWritable. class ); |
78 |
conf.setOutputKeyClass(Text. class ); |
79 |
conf.setOutputValueClass(IntWritable. class ); |
81 |
conf.setMapperClass(KpiBrowserSimpleMapper. class ); |
82 |
conf.setCombinerClass(KpiBrowserSimpleReducer. class ); |
83 |
conf.setReducerClass(KpiBrowserSimpleReducer. class ); |
85 |
conf.setInputFormat(TextInputFormat. class ); |
86 |
conf.setOutputFormat(TextOutputFormat. class ); |
88 |
FileInputFormat.setInputPaths(conf, new Path(input)); |
89 |
FileOutputFormat.setOutputPath(conf, new Path(output)); |
91 |
JobClient.runJob(conf); |
八、輸出文件log_kpi/browerSimpleV內容
AOL Explorer 1
Android Webkit 123
Chrome 4867
CoolNovo 23
Firefox 1700
Google App Engine 5
IE 1521
Jakarta Commons-HttpClient 3
Maxthon 27
Mobile Safari 273
Mozilla 130
Openwave Mobile Browser 2
Opera 2
Pale Moon 1
Python-urllib 4
Safari 246
Sogou Explorer 157
unknown 4685
九、用R製作圖片
data<-read.table(file="borwer.txt",header=FALSE,sep=",")
names(data)<-c("borwer","num")
qplot(borwer,num,data=data,geom="bar")
尚待解決的問題
一、排除爬蟲和程序點擊,對抗作弊
解決的方法:在頁面上做個檢測,判斷鼠標是否移動。
二、瀏覽量怎麼排除圖片請求?
三、瀏覽量怎麼排除假點擊?
四、哪個搜索引擎訪問的?
五、通過哪個關鍵詞(keyword)訪問的?
六、從哪個地方訪問的?
七、使用哪個瀏覽器訪問的?