即指用戶訪問網站時的全部訪問、瀏覽、點擊行爲數據。比如點擊了哪個鏈接,在哪個網頁停留時間最長,使用了哪個搜索項、總體瀏覽時間等。所有這些信息都可以被保存在網站日誌中。通過分析這些數據,可以獲知許多對網站運營至關重要的信息。採集的數據越全面,分析就能越精準。
日誌的生成渠道:
1.是網站的web服務器所記錄的web訪問日誌
2.是通過在頁面嵌入自定義的js代碼來獲取用戶的所有訪問行爲(比如鼠標懸停的位置,點擊的頁面組件等),然後通過ajax請求到後臺記錄日誌;這種方式所能採集的信息最全面;
3.通過在頁面上埋點1像素的圖片,將相關頁面訪問信息請求到後臺記錄日誌;
194.237.142.21 - - [18/Sep/2013:06:49:18 +0000] "GET /wp-content/uploads/2013/07/rstudio-git3.png HTTP/1.1" 304 0 "-" "Mozilla/4.0 (compatible;)"
183.49.46.228 - - [18/Sep/2013:06:49:23 +0000] "-" 400 0 "-" "-"
163.177.71.12 - - [18/Sep/2013:06:49:33 +0000] "HEAD / HTTP/1.1" 200 20 "-" "DNSPod-Monitor/1.0"
時間戳 | IP地址 | Cookie | Session | 請求URL | Referer |
---|---|---|---|---|---|
2012-01-01 12:31:12 | 101.0.0.1 | User01 | S001 | /a/... | somesite.com |
2012-01-01 12:31:16 | 201.0.0.2 | User02 | S002 | /a/... | - |
2012-01-01 12:33:06 | 101.0.0.2 | User03 | S002 | /b/... | baidu.com |
2012-01-01 15:16:39 | 234.0.0.3 | User01 | S003 | /c/... | google.com |
2012-01-01 15:17:11 | 101.0.0.1 | User01 | S004 | /d/... | /c/... |
2012-01-01 15:19:23 | 101.0.0.1 | User01 | S004 | /e/... | /d/... |
3.根據後續的統計需求,過濾分離出各種不同主題(不同欄目path)的基礎數據
/** * 對接外部數據的層,表結構定義最好跟外部數據源保持一致 * 術語:貼源表 * @author * */ public class WebLogBean implements Writable { private boolean valid = true; // 判斷數據是否合法 private String remote_addr; // 記錄客戶端的ip地址 private String remote_user; // 記錄客戶端用戶名稱,忽略屬性"-" private String time_local; // 記錄訪問時間與時區 private String request; // 記錄請求的url與http協議 private String status; // 記錄請求狀態;成功是200 private String body_bytes_sent; // 記錄發送給客戶端文件主體內容大小 private String http_referer; // 用來記錄從那個頁面連接訪問過來的 private String http_user_agent; // 記錄客戶瀏覽器的相關信息 public void set(boolean valid,String remote_addr, String remote_user, String time_local, String request, String status, String body_bytes_sent, String http_referer, String http_user_agent) { this.valid = valid; this.remote_addr = remote_addr; this.remote_user = remote_user; this.time_local = time_local; this.request = request; this.status = status; this.body_bytes_sent = body_bytes_sent; this.http_referer = http_referer; this.http_user_agent = http_user_agent; } public String getRemote_addr() { return remote_addr; } public void setRemote_addr(String remote_addr) { this.remote_addr = remote_addr; } public String getRemote_user() { return remote_user; } public void setRemote_user(String remote_user) { this.remote_user = remote_user; } public String getTime_local() { return this.time_local; } public void setTime_local(String time_local) { this.time_local = time_local; } public String getRequest() { return request; } public void setRequest(String request) { this.request = request; } public String getStatus() { return status; } public void setStatus(String status) { this.status = status; } public String getBody_bytes_sent() { return body_bytes_sent; } public void setBody_bytes_sent(String body_bytes_sent) { this.body_bytes_sent = body_bytes_sent; } public String getHttp_referer() { return http_referer; } public void setHttp_referer(String http_referer) { this.http_referer = http_referer; } public String getHttp_user_agent() { return http_user_agent; } public void 
setHttp_user_agent(String http_user_agent) { this.http_user_agent = http_user_agent; } public boolean isValid() { return valid; } public void setValid(boolean valid) { this.valid = valid; } @Override public String toString() { StringBuilder sb = new StringBuilder(); sb.append(this.valid); sb.append("\001").append(this.getRemote_addr()); sb.append("\001").append(this.getRemote_user()); sb.append("\001").append(this.getTime_local()); sb.append("\001").append(this.getRequest()); sb.append("\001").append(this.getStatus()); sb.append("\001").append(this.getBody_bytes_sent()); sb.append("\001").append(this.getHttp_referer()); sb.append("\001").append(this.getHttp_user_agent()); return sb.toString(); } @Override public void readFields(DataInput in) throws IOException { this.valid = in.readBoolean(); this.remote_addr = in.readUTF(); this.remote_user = in.readUTF(); this.time_local = in.readUTF(); this.request = in.readUTF(); this.status = in.readUTF(); this.body_bytes_sent = in.readUTF(); this.http_referer = in.readUTF(); this.http_user_agent = in.readUTF(); } @Override public void write(DataOutput out) throws IOException { out.writeBoolean(this.valid); out.writeUTF(null==remote_addr?"":remote_addr); out.writeUTF(null==remote_user?"":remote_user); out.writeUTF(null==time_local?"":time_local); out.writeUTF(null==request?"":request); out.writeUTF(null==status?"":status); out.writeUTF(null==body_bytes_sent?"":body_bytes_sent); out.writeUTF(null==http_referer?"":http_referer); out.writeUTF(null==http_user_agent?"":http_user_agent); } }
public class WebLogParser { static SimpleDateFormat df1 = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss", Locale.US); static SimpleDateFormat df2 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.US); public static WebLogBean parser(String line){ WebLogBean WebLogBean = new WebLogBean(); String[] arr = line.split(" "); if (arr.length > 11){ WebLogBean.setRemote_addr(arr[0]); WebLogBean.setRemote_user(arr[1]); String time_local = formatDate(arr[3].substring(1)); if(null==time_local) time_local="-invalid_time-"; WebLogBean.setTime_local(time_local); WebLogBean.setRequest(arr[6]); WebLogBean.setStatus(arr[8]); WebLogBean.setBody_bytes_sent(arr[9]); WebLogBean.setHttp_referer(arr[10]); //若是useragent元素較多,拼接useragent if (arr.length > 12){ StringBuilder sb = new StringBuilder(); for (int i=11; i<arr.length;i++){ sb.append(arr[i]); } WebLogBean.setHttp_user_agent(sb.toString()); } else { WebLogBean.setHttp_user_agent(arr[11]); } // 大於400,HTTP錯誤 if (Integer.parseInt(WebLogBean.getStatus()) >= 400){ WebLogBean.setValid(false); } if("-invalid_time-".equals(WebLogBean.getTime_local())){ WebLogBean.setValid(false); } } else { WebLogBean.setValid(false); } return WebLogBean; } public static void filtStaticResource(WebLogBean bean,Set<String> pages) { if (!pages.contains(bean.getRequest())) { bean.setValid(false); } } //更換日期的顯示格式 public static String formatDate(String time_local){ try { return df2.format(df1.parse(time_local)); } catch (ParseException e) { return null; } } }
/** * 處理原始日誌,過濾出真實pv請求 * 轉換時間格式 * 對缺失字段填充默認值 * 對記錄標記valid和invalid * * @author * */ public class WeblogPreProcess { static class WeblogPreProcessMapper extends Mapper<LongWritable, Text, Text, NullWritable>{ //用來存儲網站url分類數據 Set<String>pages=new HashSet<String>(); Text k =new Text(); NullWritable v = NullWritable.get(); /** * 從外部加載網站url分類數據 */ @Override protected void setup(Context context) throws IOException, InterruptedException { pages.add("/about"); pages.add("/black-ip-list/"); pages.add("/cassandra-clustor/"); pages.add("/finance-rhive-repurchase/"); pages.add("/hadoop-family-roadmap/"); pages.add("/hadoop-hive-intro/"); pages.add("/hadoop-zookeeper-intro/"); pages.add("/hadoop-mahout-roadmap/"); } @Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { String line = value.toString(); WebLogBean WebLogBean = WebLogParser.parser(line); //過濾 js/圖片/css 等靜態資源 WebLogParser.filtStaticResource(WebLogBean, pages); /** if (!WebLogBean.isValid()) return; */ k.set(WebLogBean.toString()); context.write(k, v); } } public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); Job job = Job.getInstance(); job.setJarByClass(WeblogPreProcess.class); job.setMapperClass(WeblogPreProcessMapper.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(NullWritable.class); //FileInputFormat.setInputPaths(job, new Path(args[0])); //FileOutputFormat.setOutputPath(job, new Path(args[1])); FileInputFormat.setInputPaths(job, new Path("E:/srcdata/webLog/input")); FileOutputFormat.setOutputPath(job, new Path("E:/srcdata/webLog/output")); boolean res = job.waitForCompletion(true); System.exit(res?0:1); } }
true1.80.249.223-2013-09-18 07:57:33/hadoop-hive-intro/20014764"http://www.google.com.hk/url?sa=t&rct=j&q=hive%E7%9A%84%E5%AE%89%E8%A3%85&source=web&cd=2&ved=0CC4QFjAB&url=%68%74%74%70%3a%2f%2f%62%6c%6f%67%2e%66%65%6e%73%2e%6d%65%2f%68%61%64%6f%6f%70%2d%68%69%76%65%2d%69%6e%74%72%6f%2f&ei=5lw5Uo-2NpGZiQfCwoG4BA&usg=AFQjCNF8EFxPuCMrm7CvqVgzcBUzrJZStQ&bvm=bv.52164340,d.aGc&cad=rjt""Mozilla/5.0(WindowsNT5.2;rv:23.0)Gecko/20100101Firefox/23.0" true101.226.167.201-2013-09-18 09:30:36/hadoop-mahout-roadmap/20010335"http://blog.fens.me/hadoop-mahout-roadmap/""Mozilla/4.0(compatible;MSIE8.0;WindowsNT6.1;Trident/4.0;SLCC2;.NETCLR2.0.50727;.NETCLR3.5.30729;.NETCLR3.0.30729;MediaCenterPC6.0;MDDR;.NET4.0C;.NET4.0E;.NETCLR1.1.4322;TabletPC2.0);360Spider" true101.226.167.205-2013-09-18 09:30:32/hadoop-family-roadmap/20011715"http://blog.fens.me/hadoop-family-roadmap/""Mozilla/4.0(compatible;MSIE8.0;WindowsNT6.1;Trident/4.0;SLCC2;.NETCLR2.0.50727;.NETCLR3.5.30729;.NETCLR3.0.30729;MediaCenterPC6.0;MDDR;.NET4.0C;.NET4.0E;.NETCLR1.1.4322;TabletPC2.0);360Spider" true101.226.169.215-2013-09-18 10:07:31/about3015"http://blog.fens.me/about""Mozilla/4.0(compatible;MSIE8.0;WindowsNT6.1;Trident/4.0;SLCC2;.NETCLR2.0.50727;.NETCLR3.5.30729;.NETCLR3.0.30729;MediaCenterPC6.0;MDDR;.NET4.0C;.NET4.0E;.NETCLR1.1.4322;TabletPC2.0);360Spider"
文章到這裏就完成了,謝謝觀看。