大數據教程(10.5)運營商流量日誌解析加強

    上一篇文章分析了如何使用hadoop來實現sql中group by 並且取每組中最大值的需求--訂單中成交金額最大的訂單項分析,本篇博客博主將繼續分享一個mapreduce實戰例子--運營商流量日誌解析加強;

    1、需求

           電信運營商服務器中記錄了用戶流量訪問的日誌,效果如下圖所示:

1374609560.11	1374609560.16	1374609560.16	1374609560.16	110	5	8615038208365	460023383869133	8696420056841778	2	460	0	14615			54941	10.188.77.252	61.145.116.27	35020	80	6	cmnet	1	221.177.218.34	221.177.217.161	221.177.218.34	221.177.217.167	ad.veegao.com	http://ad.veegao.com/veegao/iris.action		Apache-HttpClient/UNAVAILABLE (java 1.4)	POST	200	593	310	4	3	0	0	4	3	0	0	0	0	http://ad.veegao.com/veegao/iris.action	5903903079251243019	5903903103500771339	5980728
1374609558.91	1374609558.97	1374609558.97	1374609559.31	112	461	8615038208365	460023383869133	8696420056841778	2	460	0	14615			54941	10.188.77.252	101.226.76.175	37293	80	6	cmnet	1	221.177.218.34	221.177.217.161	221.177.218.34	221.177.217.167	short.weixin.qq.com	http://short.weixin.qq.com/cgi-bin/micromsg-bin/getcdndns		Android QQMail HTTP Client	POST	200	543	563	2	3	0	0	2	3	0	0	0	0	http://short.weixin.qq.com/cgi-bin/micromsg-bin/getcdndns	5903903079251243019	5903903097240039435	5980728
1374609514.70	1374609514.75	1374609514.75	1374609515.58	110	5	8613674976196	460004901700207	8623350100353878	2	460	0	14694			58793	10.184.80.32	111.13.13.222	36181	80	6	cmnet	1	221.177.156.4	221.177.217.145	221.177.156.4	221.177.217.156	retype.wenku.bdimg.com	http://retype.wenku.bdimg.com/img/97308d2b7375a417866f8f09		AMB_400	GET	200	345	4183	5	5	0	0	5	5	0	0	0	0	http://retype.wenku.bdimg.com/img/97308d2b7375a417866f8f09	5903900710696611851	5903902908140003339	5937307

            我們需要對其中的url進行處理:如果數據庫中已經有其對應的標籤內容,則加強日誌,在日誌行後面追加輸出其內容;否則,表示該url在數據字典中不存在,需要使用爬蟲去爬取;

    2、代碼實現

           DBLoader(db數據加載類)

package com.empire.hadoop.mr.logenhance;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.Map;

/**
 * Loads the URL-to-content-tag dictionary from the MySQL table {@code url_rule}.
 *
 * <p>NOTE(review): the JDBC URL, user name and password are hard-coded below;
 * consider moving them to job configuration before using this outside a demo.
 */
public class DBLoader {

    /**
     * Reads every row of {@code url_rule} and stores it in {@code ruleMap} as
     * {@code url -> content}. Existing entries with the same key are overwritten.
     *
     * @param ruleMap destination map; must be non-null and mutable
     * @throws Exception if the JDBC driver class cannot be loaded, the connection
     *                   cannot be opened, or the query fails
     */
    public static void dbLoader(Map<String, String> ruleMap) throws Exception {

        Connection conn = null;
        Statement st = null;
        ResultSet res = null;

        try {
            Class.forName("com.mysql.jdbc.Driver");
            conn = DriverManager.getConnection("jdbc:mysql://192.168.29.131:3306/urldb?characterEncoding=utf-8", "root",
                    "123456");
            st = conn.createStatement();
            res = st.executeQuery("select url,content from url_rule");
            while (res.next()) {
                ruleMap.put(res.getString(1), res.getString(2));
            }

        } finally {
            // Close in reverse order of acquisition. Each close is attempted
            // independently so that a failure closing the ResultSet or Statement
            // can no longer leave the Connection open.
            closeQuietly(res);
            closeQuietly(st);
            closeQuietly(conn);
        }

    }

    /**
     * Closes {@code resource} if it is non-null; a failure is reported on stderr
     * and otherwise ignored so that it never masks the outcome of the load itself.
     */
    private static void closeQuietly(AutoCloseable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

           LogEnhanceOutputFormat(自定義OutputFormat--用於將加強日誌和需要爬蟲爬取的url分文件輸出)

package com.empire.hadoop.mr.logenhance;

import java.io.IOException;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Output format that splits records into two files: enhanced log lines and
 * URLs that still need to be crawled.
 *
 * <p>When a map task or reduce task produces its final output, the framework
 * first calls {@link #getRecordWriter(TaskAttemptContext)} to obtain a
 * {@link RecordWriter} and then calls {@code write(k, v)} on it for each record.
 *
 * <p>NOTE(review): both output paths are fixed absolute paths, so every task
 * attempt opens the same two files. This looks safe only while the job runs a
 * single task (as in the sample run); confirm before running on multiple splits.
 */
public class LogEnhanceOutputFormat extends FileOutputFormat<Text, NullWritable> {

    /**
     * Opens the two target files and returns a writer that routes records between them.
     *
     * @param context task attempt context; only its configuration is used here
     * @return a writer bound to {@code /en/log.dat} and {@code /crw/url.dat}
     * @throws IOException if either file cannot be created
     */
    @Override
    public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext context)
            throws IOException, InterruptedException {

        FileSystem fs = FileSystem.get(context.getConfiguration());

        Path enhancePath = new Path("/en/log.dat");
        Path tocrawlPath = new Path("/crw/url.dat");

        FSDataOutputStream enhancedOs = fs.create(enhancePath);
        FSDataOutputStream tocrawlOs;
        boolean opened = false;
        try {
            tocrawlOs = fs.create(tocrawlPath);
            opened = true;
        } finally {
            if (!opened) {
                // The second create failed: release the first stream instead of leaking it.
                try {
                    enhancedOs.close();
                } catch (IOException ignored) {
                    // Keep the original failure from fs.create as the reported error.
                }
            }
        }

        return new EnhanceRecordWriter(enhancedOs, tocrawlOs);
    }

    /**
     * Record writer that sends each key either to the to-crawl list or to the
     * enhanced log, depending on the marker at the end of the key.
     */
    static class EnhanceRecordWriter extends RecordWriter<Text, NullWritable> {
        /** Suffix the mapper appends to records destined for the to-crawl list. */
        private static final String TOCRAWL_SUFFIX = "\ttocrawl\n";

        FSDataOutputStream enhancedOs = null;
        FSDataOutputStream tocrawlOs  = null;

        public EnhanceRecordWriter(FSDataOutputStream enhancedOs, FSDataOutputStream tocrawlOs) {
            super();
            this.enhancedOs = enhancedOs;
            this.tocrawlOs = tocrawlOs;
        }

        /**
         * Writes {@code key} to the to-crawl file ({@code /crw/url.dat}) when it ends
         * with the to-crawl marker, otherwise to the enhanced log ({@code /en/log.dat}).
         * Matching the exact trailing marker, rather than any occurrence of the word,
         * keeps an enhanced log line that merely contains "tocrawl" out of the crawl list.
         */
        @Override
        public void write(Text key, NullWritable value) throws IOException, InterruptedException {
            FSDataOutputStream target = key.toString().endsWith(TOCRAWL_SUFFIX) ? tocrawlOs : enhancedOs;
            // Write the Text's own UTF-8 bytes; only the first getLength() bytes of
            // the backing array are valid. This avoids the platform default charset.
            target.write(key.getBytes(), 0, key.getLength());
        }

        /**
         * Closes both streams; the enhanced-log stream is closed even if closing
         * the to-crawl stream fails.
         */
        @Override
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            try {
                if (tocrawlOs != null) {
                    tocrawlOs.close();
                }
            } finally {
                if (enhancedOs != null) {
                    enhancedOs.close();
                }
            }
        }

    }

}

           LogEnhance(日誌分析加強主程序類)

package com.empire.hadoop.mr.logenhance;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Map-only job that enhances carrier traffic logs: lines whose URL has a known
 * content tag are emitted with the tag appended; URLs without a tag are emitted
 * as to-crawl records. Routing to separate files is done by
 * {@link LogEnhanceOutputFormat}.
 */
public class LogEnhance {

    static class LogEnhanceMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

        /**
         * Position of the URL in the array returned by {@code StringUtils.split}.
         * Per the commons-lang documentation, adjacent separators are treated as
         * one, so empty columns are dropped and this is not the raw column number.
         */
        private static final int     URL_FIELD_INDEX  = 26;

        /** URL matcher, compiled once instead of once per input record. */
        private static final Pattern HTTP_URL_PATTERN = Pattern
                .compile("(((https|http)?://)?([a-z0-9]+[.])|(www.))"
                        + "\\w+[.|\\/]([a-z0-9]{0,})?[[.]([a-z0-9]{0,})]+((/[\\S&&[^,;\u4E00-\u9FA5]]+)+)?([.][a-z0-9]{0,}+|/?)");

        Map<String, String> ruleMap = new HashMap<String, String>();

        Text                k       = new Text();
        NullWritable        v       = NullWritable.get();

        /**
         * Loads the URL rules from the database into {@code ruleMap}.
         *
         * @throws IOException if the rules cannot be loaded; failing here stops the
         *                     task instead of silently treating every URL as unknown
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {

            try {
                DBLoader.dbLoader(ruleMap);
            } catch (Exception e) {
                throw new IOException("Failed to load URL rules from the database", e);
            }

        }

        /**
         * Emits one record per input line whose URL field passes {@link #isHttpUrl}:
         * {@code url \t tocrawl \n} when no content tag is known, otherwise
         * {@code line \t tag \n}. Lines with too few fields increment the
         * {@code malformed/malformedline} counter and are skipped.
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Counter (group name, counter name) for input lines that are not well-formed.
            Counter counter = context.getCounter("malformed", "malformedline");
            String line = value.toString();
            String[] fields = StringUtils.split(line, "\t");
            if (fields == null || fields.length <= URL_FIELD_INDEX) {
                counter.increment(1);
                return;
            }

            String url = fields[URL_FIELD_INDEX];
            if (!isHttpUrl(url)) {
                return;
            }

            // No tag: output only the URL to the to-crawl list. Tag present: output the enhanced line.
            String contentTag = ruleMap.get(url);
            if (contentTag == null) {
                k.set(url + "\t" + "tocrawl" + "\n");
            } else {
                k.set(line + "\t" + contentTag + "\n");
            }
            // Write failures propagate to the framework; they are not counted as malformed input.
            context.write(k, v);
        }

        /**
         * Tests whether a string is a URL.
         *
         * @param urls candidate string; leading and trailing whitespace is ignored
         * @return true if the whole trimmed string matches the URL pattern, false otherwise
         */
        public boolean isHttpUrl(String urls) {
            Matcher mat = HTTP_URL_PATTERN.matcher(urls.trim());
            return mat.matches();
        }

    }

    /**
     * Configures and runs the job.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the job output path
     */
    public static void main(String[] args) throws Exception {

        if (args.length < 2) {
            System.err.println("Usage: LogEnhance <input path> <output path>");
            System.exit(2);
        }

        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf);

        job.setJarByClass(LogEnhance.class);

        job.setMapperClass(LogEnhanceMapper.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // A custom OutputFormat is used to send different content to different target paths.
        job.setOutputFormatClass(LogEnhanceOutputFormat.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));

        // The custom OutputFormat extends FileOutputFormat, which still writes a
        // _SUCCESS file, so an output path must be set here as well.
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // No reducer is needed.
        job.setNumReduceTasks(0);

        // Report job failure through the exit code instead of always exiting with 0.
        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);

    }

}

    3、數據庫字典數據準備

           創建表:

-- URL dictionary table: one row per url with its content tag.
-- DBLoader reads it with "select url,content from url_rule" and puts each row
-- into a map keyed by url. No primary key or unique index is declared here, so
-- duplicate url rows are possible; the map keeps whichever row is read last.
DROP TABLE IF EXISTS `url_rule`;
CREATE TABLE `url_rule` (
  `url` varchar(2000) DEFAULT NULL,
  `content` varchar(255) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

           導入數據:

INSERT INTO `url_rule` VALUES ('http://timg01.baidu-1img.cn/timg?imagewise_list&size=b140_140&bd_page_type=1&ssid=0&from=2001m&uid=1ADA21F27A014F2180A7E22E8BEE35B9&pu=usm%402%2Csz%401321_1003%2Cta%40utouch_2_2.3_1_9.0&quality=80&sec=1374609614&di=7cec4b45b8d4db319556ad87166932d5&src=http://i1.baidu.com/it/u=975390796,1697384219&fm=21&gp=0.jpg', 'somecontent');
INSERT INTO `url_rule` VALUES ('http://timg01.baidu-1img.cn/timg?imagewise_list&size=b140_140&bd_page_type=1&ssid=0&from=2001m&uid=1ADA21F27A014F2180A7E22E8BEE35B9&pu=usm%402%2Csz%401321_1003%2Cta%40utouch_2_2.3_1_9.0&quality=80&sec=1374609712&di=69f7930c9cc2938e9fb9a09f815b78b8&src=http://i1.baidu.com/it/u=2694289975,690736961&fm=21&gp=0.jpg', 'somecontent');
INSERT INTO `url_rule` VALUES ('http://timg01.baidu-1img.cn/timg?imagewise_list&size=b140_140&bd_page_type=1&ssid=0&from=2001m&uid=1ADA21F27A014F2180A7E22E8BEE35B9&pu=usm%402%2Csz%401321_1003%2Cta%40utouch_2_2.3_1_9.0&quality=80&sec=1374609712&di=9998b896a653eae8a998dbeed5bce6e7&src=http://i1.baidu.com/it/u=433837882,4029071921&fm=21&gp=0.jpg', 'somecontent');
......

    4、運行程序

#上傳jar

Alt+p
lcd d:/
 put loge.jar 2013072404-http-combinedBy-1373892200521-log-1.log

#準備hadoop處理的數據文件

cd /home/hadoop/apps/hadoop-2.9.1
hadoop fs  -mkdir -p /loge/input
hdfs dfs -put  2013072404-http-combinedBy-1373892200521-log-1.log /loge/input

#運行日誌加強程序

hadoop jar loge.jar  com.empire.hadoop.mr.logenhance.LogEnhance /loge/input /loge/output

    5、運行效果

[hadoop@centos-aaron-h1 ~]$  hadoop jar loge.jar  com.empire.hadoop.mr.logenhance.LogEnhance /loge/input /loge/output
18/12/23 23:48:43 INFO client.RMProxy: Connecting to ResourceManager at centos-aaron-h1/192.168.29.144:8032
18/12/23 23:48:44 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
18/12/23 23:48:44 INFO input.FileInputFormat: Total input files to process : 1
18/12/23 23:48:44 INFO mapreduce.JobSubmitter: number of splits:1
18/12/23 23:48:45 INFO Configuration.deprecation: yarn.resourcemanager.system-metrics-publisher.enabled is deprecated. Instead, use yarn.system-metrics-publisher.enabled
18/12/23 23:48:45 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1545579497045_0001
18/12/23 23:48:46 INFO impl.YarnClientImpl: Submitted application application_1545579497045_0001
18/12/23 23:48:46 INFO mapreduce.Job: The url to track the job: http://centos-aaron-h1:8088/proxy/application_1545579497045_0001/
18/12/23 23:48:46 INFO mapreduce.Job: Running job: job_1545579497045_0001
18/12/23 23:48:59 INFO mapreduce.Job: Job job_1545579497045_0001 running in uber mode : false
18/12/23 23:48:59 INFO mapreduce.Job:  map 0% reduce 0%
18/12/23 23:49:14 INFO mapreduce.Job:  map 100% reduce 0%
18/12/23 23:49:14 INFO mapreduce.Job: Job job_1545579497045_0001 completed successfully
18/12/23 23:49:15 INFO mapreduce.Job: Counters: 31
        File System Counters
                FILE: Number of bytes read=0
                FILE: Number of bytes written=196971
                FILE: Number of read operations=0
                FILE: Number of large read operations=0
                FILE: Number of write operations=0
                HDFS: Number of bytes read=61826403
                HDFS: Number of bytes written=9735615
                HDFS: Number of read operations=3
                HDFS: Number of large read operations=0
                HDFS: Number of write operations=2
        Job Counters 
                Launched map tasks=1
                Data-local map tasks=1
                Total time spent by all maps in occupied slots (ms)=11274
                Total time spent by all reduces in occupied slots (ms)=0
                Total time spent by all map tasks (ms)=11274
                Total vcore-milliseconds taken by all map tasks=11274
                Total megabyte-milliseconds taken by all map tasks=11544576
        Map-Reduce Framework
                Map input records=100064
                Map output records=85730
                Input split bytes=154
                Spilled Records=0
                Failed Shuffles=0
                Merged Map outputs=0
                GC time elapsed (ms)=356
                CPU time spent (ms)=3680
                Physical memory (bytes) snapshot=114851840
                Virtual memory (bytes) snapshot=846995456
                Total committed heap usage (bytes)=16556032
        malformed
                malformedline=1
        File Input Format Counters 
                Bytes Read=61826249
        File Output Format Counters 
                Bytes Written=9735615
[hadoop@centos-aaron-h1 ~]$

    6、運行結果

[hadoop@centos-aaron-h1 ~]$hdfs dfs -cat /en/log.dat
1374609375.94   1374609375.95   1374609375.99   1374609378.08   110     362     8618841213864   460078412124864 3562060505359300        2       460     0       14163                   34173   10.13.13.240    211.142.196.65  35681   80cmnet    1       221.177.157.97  221.177.152.242 221.177.157.97  221.177.152.242 m.qpic.cn       http://m.qpic.cn/psb?/V11R7ZV52NVvz8/Id2ZLgBh*B.Lp9DoQNZSRtvekhZJegpqEqmpUZxKNdQ!/m/dCvwjYhxGAAA&bo=WAIgAwAAAAABAF4!    m.qpic.cn       android-qzone      GET     200     705     11884   10      9       0       0       10      9       0       0       0       0       http://m.qpic.cn/psb?/V11R7ZV52NVvz8/Id2ZLgBh*B.Lp9DoQNZSRtvekhZJegpqEqmpUZxKNdQ!/m/dCvwjYhxGAAA&bo=WAIgAwAAAAABAF4!       5903901810496765953     5903902282558234625     1495263367      somecontent
1374609386.06   1374609386.07   1374609406.25   1374609406.88   110     362     8618841213864   460078412124864 3562060505359300        2       460     0       14163                   34173   10.13.13.240    211.142.196.65  35681   80cmnet    1       221.177.157.97  221.177.152.242 221.177.157.97  221.177.152.242 m.qpic.cn       http://m.qpic.cn/psb?/718552d9-4af9-4d55-a311-821644911cf9/EwL66FxlQW2lla.SsNEdThCNFPRNB3dvPLlY6KjwoOw!/m/dLw974fOIgAA&bo=rAK7AawCuwEKACw!m.qpic.cn        android-qzone   GET     200     2008    20432   16      18      0       0       16      18      3       8       0       0       http://m.qpic.cn/psb?/718552d9-4af9-4d55-a311-821644911cf9/EwL66FxlQW2lla.SsNEdThCNFPRNB3dvPLlY6KjwoOw!/m/dLw974fOIgAA&bo=rAK7AawCuwEKACw! 5903901810496765953     5903902282558234625     1495263367      somecontent
1374609407.48   1374609407.49   1374609410.03   1374609411.86   110     362     8618841213864   460078412124864 3562060505359300        2       460     0       14163                   34173   10.13.13.240    211.142.196.65  35681   80cmnet    1       221.177.157.97  221.177.152.242 221.177.157.97  221.177.152.242 m.qpic.cn       http://m.qpic.cn/psb?/V116gGbo2r9QdU/gJda1CbWinnZjDb0ULQmuotKjtTGbEANVjOEWTVA4lk!/m/dCijJ6JkHwAA&bo=kAFYApABWAIBACc!    m.qpic.cn       android-qzone      GET     200     1061    19785   18      16      0       0       18      16      0       0       0       0       http://m.qpic.cn/psb?/V116gGbo2r9QdU/gJda1CbWinnZjDb0ULQmuotKjtTGbEANVjOEWTVA4lk!/m/dCijJ6JkHwAA&bo=kAFYApABWAIBACc!       5903901810496765953     5903902282558234625     1495263367      somecontent
1374609411.98   1374609411.99   1374609412.01   1374609413.90   110     362     8618841213864   460078412124864 3562060505359300        2       460     0       14163                   34173   10.13.13.240    211.142.196.65  35681   80cmnet    1       221.177.157.97  221.177.152.242 221.177.157.97  221.177.152.242 m.qpic.cn       http://m.qpic.cn/psu?/09b59a9b-e7ba-41da-95aa-1450231724ff/3qGX0RZSUjqreuCQAXieiNMmUGuIgxw2H*qv3IMxxSo!/m/YWGsShr0nAAAYql.Phq0KAAA      m.qpic.cn  android-qzone   GET     200     559     7738    6       6       0       0       6       6       0       0       0       0       http://m.qpic.cn/psu?/09b59a9b-e7ba-41da-95aa-1450231724ff/3qGX0RZSUjqreuCQAXieiNMmUGuIgxw2H*qv3IMxxSo!/m/YWGsShr0nAAAYql.Phq0KAAA 5903901810496765953     5903902282558234625     1495263367      somecontent
1374609414.02   1374609414.03   1374609414.05   1374609416.90   110     362     8618841213864   460078412124864 3562060505359300        2       460     0       14163                   34173   10.13.13.240    211.142.196.65  35681   80cmnet    1       221.177.157.97  221.177.152.242 221.177.157.97  221.177.152.242 m.qpic.cn       http://m.qpic.cn/psu?/09b59a9b-e7ba-41da-95aa-1450231724ff/2Kd9C9EeOGYlnXW9rsTQ6gu4l4q**vsZZGkV8hahmas!/m/YXx*QRq*fQAAYpxtyyDIAQAA&     m.qpic.cn  android-qzone   GET     200     600     8304    7       6       0       0       7       6       0       0       0       0       http://m.qpic.cn/psu?/09b59a9b-e7ba-41da-95aa-1450231724ff/2Kd9C9EeOGYlnXW9rsTQ6gu4l4q**vsZZGkV8hahmas!/m/YXx*QRq*fQAAYpxtyyDIAQAA&        5903901810496765953     5903902282558234625     1495263367      somecontent
1374609417.14   1374609417.15   1374609417.18   1374609419.12   110     362     8618841213864   460078412124864 3562060505359300        2       460     0       14163                   34173   10.13.13.240    211.142.196.65  35681   80cmnet    1       221.177.157.97  221.177.152.242 221.177.157.97  221.177.152.242 m.qpic.cn       http://m.qpic.cn/psu?/09b59a9b-e7ba-41da-95aa-1450231724ff/s54jR1c6WFenrs9ioiAvm*b.G28MlRL6XNsy0oG.qwg!/m/Yd0aQiB2EQAAYnt7PSCQIAAA      m.qpic.cn  android-qzone   GET     200     559     8072    6       6       0       0       6       6       0       0       0       0       http://m.qpic.cn/psu?/09b59a9b-e7ba-41da-95aa-1450231724ff/s54jR1c6WFenrs9ioiAvm*b.G28MlRL6XNsy0oG.qwg!/m/Yd0aQiB2EQAAYnt7PSCQIAAA 5903901810496765953     5903902282558234625     1495263367      somecontent
1374609470.46   1374609470.47   1374609470.51   1374609473.60   110     362     8618841213864   460078412124864 3562060505359300        2       460     0       14163                   34173   10.13.13.240    211.142.196.65  35681   80cmnet    1       221.177.157.97  221.177.152.242 221.177.157.97  221.177.152.242 m.qpic.cn       http://m.qpic.cn/psb?/V14eT0x12SEIJe/x2xkBFLgD6ye9Kqe2vQwmk*29qLlZTq.ldCu6ZvcoW4!/m/dEmfu6KFIAAA&bo=uAFKArgBSgIBACc!    m.qpic.cn       android-qzone      GET     200     705     11551   10      9       0       0       10      9       0       0       0       0       http://m.qpic.cn/psb?/V14eT0x12SEIJe/x2xkBFLgD6ye9Kqe2vQwmk*29qLlZTq.ldCu6ZvcoW4!/m/dEmfu6KFIAAA&bo=uAFKArgBSgIBACc!       5903901810496765953     5903902282558234625     1495263367      somecontent
1374609473.80   1374609473.81   1374609473.82   1374609475.40   110     362     8618841213864   460078412124864 3562060505359300        2       460     0       14163                   34173   10.13.13.240    211.142.196.65  35681   80cmnet    1       221.177.157.97  221.177.152.242 221.177.157.97  221.177.152.242 m.qpic.cn       http://m.qpic.cn/psb?/V14eT0x12SEIJe/SPTUGBbUW2b8TYVoPwFdaM*m0jnisYKBjhs2*qMqWb0!/m/dH8ekqHhHgAA&bo=uAFKAbgBSgEBACc!    m.qpic.cn       android-qzone      GET     200     465     5664    4       5       0       0       4       5       0       0       0       0       http://m.qpic.cn/psb?/V14eT0x12SEIJe/SPTUGBbUW2b8TYVoPwFdaM*m0jnisYKBjhs2*qMqWb0!/m/dH8ekqHhHgAA&bo=uAFKAbgBSgEBACc!       5903901810496765953     5903902282558234625     1495263367      somecontent
1374609475.58   1374609475.59   1374609475.61   1374609477.02   110     362     8618841213864   460078412124864 3562060505359300        2       460     0       14163                   34173   10.13.13.240    211.142.196.65  35681   80cmnet    1       221.177.157.97  221.177.152.242 221.177.157.97  221.177.152.242 m.qpic.cn       http://m.qpic.cn/psb?/V14eT0x12SEIJe/DTXJMtasqR8G3ixySNM7z9h9JqR9JI3n8SLjpqKj6T4!/m/dHpq86AjIgAA&bo=uAFKAbgBSgEBACc!    m.qpic.cn       android-qzone      GET     200     505     6009    5       5       0       0       5       5       0       0       0       0       http://m.qpic.cn/psb?/V14eT0x12SEIJe/DTXJMtasqR8G3ixySNM7z9h9JqR9JI3n8SLjpqKj6T4!/m/dHpq86AjIgAA&bo=uAFKAbgBSgEBACc!       5903901810496765953     5903902282558234625     1495263367      somecontent
1374609477.18   1374609477.19   1374609477.21   1374609480.38   110     362     8618841213864   460078412124864 3562060505359300        2       460     0       14163                   34173   10.13.13.240    211.142.196.65  35681   80cmnet    1       221.177.157.97  221.177.152.242 221.177.157.97  221.177.152.242 m.qpic.cn       http://m.qpic.cn/psb?/V11ecKOE0VsKsD/z.XrXqazeCWC.04HU*U*LTSKf5CPyorZOFdQm9euv3Y!/m/dOsjc5eiLgAA&bo=gALgAQAAAAABAEQ!    m.qpic.cn       android-qzone      GET     200     545     7400    6       6       0       0       6       6       0       0       0       0       http://m.qpic.cn/psb?/V11ecKOE0VsKsD/z.XrXqazeCWC.04HU*U*LTSKf5CPyorZOFdQm9euv3Y!/m/dOsjc5eiLgAA&bo=gALgAQAAAAABAEQ!       5903901810496765953     5903902282558234625     1495263367      somecontent
1374609480.56   1374609480.57   1374609480.60   1374609482.26   110     362     8618841213864   460078412124864 3562060505359300        2       460     0       14163                   34173   10.13.13.240    211.142.196.65  35681   80cmnet    1       221.177.157.97  221.177.152.242 221.177.157.97  221.177.152.242 m.qpic.cn       http://m.qpic.cn/psb?/V11ecKOE0VsKsD/aN827RzQs8kJlfB29EiToksb9qDO.xTfqrOE.Wet*Hw!/m/dO.vdJfnLwAA&bo=gALgAQAAAAABAEQ!    m.qpic.cn       android-qzone      GET     200     545     9028    6       7       0       0       6       7       0       0       0       0       http://m.qpic.cn/psb?/V11ecKOE0VsKsD/aN827RzQs8kJlfB29EiToksb9qDO.xTfqrOE.Wet*Hw!/m/dO.vdJfnLwAA&bo=gALgAQAAAAABAEQ!       5903901810496765953     5903902282558234625     1495263367      somecontent
1374609483.20   1374609483.21   1374609483.24   1374609485.44   110     362     8618841213864   460078412124864 3562060505359300        2       460     0       14163                   34173   10.13.13.240    211.142.196.65  35681   80cmnet    1       221.177.157.97  221.177.152.242 221.177.157.97  221.177.152.242 m.qpic.cn       http://m.qpic.cn/psb?/V11ecKOE0VsKsD/UfNHygCUHpfb.ZsBmHqAyP3nsMMY9xR55ojma50jxec!/m/dEid2pbqKAAA&bo=gALgAQAAAAABAEQ!    m.qpic.cn       android-qzone      GET     200     585     8554    7       7       0       0       7       7       0       0       0       0       http://m.qpic.cn/psb?/V11ecKOE0VsKsD/UfNHygCUHpfb.ZsBmHqAyP3nsMMY9xR55ojma50jxec!/m/dEid2pbqKAAA&bo=gALgAQAAAAABAEQ!       5903901810496765953     5903902282558234625     1495263367      somecontent
1374609485.62   1374609485.63   1374609485.66   1374609487.84   110     362     8618841213864   460078412124864 3562060505359300        2       460     0       14163                   34173   10.13.13.240    211.142.196.65  35681   80cmnet    1       221.177.157.97  221.177.152.242 221.177.157.97  221.177.152.242 m.qpic.cn       http://m.qpic.cn/psb?/V11ecKOE0VsKsD/soj.ZYORrztQVszHsGPn0ZjXaP8L9hbwsSWGKUUPt9Y!/m/dM.X2pb.KQAA&bo=gALgAQAAAAABAEQ!    m.qpic.cn       android-qzone      GET     200     545     9332    6       7       0       0       6       7       0       0       0       0       http://m.qpic.cn/psb?/V11ecKOE0VsKsD/soj.ZYORrztQVszHsGPn0ZjXaP8L9hbwsSWGKUUPt9Y!/m/dM.X2pb.KQAA&bo=gALgAQAAAAABAEQ!       5903901810496765953     5903902282558234625     1495263367      somecontent
1374609488.02   1374609488.03   1374609488.06   1374609489.98   110     362     8618841213864   460078412124864 3562060505359300        2       460     0       14163                   34173   10.13.13.240    211.142.196.65  35681   80cmnet    1       221.177.157.97  221.177.152.242 221.177.157.97  221.177.152.242 m.qpic.cn       http://m.qpic.cn/psb?/V11ecKOE0VsKsD/g5DqmV0u92.x*CO5pg2QTr55tqN6TXM8hGe6*2hSGtY!/m/dMTLEJWENAAA&bo=gALgAQAAAAABAEQ!    m.qpic.cn       android-qzone      GET     200     545     9151    6       7       0       0       6       7       0       0       0       0       http://m.qpic.cn/psb?/V11ecKOE0VsKsD/g5DqmV0u92.x*CO5pg2QTr55tqN6TXM8hGe6*2hSGtY!/m/dMTLEJWENAAA&bo=gALgAQAAAAABAEQ!       5903901810496765953     5903902282558234625     1495263367      somecontent
1374609501.12   1374609501.13   1374609502.91   1374609505.50   110     362     8618841213864   460078412124864 3562060505359300        2       460     0       14163                   34173   10.13.13.240    211.142.196.65  35681   80cmnet    1       221.177.157.97  221.177.152.242 221.177.157.97  221.177.152.242 m.qpic.cn       http://m.qpic.cn/psb?/V13gplZ20cRvyn/FGcdGR7FwhuOabarVYHnGK5qhJ42GEKjWrFEd70zQZw!/m/dGyIb5q*BAAA&bo=EAEnARABJwEBACc!    m.qpic.cn       android-qzone      GET     200     785     14556   12      11      0       0       12      11      0       0       0       0       http://m.qpic.cn/psb?/V13gplZ20cRvyn/FGcdGR7FwhuOabarVYHnGK5qhJ42GEKjWrFEd70zQZw!/m/dGyIb5q*BAAA&bo=EAEnARABJwEBACc!       5903901810496765953     5903902282558234625     1495263367      somecontent
1374609547.42   1374609547.43   1374609547.44   1374609549.00   110     362     8618841213864   460078412124864 3562060505359300        2       460     0       14163                   34173   10.13.13.240    211.142.196.65  35681   80cmnet    1       221.177.157.97  221.177.152.242 221.177.157.97  221.177.152.242 m.qpic.cn       http://m.qpic.cn/psb?/V11wxP2v0hGwaf/m7u6bFuzkfJ6GKXhBV8I390AS127QVsOd8o.Jr1Yagw!/m/dNhgOpalLAAA&bo=gAJVAwAAAAABAPM!    m.qpic.cn       android-qzone      GET     200     1540    5450    8       7       0       0       8       7       3       0       0       0       http://m.qpic.cn/psb?/V11wxP2v0hGwaf/m7u6bFuzkfJ6GKXhBV8I390AS127QVsOd8o.Jr1Yagw!/m/dNhgOpalLAAA&bo=gAJVAwAAAAABAPM!       5903901810496765953     5903902282558234625     1495263367      somecontent
1374609549.14   1374609549.15   1374609549.17   1374609551.20   110     362     8618841213864   460078412124864 3562060505359300        2       460     0       14163                   34173   10.13.13.240    211.142.196.65  35681   80cmnet    1       221.177.157.97  221.177.152.242 221.177.157.97  221.177.152.242 m.qpic.cn       http://m.qpic.cn/psb?/V12TDrbk3A3vMJ/aKcud*PHBeeGA6.xo4OMEipBTW3SyWZHFLb1N0ABA2s!/m/dHA8UcJ8JQAA&bo=5gGIAgAAAAABAcat: Filesystem closed
[hadoop@centos-aaron-h1 ~]$
[hadoop@centos-aaron-h1 ~]$  hdfs dfs -cat /crw/url.dat |more 
http://m.baidu.com/static/tf/nopic.gif?r=1374609403508&tj=alaxs&ftj=xschp_normal_5_0_10&hasRp=1&ac=nextgp       tocrawl
http://m.baidu.com/static/tf/nopic.gif?r=1374609444479&tj=alaxs&ftj=xschp_normal_5_0_10&hasRp=1&ac=nextgp       tocrawl
http://m.nuomi.com/client/push/list?cid=2000010000&devid=358059043449333&manufacturer=samsung&version=3.0.0&client=android&loc=MTEzLjY1NDQ0OTQ2Mjg5MDYzLDM0LjgwOTE4MTIxMzM3ODkwNg%3D%3D&uuid=ffffffff-db69-afec-4b6f-b9ab3c02bbbc&cityid=20
00010000&model=GT-I9228&userid=1345361458243220&channel=wooboo06.d8&release=2.3.6&mac=NTA6Q0M6Rjg6QTQ6MzQ6RjI%3D        tocrawl
http://m.baidu.com/ssid=0/from=2001m/bd_page_type=1/uid=1ADA21F27A014F2180A7E22E8BEE35B9/pu=usm%402%2Csz%401321_1003%2Cta%40utouch_2_2.3_1_9.0/img?tn=bdwis&word=%E6%A8%B1%E4%BA%95%E8%8E%89%E4%BA%9A%E7%94%B5%E5%BD%B1%E6%88%AA%E5%9B%BE&p
n=0&dw=w320&bs=176_208&pos=0&pinf=12_6_0_@bdwis_@av%E7%94%B5%E5%BD%B1%E9%AB%98%E6%BD%AE%E6%88%AA%E5%9B%BE_@176_208_@w320&fm=rs2&sp=&mid=w320    tocrawl
http://api.app.yiche.com/webapi/reviewtopic.ashx?op=get&topicid=166025  tocrawl
http://hm.baidu.com/hm.gif?si=b7723ac5ec07c308ac1ddf314523c2b0&et=0&nv=0&st=4&lt=1374609378&su=http%3A%2F%2Ffang.xinzheng.cc%2F&u=http%3A%2F%2Fwww.xinzheng.cc%2Ffangwu%2Fxiezilou&v=tc-1.0&rnd=2142219798      tocrawl
http://az.tpwap.cn/config.jsp?pos=448&clientId=435&sid=c2_1.1.1 tocrawl
better01.sinaapp.com    tocrawl
better01.sinaapp.com    tocrawl
http://af.upsdk.com/af/appActive?user_agent=Lenovo+A590&project=Lianxiang_Lenovo_Lenovo+A590&af_channel=Lianxiang&af_project=LXF_AG790_A01&af_version_code=198401&imei=D9B97DEA530324F235E73ABFA2CE003A&sid=1374609488731&brand=Lenovo&veri
fy_code=0ac0ba8ecb2a5455a052fa445c1937d5&encrpytion=DES&wifi_mac=0000000000000000&bt_mac=0000000000000000&cpu_serial=0000000000000000   tocrawl
http://af.upsdk.com/af/afPush?user_agent=Lenovo+A590&project=Lianxiang_Lenovo_Lenovo+A590&af_project=LXF_AG790_A01&af_channel=Lianxiang&af_version_code=198401&imei=D9B97DEA530324F235E73ABFA2CE003A&imsi=FD8CD80070CD6E0F71D8FAC3B8BF4F73&
sdk_version=16&sdk_name=4.1.1&af_version=3.7.1&sid=1374609488711&brand=Lenovo&verify_code=838773006a2a84415a64dea4aa117113&encrpytion=DES&lac=14196&cid=57917&wifi_mac=0000000000000000&bt_mac=0000000000000000&cpu_serial=0000000000000000
        tocrawl
http://app.wapx.cn/action/push/api_ad?app_id=24eeb27a1f5032e40b4561317f5f460c&udid=354096050843721&imsi=460003891750725&net=&app_version=1.0.7&sdk_version=1.5.2&device_name=GT-S7562i&y=6a5434e15a787d6791343c83523df81b&device_type=andro
id&os_version=4.0.4&country_code=CN&language=zh&act=dangerb.game.llk.ReceiverRestrictedContext&channel=WAPS&device_width=480&device_height=800&at=1374600352535 tocrawl
http://q.qlogo.cn/headimg_dl?bs=qq&dst_uin=1492849784&src_uin=1665118942&fid=1492849784&spec=100&referer=mqq&term_type=pc&host=1&sign=0C82F9B2A7109F7C2C058C41FE6CB7810AC63C710B4A3474&rsp_type=img     tocrawl
better01.sinaapp.com    tocrawl
http://switching.atm.punchbox.org/v1/?appid=16252171-7DE8-159E-F72B-9A58CFD2D08D&ver=7.1.2      tocrawl
http://q1.qlogo.cn/g?b=mqq&k=S8TokNAI2eSw6kynuB2gjA&t=1374608380&refer=mqq&s=40 tocrawl
http://launchermsg.3g.cn/golaunchermsg/msgservice.do?funid=1&rd=-3535723825567048133    tocrawl
http://q.qlogo.cn/headimg_dl?bs=qq&dst_uin=2569887365&src_uin=1665118942&fid=2569887365&spec=100&referer=mqq&term_type=pc&host=1&sign=5D6B95E75ACFC11355D4542D151A876C72D62301FCAEBFD3&rsp_type=img     tocrawl
better01.sinaapp.com    tocrawl
http://app.adsofts.cn/action/connect/active?app_id=3fe23c1a9f592ec42abf6a7d012841ba&udid=865662012462439&imsi=460003778631834&net=cmnet&base=adsofts.cn&app_version=3.7&sdk_version=1.6.10&device_name=TE600+&device_brand=Ctyon&y=2d81b1d1
1e11ae7de129d3cce29a958c&device_type=android&os_version=4.1.2&country_code=CN&language=zh&cid=83mdas8k2mj0g70r4b0b68ko9kgr2dk6&act=com.androidemu.harvechise.ReceiverRestrictedContext&channel=gfan&device_width=320&device_height=480&at=1
374609806979    tocrawl
better01.sinaapp.com    tocrawl
http://mb.hd.sohu.com.cn/mc.gif?uid=ff05360fb8fc15f1eb07c84904a9f863&url=1002&passport=&mtype=6&ltype=&cv=2.8.1&mos=2&mosv=4.2.2&pro=1&mfo=BBK&mfov=vivo%20Xplay&webtype=2G&vid=&time=1374609536953&memo=0&type=1&channelid=91&value=&sim=1
&playlistid=&catecode=&preid=&newuser=0&enterid=0&startid=1374609536961&loc=    tocrawl
http://launchermsg.3g.cn/golaunchermsg/msgservice.do?funid=1&rd=-3535723825567048133    tocrawl
http://wap.baidu.com/bd_page_type=1/pu=usm%400%2Csz%401330%5F640%2Cta%40big%5F%5F5%2E0%5F3%5F525/uid=FC69E3DEB768362786105AE0F78D77A7/t=wap/w=0_10_%E6%9E%81%E5%93%81%E5%A5%B3%E4%BB%99%E4%B9%A6%E5%8C%85%E7%BD%91%E9%98%85%E8%AF%BB/ssid=0
/from=128g/l=0/tc?func=nextp&pi=3&m=0&pn=15&src=http%3A%2F%2Fwww%2Ebookbao%2Ecom%2Fviews%2F201306%2F05%2Fid%5FXMzI4NDU2%5F14%2Ehtml     tocrawl
http://q.qlogo.cn/headimg_dl?bs=qq&dst_uin=2251204892&src_uin=1665118942&fid=2251204892&spec=100&referer=mqq&term_type=pc&host=1&sign=CFAFF94D0282A4B4EBC81DBDB307CAD166B756E7433FC920&rsp_type=img     tocrawl
http://r3.11222.cn/resource/bookclub/cover/95/84/2011120917151262.jpg   tocrawl
http://m.i.ppsrc.com/d/82C09949DDA26192C2170EF4CB0A0084/640/640 tocrawl
http://app.wapx.cn/action/push/api_ad?app_id=24eeb27a1f5032e40b4561317f5f460c&udid=358864040778026&imsi=460003827213968&net=cmnet&app_version=1.0.2&sdk_version=1.5.2&device_name=GT-P6200&y=f34b836872c220f87c39d6bf6afeb123&device_type=a
ndroid&os_version=3.2&country_code=CN&language=zh&act=dangerb.game.llk.ReceiverRestrictedContext&channel=Samsung&device_width=600&device_height=976&at=1374609799140    tocrawl
better01.sinaapp.com    tocrawl
http://psb.lenovomm.com/pushservice/2.1/poll?lpsst=B5AAAAAAXEPJgCAAAADAAAAUANHH1VAA0vAFhwaWQ9JmRpZD1PRFkyTlRFMk1ERTJOVE0yT1RRMiZzaWQ9TVVKRVJrTTNORFl6UlVFeVFUUXpOell5TVRFek56VkJSa1ExUTBWR056UXgmZHQ9YVcxbGFRK4GzrB5hOEri5Cx08JrOrw&ack=R1:
2106551173&min=180&max=320      tocrawl
http://api.app.yiche.com/webapi/reviewtopic.ashx?op=get&serialid=2573&level=1&pageindex=2&pagesize=20   tocrawl
http://q.qlogo.cn/headimg_dl?bs=qq&dst_uin=2215216651&src_uin=1157251974&fid=2215216651&spec=100&referer=mqq&term_type=pc&host=1&sign=08F0E013042DA3DB50682F97BFBD852A13558806FDF2A2E3&rsp_type=img     tocrawl
http://q.qlogo.cn/headimg_dl?bs=qq&dst_uin=306486557&src_uin=1157251974&fid=306486557&spec=100&referer=mqq&term_type=pc&host=1&sign=08F0E013042DA3DB50682F97BFBD852A13558806FDF2A2E3&rsp_type=img       tocrawl
http://psb.lenovomm.com/pushservice/2.1/poll?lpsst=B5AAAAAAIyZAkCAAAADAAAAT_pzFv2AA0vAFhwaWQ9JmRpZD1PRFk1TkRVNU1ERXlNVE16TWpFNSZzaWQ9UmpZNVJEa3hRMFkxTWpKQk56WXpNelkwUWpOQ1FUQkNSamhGTkRNMFJEY3gmZHQ9YVcxbGFRGaRu5QbEjFyJDcKU6sGCSg&ack=R1:
2005075059&min=300&max=300      tocrawl
http://api.changba.com/ktvbox.php?ac=pullnotice&macaddress=1C%3A66%3AAA%3A1A%3AF9%3A56&channelsrc=changba_A&deviceid=1C%3A66%3AAA%3A1A%3AF9%3A56&version=2.3.0&seret=f3f1470646&_userinfo=796   tocrawl
http://q3.qlogo.cn/g?b=mqq&k=9FqISKXvCKCf6sPAxeu8yA&t=1374446924&refer=mqq&s=100        tocrawl
http://q4.qlogo.cn/g?b=mqq&k=puGXojn2c7cOCeWibOce6yw&t=1374373655&refer=mqq&s=100       tocrawl
http://q1.qlogo.cn/g?b=mqq&k=ZRw4QCD4IoG0SsepbqNUfQ&t=1370823532&refer=mqq&s=100        tocrawl
http://a126.photo.store.qq.com/psb?/V10Z7w8y4FYPqA/zzUERobf01h9PPi7s*.q7SkeQsyEm0KTu1EfAYWWZ94!/a/dHi5G0t8JgAA&bo=yAALAQAAAAABAOU!      tocrawl
[hadoop@centos-aaron-h1 ~]$

    7、注意事項

           mapreduce代碼打包的時候注意需要將mysql的驅動Jar包一起打進去,可以使用maven插件,或者使用開發工具選取lib文件打包;

          最後寄語,以上是博主本次文章的全部內容,如果大家覺得博主的文章還不錯,請點贊;如果您對博主其它服務器大數據技術或者博主本人感興趣,請關注博主博客,並且歡迎隨時跟博主溝通交流。

相關文章
相關標籤/搜索