需求描述:
從 HDFS 中獲取數據,字段 url 需要計算出 url_type;通過 Hive 的 left outer join 來實現,效率非常低。故將 url 的類型導入到 HBase 中,利用 HBase 快速查詢的特點,結合 MapReduce 進行字段打標。
剛開始的 MapReduce 程序如下:
package com.bonc.db;

import java.io.IOException;
import java.nio.charset.Charset;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.HTablePool;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.bonc.URLMatch.HBaseMain;

/**
 * First version of the URL-tagging job: the mapper builds a composite key from
 * pipe-delimited input lines and the reducer issues one HBase {@code Get} per
 * key (falling back to a second {@code Get}) to resolve the URL type.
 */
public class DWA_S_D_USE_MB_COUNT_BASE2 {

    /** Charset for every String/byte[] conversion, so results do not depend on the platform default. */
    private static final Charset UTF8 = Charset.forName("UTF-8");

    /**
     * Configures and runs the job.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @throws Exception if job submission or execution fails
     */
    public static void main(String args[]) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: DWA_S_D_USE_MB_COUNT_BASE2 <input path> <output path>");
            System.exit(2);
        }
        Configuration conf = new Configuration();
        Job job = new Job(conf, "DWA_S_D_USE_MB_COUNT_BASE");
        job.setJarByClass(DWA_S_D_USE_MB_COUNT_BASE2.class);
        job.setMapperClass(DataCleanMapper.class);
        job.setReducerClass(DataCleanReduce.class);
        job.setNumReduceTasks(150);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /**
     * Splits each input line on {@code |} and emits a composite key plus a short
     * value carrying the fields the reducer needs for its HBase lookups.
     */
    public static class DataCleanMapper extends
            Mapper<LongWritable, Text, Text, Text> {

        /** Same expression the original {@code isNum} used, compiled once instead of per call. */
        private static final Pattern NUMBER = Pattern
                .compile("^[-+]?(([0-9]+)([.]([0-9]+))?|([.]([0-9]+))?)$");

        /**
         * Emits one record per input line that has at least 17 fields.
         *
         * @param key     input offset (unused)
         * @param value   one pipe-delimited input line
         * @param context sink for the (key, value) pair
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] strs = value.toString().split("\\|");
            // The key needs field 16. Shorter lines used to be written with an
            // empty key (and possibly an empty value), which only produced
            // garbage rows downstream, so they are skipped.
            if (strs.length <= 16) {
                return;
            }
            String url = "NULL";
            if (strs.length > 25) {
                ParesURL pu = new ParesURL();
                url = pu.execute(strs[25], "HOST");
            }
            String keys = strs[0] + "|" + strs[1] + "|" + strs[2] + "|" + strs[3]
                    + "|" + strs[4] + "|" + use_seg(strs[5]) + "|"
                    + strs[11] + "|" + strs[16] + "|" + url + "|" + strs[7]
                    + "|" + strs[8] + "|" + strs[9] + "|" + strs[10] + "|";
            String values = url + "|" + strs[13] + "|" + strs[15] + "|" + "1";
            context.write(new Text(keys), new Text(values));
        }

        /**
         * Extracts characters 11-12 of {@code start_date} when they form an hour
         * between 00 and 23.
         *
         * @param start_date raw field; must be longer than 23 characters to be considered
         * @return the two-digit substring, or {@code "**"} when the input is
         *         null, too short, or those characters are not two ASCII digits
         *         in the range 00-23
         */
        public String use_seg(String start_date) {
            if (start_date == null || start_date.length() <= 23) {
                return "**";
            }
            String hour = start_date.substring(11, 13);
            char tens = hour.charAt(0);
            char ones = hour.charAt(1);
            // Check the digits directly: the previous isNum() gate also accepted
            // strings such as ".5" (Integer.parseInt then threw) and "-0"/"+1"
            // (which were returned verbatim as the "hour").
            if (tens < '0' || tens > '9' || ones < '0' || ones > '9') {
                return "**";
            }
            int parsed = (tens - '0') * 10 + (ones - '0');
            return parsed <= 23 ? hour : "**";
        }

        /**
         * Tests {@code str} against the numeric pattern (optional sign, digits,
         * optional fraction). Note that the pattern also matches the empty string.
         *
         * @param str text to test; must not be null
         * @return whether the whole string matches the pattern
         */
        public static boolean isNum(String str) {
            return NUMBER.matcher(str).matches();
        }
    }

    /**
     * Resolves the type for each key with one HBase lookup on the first value
     * field and, when that yields nothing, a second lookup on the second field.
     */
    public static class DataCleanReduce extends Reducer<Text, Text, Text, Text> {
        private static final byte[] FAMILY = "123".getBytes(UTF8);
        private static final byte[] QUALIFIER = "456".getBytes(UTF8);

        private HTable table;

        /**
         * Writes {@code key -> type|field|1} for the group.
         *
         * @param arg0    composite key built by the mapper
         * @param arg1    values of the form {@code url|f13|f15|1}
         * @param context sink for the output pair
         */
        @Override
        protected void reduce(Text arg0, Iterable<Text> arg1, Context context)
                throws IOException, InterruptedException {
            String url = "NULL";
            String visitIP = "NULL";
            String value2 = "NULL";
            // Every value overwrites the previous one, so only the fields of
            // the last value in the group are used below.
            for (Text c : arg1) {
                String[] value = c.toString().split("\\|");
                if (value.length > 0) {
                    url = value[0];
                }
                if (value.length > 1) {
                    visitIP = value[1];
                }
                if (value.length > 2) {
                    value2 = value[2];
                }
            }
            String matchResult = urlMatch(url);
            if (matchResult.equals("NULL")) {
                matchResult = urlMatch(visitIP);
            }
            String output = matchResult + "|" + value2 + "|" + "1";
            context.write(arg0, new Text(output));
        }

        /** Closes the HBase table handle, if {@link #setup} managed to open one. */
        @Override
        protected void cleanup(Context context) throws IOException,
                InterruptedException {
            try {
                super.cleanup(context);
            } finally {
                if (table != null) {
                    table.close();
                }
            }
        }

        /**
         * Opens the lookup table using the configuration exposed by
         * {@code HBaseMain.conf}.
         */
        @Override
        protected void setup(Context context) throws IOException,
                InterruptedException {
            super.setup(context);
            // One handle per reducer is enough. The former 1000-slot HTablePool
            // was never closed, and casting its result to HTable fails on HBase
            // versions where the pool hands out a wrapper instead of an HTable.
            table = new HTable(HBaseMain.conf, "22222");
        }

        /**
         * Looks up {@code url} as a row key and returns the stored type.
         *
         * @param url row key to look up; {@code null}, empty and {@code "NULL"}
         *            are treated as "no key"
         * @return the cell value, or {@code "NULL"} when there is no key, no row,
         *         no such column, or the lookup failed with an I/O error
         */
        public String urlMatch(String url) {
            if (url == null || url.length() == 0 || url.equals("NULL")) {
                return "NULL";
            }
            try {
                Result ru = table.get(new Get(url.getBytes(UTF8)));
                if (!ru.isEmpty()) {
                    // getValue returns null when the row lacks this column.
                    byte[] cell = ru.getValue(FAMILY, QUALIFIER);
                    if (cell != null) {
                        return new String(cell, UTF8);
                    }
                }
            } catch (IOException e) {
                // Best effort by design: a failed lookup yields "NULL" instead
                // of failing the task, but the failing key is reported.
                System.err.println("HBase lookup failed for row '" + url + "'");
                e.printStackTrace();
            }
            return "NULL";
        }
    }
}
後來發現效率很低,主要是每一條數據都要訪問 HBase 並且進行隨機查詢,所以後來轉換方法:查詢時先將 row 組裝成 list,然後再批量查詢,時間幾乎是原來的一半。
改進後的代碼:
package com.bonc.db;

import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.HTablePool;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.Counters.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.bonc.URLMatch.HBaseMain;

/**
 * Batched version of the URL-tagging job: the mapper spreads records over
 * 10000 synthetic keys, and the reducer resolves each group with two
 * multi-row HBase {@code get} calls instead of one call per record.
 */
public class DWA_S_D_USE_MB_COUNT_BASE {

    /** Charset for every String/byte[] conversion, so results do not depend on the platform default. */
    private static final Charset UTF8 = Charset.forName("UTF-8");

    /**
     * Configures and runs the job.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @throws Exception if job submission or execution fails
     */
    public static void main(String args[]) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: DWA_S_D_USE_MB_COUNT_BASE <input path> <output path>");
            System.exit(2);
        }
        Configuration conf = new Configuration();
        Job job = new Job(conf, "DWA_S_D_USE_MB_COUNT_BASE2");
        job.setJarByClass(DWA_S_D_USE_MB_COUNT_BASE.class);
        job.setMapperClass(DataCleanMapper.class);
        job.setReducerClass(DataCleanReduce.class);
        job.setNumReduceTasks(150);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /**
     * Parses pipe-delimited lines and emits them under a rotating key
     * ({@code i % 10000}) so that each reduce group is a batch of records.
     */
    public static class DataCleanMapper extends
            Mapper<LongWritable, Text, Text, Text> {
        /** Not used anywhere in this class. */
        public static Counter ct = null;
        /** Running count of accepted records; drives the rotating output key. */
        public static long i = 0;

        /** Same expression the original {@code isNum} used, compiled once instead of per call. */
        private static final Pattern NUMBER = Pattern
                .compile("^[-+]?(([0-9]+)([.]([0-9]+))?|([.]([0-9]+))?)$");

        /**
         * Emits one record per input line that yields more than 25 fields.
         * The emitted value is the 13 key fields followed by
         * {@code url|field13|field15|i}, all {@code |}-separated.
         *
         * @param key     input offset (unused)
         * @param value   one pipe-delimited input line
         * @param context sink for the (key, value) pair
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // A trailing "|1" is appended because String.split drops trailing
            // empty fields; with it, a line whose last columns are empty still
            // yields all of its fields.
            // NOTE(review): for a line with exactly 25 columns the appended "1"
            // itself lands in strs[25] and is treated as the URL - confirm
            // whether such lines can occur in the input.
            String lines = value.toString() + "|" + "1";
            String[] strs = lines.split("\\|");
            if (strs.length <= 25) {
                return;
            }
            i++;
            ParesURL pu = new ParesURL();
            String url = pu.execute(EmptyParse(withScheme(strs[25])), "HOST");
            String keys = EmptyParse(strs[0]) + "|" + EmptyParse(strs[1]) + "|"
                    + EmptyParse(strs[2]) + "|" + EmptyParse(strs[3]) + "|"
                    + EmptyParse(strs[4]) + "|"
                    + EmptyParse(use_seg(strs[5])) + "|"
                    + EmptyParse(strs[11]) + "|" + EmptyParse(strs[16])
                    + "|" + EmptyParse(url) + "|" + EmptyParse(strs[7])
                    + "|" + EmptyParse(strs[8]) + "|" + EmptyParse(strs[9])
                    + "|" + EmptyParse(strs[10]) + "|";
            String values = EmptyParse(url) + "|" + EmptyParse(strs[13]) + "|"
                    + EmptyParse(strs[15]) + "|" + i;
            context.write(new Text(String.valueOf(i % 10000)), new Text(
                    keys + values));
        }

        /**
         * Prefixes {@code http://} unless the text already starts with
         * {@code http://} or {@code https://}. Previously an https URL was
         * turned into {@code http://https://...}.
         */
        private static String withScheme(String raw) {
            if (raw.startsWith("http://") || raw.startsWith("https://")) {
                return raw;
            }
            return "http://" + raw;
        }

        /**
         * Extracts characters 11-12 of {@code start_date} when they form an hour
         * between 00 and 23.
         *
         * @param start_date raw field; must be longer than 23 characters to be considered
         * @return the two-digit substring, or {@code "**"} when the input is
         *         null, too short, or those characters are not two ASCII digits
         *         in the range 00-23
         */
        public String use_seg(String start_date) {
            if (start_date == null || start_date.length() <= 23) {
                return "**";
            }
            String hour = start_date.substring(11, 13);
            char tens = hour.charAt(0);
            char ones = hour.charAt(1);
            // Check the digits directly: the previous isNum() gate also accepted
            // strings such as ".5" (Integer.parseInt then threw) and "-0"/"+1"
            // (which were returned verbatim as the "hour").
            if (tens < '0' || tens > '9' || ones < '0' || ones > '9') {
                return "**";
            }
            int parsed = (tens - '0') * 10 + (ones - '0');
            return parsed <= 23 ? hour : "**";
        }

        /**
         * Tests {@code str} against the numeric pattern (optional sign, digits,
         * optional fraction). Note that the pattern also matches the empty string.
         *
         * @param str text to test; must not be null
         * @return whether the whole string matches the pattern
         */
        public static boolean isNum(String str) {
            return NUMBER.matcher(str).matches();
        }

        /**
         * Replaces a missing field with the literal {@code "NULL"}.
         *
         * @param str field text, may be null
         * @return {@code "NULL"} for null or empty input, otherwise {@code str}
         */
        public static String EmptyParse(String str) {
            if (str == null || str.length() == 0) {
                return "NULL";
            }
            return str;
        }
    }

    /**
     * Resolves the type of every record in a group with two multi-row HBase
     * lookups: one keyed by field 13 (URL host) and one keyed by field 14.
     */
    public static class DataCleanReduce extends Reducer<Text, Text, Text, Text> {
        private static final byte[] FAMILY = "url_type".getBytes(UTF8);
        private static final byte[] QUALIFIER = "type".getBytes(UTF8);

        private HTable table;

        /**
         * Writes one {@code prefix -> type|field15|1} pair per record in the group.
         * All records of the group are buffered in memory before the lookups run.
         *
         * @param arg0    synthetic batch key from the mapper (not part of the output)
         * @param arg1    records of the form {@code f0|...|f12|url|f13|f15|i}
         * @param context sink for the output pairs
         */
        @Override
        protected void reduce(Text arg0, Iterable<Text> arg1, Context context)
                throws IOException, InterruptedException {
            List<Get> urlGets = new ArrayList<Get>();
            List<Get> ipGets = new ArrayList<Get>();
            List<String> prefixes = new ArrayList<String>();
            List<String> useragent = new ArrayList<String>();
            for (Text c : arg1) {
                String[] value = c.toString().split("\\|");
                // The first 13 fields, each followed by '|', form the output key.
                StringBuilder prefix = new StringBuilder();
                for (int k = 0; k < 13; k++) {
                    prefix.append(value[k]).append('|');
                }
                urlGets.add(new Get(value[13].getBytes(UTF8)));
                ipGets.add(new Get(value[14].getBytes(UTF8)));
                prefixes.add(prefix.toString());
                useragent.add(value[15]);
            }
            if (prefixes.isEmpty()) {
                return;
            }

            Result[] ru = table.get(urlGets);
            Result[] ri = table.get(ipGets);
            for (int n = 0; n < prefixes.size(); n++) {
                // Resolved afresh for every record. The previous code kept one
                // variable across iterations, so a record with no hit silently
                // inherited the type of the record before it.
                String type = typeOf(ru[n]);
                if (type == null) {
                    type = typeOf(ri[n]);
                }
                if (type == null) {
                    type = "NULL";
                }
                String reduceoutput = type + "|" + useragent.get(n) + "|" + "1";
                context.write(new Text(prefixes.get(n)), new Text(reduceoutput));
            }
        }

        /** Returns the type cell of a lookup result, or null when the row or the cell is absent. */
        private static String typeOf(Result row) {
            if (row == null || row.isEmpty()) {
                return null;
            }
            byte[] cell = row.getValue(FAMILY, QUALIFIER);
            return cell == null ? null : new String(cell, UTF8);
        }

        /** Closes the HBase table handle, if {@link #setup} managed to open one. */
        @Override
        protected void cleanup(Context context) throws IOException,
                InterruptedException {
            try {
                super.cleanup(context);
            } finally {
                if (table != null) {
                    table.close();
                }
            }
        }

        /**
         * Opens the lookup table using the configuration exposed by
         * {@code HBaseMain.conf}.
         */
        @Override
        protected void setup(Context context) throws IOException,
                InterruptedException {
            super.setup(context);
            // One handle per reducer is enough. The former 1000-slot HTablePool
            // was never closed, and casting its result to HTable fails on HBase
            // versions where the pool hands out a wrapper instead of an HTable.
            table = new HTable(HBaseMain.conf, "url_rule");
        }

        /**
         * Single-row lookup kept from the first version; {@link #reduce} does
         * not call it.
         *
         * @param url row key to look up; {@code null}, empty and {@code "NULL"}
         *            are treated as "no key"
         * @return the cell value, or {@code "NULL"} when there is no key, no row,
         *         no such column, or the lookup failed with an I/O error
         */
        public String urlMatch(String url) {
            if (url == null || url.length() == 0 || url.equals("NULL")) {
                return "NULL";
            }
            try {
                // The row key is the argument; the previous "123.getBytes()"
                // was not valid Java.
                Result ru = table.get(new Get(url.getBytes(UTF8)));
                if (!ru.isEmpty()) {
                    // NOTE(review): family "123123" / qualifier "123" differ
                    // from the url_type:type column read in reduce() - they
                    // look like placeholders; confirm before using this method.
                    byte[] cell = ru.getValue("123123".getBytes(UTF8),
                            "123".getBytes(UTF8));
                    if (cell != null) {
                        return new String(cell, UTF8);
                    }
                }
            } catch (IOException e) {
                // Best effort by design: a failed lookup yields "NULL" instead
                // of failing the task, but the failing key is reported.
                System.err.println("HBase lookup failed for row '" + url + "'");
                e.printStackTrace();
            }
            return "NULL";
        }
    }
}
在有限的資源下,可以激發一個人的創造力。用這句話作為總結吧。