【原創】大數據基礎之Flume(2)kudu sink

kudu中的flume sink代碼路徑:

https://github.com/apache/kudu/tree/master/java/kudu-flume-sink

 

kudu-flume-sink默認使用的producer是:

org.apache.kudu.flume.sink.SimpleKuduOperationsProducer

/**
 * Converts one Flume event into a single Kudu insert.
 *
 * <p>The event body is stored unmodified, as binary, in the column named by
 * {@code payloadColumn}; no other column of the row is populated here.
 *
 * @param event the Flume event whose body becomes the payload
 * @return a one-element list holding the insert operation
 * @throws FlumeException if building the insert fails for any reason
 */
public List<Operation> getOperations(Event event) throws FlumeException {
  try {
    Insert insertOp = table.newInsert();
    byte[] payload = event.getBody();
    insertOp.getRow().addBinary(payloadColumn, payload);
    return Collections.singletonList((Operation) insertOp);
  } catch (Exception e) {
    throw new FlumeException("Failed to create Kudu Insert object", e);
  }
}

是將消息直接存放到一個payload列中

 

如果想要支持json格式數據,需要二次開發

package com.cloudera.kudu; public class JsonKuduOperationsProducer implements KuduOperationsProducer {

網上已經有人共享出來代碼:https://cloud.tencent.com/developer/article/1158194

但是以上代碼有幾個不方便的地方:1)不允許null;2)對時間類型支持不好;3)所有的值必須是string,然後根據kudu中字段類型進行解析,在生成數據時需要注意,否則需要自行修改代碼;

 

針對以上不便修改後代碼如下:

JsonKuduOperationsProducer.java

package com.cloudera.kudu; import com.google.common.collect.Lists; import com.google.common.base.Preconditions; import org.apache.avro.data.Json; import org.json.JSONObject; import org.apache.flume.Context; import org.apache.flume.Event; import org.apache.flume.FlumeException; import org.apache.flume.annotations.InterfaceAudience; import org.apache.flume.annotations.InterfaceStability; import org.apache.kudu.ColumnSchema; import org.apache.kudu.Schema; import org.apache.kudu.Type; import org.apache.kudu.client.KuduTable; import org.apache.kudu.client.Operation; import org.apache.kudu.client.PartialRow; import org.apache.kudu.flume.sink.KuduOperationsProducer; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.nio.charset.Charset; import java.text.SimpleDateFormat; import java.util.List; import java.util.TimeZone; import java.util.function.Function; @InterfaceAudience.Public @InterfaceStability.Evolving public class JsonKuduOperationsProducer implements KuduOperationsProducer { private static final Logger logger = LoggerFactory.getLogger(JsonKuduOperationsProducer.class); private static final String INSERT = "insert"; private static final String UPSERT = "upsert"; private static final List<String> validOperations = Lists.newArrayList(UPSERT, INSERT); public static final String ENCODING_PROP = "encoding"; public static final String DEFAULT_ENCODING = "utf-8"; public static final String OPERATION_PROP = "operation"; public static final String DEFAULT_OPERATION = UPSERT; public static final String SKIP_MISSING_COLUMN_PROP = "skipMissingColumn"; public static final boolean DEFAULT_SKIP_MISSING_COLUMN = false; public static final String SKIP_BAD_COLUMN_VALUE_PROP = "skipBadColumnValue"; public static final boolean DEFAULT_SKIP_BAD_COLUMN_VALUE = false; public static final String WARN_UNMATCHED_ROWS_PROP = "skipUnmatchedRows"; public static final boolean DEFAULT_WARN_UNMATCHED_ROWS = true; private KuduTable table; private Charset charset; private String 
operation; private boolean skipMissingColumn; private boolean skipBadColumnValue; private boolean warnUnmatchedRows; public JsonKuduOperationsProducer() { } @Override public void configure(Context context) { String charsetName = context.getString(ENCODING_PROP, DEFAULT_ENCODING); try { charset = Charset.forName(charsetName); } catch (IllegalArgumentException e) { throw new FlumeException( String.format("Invalid or unsupported charset %s", charsetName), e); } operation = context.getString(OPERATION_PROP, DEFAULT_OPERATION).toLowerCase(); Preconditions.checkArgument( validOperations.contains(operation), "Unrecognized operation '%s'", operation); skipMissingColumn = context.getBoolean(SKIP_MISSING_COLUMN_PROP, DEFAULT_SKIP_MISSING_COLUMN); skipBadColumnValue = context.getBoolean(SKIP_BAD_COLUMN_VALUE_PROP, DEFAULT_SKIP_BAD_COLUMN_VALUE); warnUnmatchedRows = context.getBoolean(WARN_UNMATCHED_ROWS_PROP, DEFAULT_WARN_UNMATCHED_ROWS); } @Override public void initialize(KuduTable table) { this.table = table; } @Override public List<Operation> getOperations(Event event) throws FlumeException { String raw = new String(event.getBody(), charset); logger.info("get raw: " + raw); List<Operation> ops = Lists.newArrayList(); if(raw != null && !raw.isEmpty()) { JSONObject json = null; //just pass if it is not a json
            try { json = new JSONObject(raw); } catch (Exception e) { e.printStackTrace(); } if (json != null) { Schema schema = table.getSchema(); Operation op; switch (operation) { case UPSERT: op = table.newUpsert(); break; case INSERT: op = table.newInsert(); break; default: throw new FlumeException( String.format("Unrecognized operation type '%s' in getOperations(): " +
                                        "this should never happen!", operation)); } //just record the error event into log and pass
                try { PartialRow row = op.getRow(); for (ColumnSchema col : schema.getColumns()) { try { if (json.has(col.getName()) && json.get(col.getName()) != null) coerceAndSet(json.get(col.getName()), col.getName(), col.getType(), col.isKey(), col.isNullable(), col.getDefaultValue(), row); else if (col.isKey() || !col.isNullable()) throw new RuntimeException("column : " + col.getName() + " is null or not exists in " + row); } catch (NumberFormatException e) { String msg = String.format( "Raw value '%s' couldn't be parsed to type %s for column '%s'", raw, col.getType(), col.getName()); logOrThrow(skipBadColumnValue, msg, e); } catch (IllegalArgumentException e) { String msg = String.format( "Column '%s' has no matching group in '%s'", col.getName(), raw); logOrThrow(skipMissingColumn, msg, e); } } ops.add(op); } catch (Exception e) { logger.error("get error [" + e.getMessage() + "]: " + raw, e); } } } return ops; } protected <T> T getValue(T defaultValue, Object val, boolean isKey, boolean isNullable, Object columnDefaultValue, boolean compressException, Function<String, T> fromStr) { T result = defaultValue; try { if (val == null) { if (isKey || !isNullable) { throw new RuntimeException("column is key or not nullable"); } if (columnDefaultValue != null && !"null".equals(columnDefaultValue)) { if (columnDefaultValue instanceof String) result = fromStr.apply((String)columnDefaultValue); else result = (T)columnDefaultValue; } } else { boolean isConverted = false; //handle: try to convert directly // try { // result = (T)val; // isConverted = true; // } catch (Exception e1) { //// e1.printStackTrace(); // } //handle: parse from string
                if (!isConverted) result = fromStr.apply(val.toString()); } } catch(Exception e) { if (compressException) e.printStackTrace(); else throw e; } return result; } private SimpleDateFormat[] sdfs = new SimpleDateFormat[]{ new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.000'Z'"), new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'"), new SimpleDateFormat("yyyy-MM-dd HH:mm:ss") }; { for (SimpleDateFormat sdf : sdfs) sdf.setTimeZone(TimeZone.getTimeZone("UTC")); } private void coerceAndSet(Object rawVal, String colName, Type type, boolean isKey, boolean isNullable, Object defaultValue, PartialRow row) throws NumberFormatException { switch (type) { case INT8: row.addByte(colName, (rawVal != null && rawVal instanceof Boolean) ? (Boolean)rawVal ? (byte)1 : (byte)0  : this.getValue((byte)0, rawVal, isKey, isNullable, defaultValue, this.skipBadColumnValue, (String str) -> Byte.parseByte(str))); break; case INT16: row.addShort(colName, this.getValue((short)0, rawVal, isKey, isNullable, defaultValue, this.skipBadColumnValue, (String str) -> Short.parseShort(str))); break; case INT32: row.addInt(colName, this.getValue(0, rawVal, isKey, isNullable, defaultValue, this.skipBadColumnValue, (String str) -> Integer.parseInt(str))); break; case INT64: row.addLong(colName, this.getValue(0l, rawVal, isKey, isNullable, defaultValue, this.skipBadColumnValue, (String str) -> Long.parseLong(str))); break; case BINARY: row.addBinary(colName, rawVal == null ? new byte[0] : rawVal.toString().getBytes(charset)); break; case STRING: row.addString(colName, rawVal == null ? 
"" : rawVal.toString()); break; case BOOL: row.addBoolean(colName, this.getValue(false, rawVal, isKey, isNullable, defaultValue, this.skipBadColumnValue, (String str) -> Boolean.parseBoolean(str))); break; case FLOAT: row.addFloat(colName, this.getValue(0f, rawVal, isKey, isNullable, defaultValue, this.skipBadColumnValue, (String str) -> Float.parseFloat(str))); break; case DOUBLE: row.addDouble(colName, this.getValue(0d, rawVal, isKey, isNullable, defaultValue, this.skipBadColumnValue, (String str) -> Double.parseDouble(str))); break; case UNIXTIME_MICROS: Long value = this.<Long>getValue(null, rawVal, isKey, isNullable, defaultValue, this.skipBadColumnValue, (String str) -> { Long result = null; if (str != null && !"".equals(str)) { boolean isPatternOk =false; //handle: yyyy-MM-dd HH:mm:ss
                        if (str.contains("-") && str.contains(":")) { for (SimpleDateFormat sdf : sdfs) { try { result = sdf.parse(str).getTime() * 1000; isPatternOk = true; break; } catch (Exception e) { // e.printStackTrace();
 } } } //handle: second, millisecond, microsecond
                        if (!isPatternOk && (str.length() == 10 || str.length() == 13 || str.length() == 16)) { result = Long.parseLong(str); if (str.length() == 10) result *= 1000000; if (str.length() == 13) result *= 1000; } } return result; }); if (value != null) row.addLong(colName, value); break; default: logger.warn("got unknown type {} for column '{}'-- ignoring this column", type, colName); } } private void logOrThrow(boolean log, String msg, Exception e) throws FlumeException { if (log) { logger.warn(msg, e); } else { throw new FlumeException(msg, e); } } @Override public void close() { } }

去掉類JsonStr2Map,主要是getValue和coerceAndSet配合,支持默認值,支持null,支持傳遞任意類型(自動適配處理),支持boolean轉byte,時間類型支持yyyy-MM-dd HH:mm:ss等pattern和秒、毫秒、微秒4種格式,而且會自動將秒和毫秒轉成微秒;

注意SimpleDateFormat設置timezone爲UTC,這裏是爲了保證消息中的時間和寫入kudu中的時間一致,否則會根據timezone做偏移,比如timezone爲Asia/Shanghai,則寫入kudu的時間會比消息中的時間晚8小時;

 

打包放到$FLUME_HOME/lib下

相關文章
相關標籤/搜索