Original (in Chinese): http://blog.csdn.net/rongyongfeikai2/article/details/50727661
When using Spark Streaming to read data from Kafka, we save each topic partition's offset in ZooKeeper to avoid data loss (the offsets in ZooKeeper are updated only after a batch has executed successfully). After a failed run, the next round can then resume reading from the saved offsets.
The implementation is similar to the one described in this article:
http://blog.csdn.net/rongyongfeikai2/article/details/49784785
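For context, committing the offsets amounts to writing each partition's offset under a ZooKeeper path once a batch succeeds. Below is a minimal sketch using Apache Curator; the ZkOffsetStore name and the path layout are illustrative assumptions, not taken from the articles above.

```java
import org.apache.curator.framework.CuratorFramework;
import org.apache.curator.framework.CuratorFrameworkFactory;
import org.apache.curator.retry.ExponentialBackoffRetry;

// Sketch: persist offsets to ZooKeeper after a batch succeeds.
// The path /consumers/<group>/offsets/<topic>/<partition> mirrors the
// convention of Kafka's high-level consumer; adjust to your layout.
public class ZkOffsetStore {
    private final CuratorFramework client;

    public ZkOffsetStore(String zkConnect) {
        client = CuratorFrameworkFactory.newClient(zkConnect, new ExponentialBackoffRetry(1000, 3));
        client.start();
    }

    public void saveOffset(String group, String topic, int partition, long offset) throws Exception {
        String path = "/consumers/" + group + "/offsets/" + topic + "/" + partition;
        byte[] data = String.valueOf(offset).getBytes("UTF-8");
        if (client.checkExists().forPath(path) == null) {
            client.create().creatingParentsIfNeeded().forPath(path, data);
        } else {
            client.setData().forPath(path, data);
        }
    }
}
```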
However, a Kafka topic may be deleted, and, worse, a user may then create a new topic with the same name. At that point the offsets saved in ZooKeeper are no longer accurate, so they must be compared against the offsets held by the Kafka brokers and corrected accordingly.
The implementation is as follows:
1. A class that holds a topic's leader information and its partitions' offsets:
```java
import java.io.Serializable;
import java.util.HashMap;

/**
 * Record class holding, for one Kafka topic, each partition's leader host
 * and current offset.
 */
public class KafkaTopicOffset implements Serializable {
    private String topicName;
    // partition id -> offset
    private HashMap<Integer, Long> offsetList;
    // partition id -> leader broker host
    private HashMap<Integer, String> leaderList;

    public KafkaTopicOffset(String topicName) {
        this.topicName = topicName;
        this.offsetList = new HashMap<Integer, Long>();
        this.leaderList = new HashMap<Integer, String>();
    }

    public String getTopicName() {
        return topicName;
    }

    public void setTopicName(String topicName) {
        this.topicName = topicName;
    }

    public HashMap<Integer, Long> getOffsetList() {
        return offsetList;
    }

    public void setOffsetList(HashMap<Integer, Long> offsetList) {
        this.offsetList = offsetList;
    }

    public HashMap<Integer, String> getLeaderList() {
        return leaderList;
    }

    public void setLeaderList(HashMap<Integer, String> leaderList) {
        this.leaderList = leaderList;
    }

    @Override
    public String toString() {
        return "topic:" + topicName + ",offsetList:" + this.offsetList
                + ",leaderList:" + this.leaderList;
    }
}
```
2. Obtain the topic partitions' offsets from the Kafka brokers (mainly by sending the corresponding requests through SimpleConsumer):
```java
import java.io.Serializable;
import java.util.*;
import com.nsfocus.bsaips.common.Constant;
import com.nsfocus.bsaips.model.KafkaTopicOffset;
import kafka.javaapi.OffsetResponse;
import kafka.api.PartitionOffsetRequestInfo;
import kafka.common.TopicAndPartition;
import kafka.javaapi.TopicMetadataRequest;
import kafka.javaapi.consumer.SimpleConsumer;
import kafka.javaapi.TopicMetadata;
import kafka.javaapi.PartitionMetadata;

/**
 * Kafka utility class: queries topic metadata and partition offsets
 * directly from the brokers via the SimpleConsumer API.
 */
public class KafkaUtil implements Serializable {
    private static KafkaUtil kafkaUtil = null;

    private KafkaUtil() {}

    // Lazy singleton (not thread-safe; adequate for this utility's usage).
    public static KafkaUtil getInstance() {
        if (kafkaUtil == null) {
            kafkaUtil = new KafkaUtil();
        }
        return kafkaUtil;
    }

    // "host1:port1,host2:port2" -> ["host1", "host2"]
    private String[] getIpsFromBrokerList(String brokerlist) {
        String[] brokers = brokerlist.split(",");
        for (int i = 0; i < brokers.length; i++) {
            brokers[i] = brokers[i].split(":")[0];
        }
        return brokers;
    }

    // "host1:port1,host2:port2" -> {host1: port1, host2: port2}
    private Map<String, Integer> getPortFromBrokerList(String brokerlist) {
        Map<String, Integer> map = new HashMap<String, Integer>();
        String[] brokers = brokerlist.split(",");
        for (String item : brokers) {
            String[] itemArr = item.split(":");
            if (itemArr.length > 1) {
                map.put(itemArr[0], Integer.parseInt(itemArr[1]));
            }
        }
        return map;
    }

    /**
     * Sends a TopicMetadataRequest to each broker to discover the leader of
     * every partition of the topic; offsets are initialized to 0.
     */
    public KafkaTopicOffset topicMetadataRequest(String brokerlist, String topic) {
        List<String> topics = Collections.singletonList(topic);
        TopicMetadataRequest topicMetadataRequest = new TopicMetadataRequest(topics);

        KafkaTopicOffset kafkaTopicOffset = new KafkaTopicOffset(topic);
        String[] seeds = getIpsFromBrokerList(brokerlist);
        Map<String, Integer> portMap = getPortFromBrokerList(brokerlist);

        for (int i = 0; i < seeds.length; i++) {
            SimpleConsumer consumer = null;
            try {
                consumer = new SimpleConsumer(seeds[i],
                        portMap.get(seeds[i]),
                        Constant.TIMEOUT,
                        Constant.BUFFERSIZE,
                        Constant.groupId);
                kafka.javaapi.TopicMetadataResponse resp = consumer.send(topicMetadataRequest);
                List<TopicMetadata> metaData = resp.topicsMetadata();
                for (TopicMetadata item : metaData) {
                    for (PartitionMetadata part : item.partitionsMetadata()) {
                        kafkaTopicOffset.getLeaderList().put(part.partitionId(), part.leader().host());
                        kafkaTopicOffset.getOffsetList().put(part.partitionId(), 0L);
                    }
                }
            } catch (Exception ex) {
                ex.printStackTrace();
            } finally {
                if (consumer != null) {
                    consumer.close();
                }
            }
        }

        return kafkaTopicOffset;
    }

    /**
     * For each partition, asks that partition's leader broker for its latest
     * offset (an OffsetRequest with LatestTime()).
     */
    public KafkaTopicOffset getLastOffsetByTopic(String brokerlist, String topic) {
        KafkaTopicOffset kafkaTopicOffset = topicMetadataRequest(brokerlist, topic);
        String[] seeds = getIpsFromBrokerList(brokerlist);
        Map<String, Integer> portMap = getPortFromBrokerList(brokerlist);

        for (int i = 0; i < seeds.length; i++) {
            SimpleConsumer consumer = null;
            try {
                consumer = new SimpleConsumer(seeds[i],
                        portMap.get(seeds[i]),
                        Constant.TIMEOUT,
                        Constant.BUFFERSIZE,
                        Constant.groupId);

                for (Map.Entry<Integer, Long> entry : kafkaTopicOffset.getOffsetList().entrySet()) {
                    int partitionId = entry.getKey();

                    // Only query the broker that leads this partition.
                    if (!kafkaTopicOffset.getLeaderList().get(partitionId).equals(seeds[i])) {
                        continue;
                    }

                    TopicAndPartition topicAndPartition = new TopicAndPartition(topic, partitionId);
                    Map<TopicAndPartition, PartitionOffsetRequestInfo> requestInfo =
                            new HashMap<TopicAndPartition, PartitionOffsetRequestInfo>();

                    requestInfo.put(topicAndPartition,
                            new PartitionOffsetRequestInfo(kafka.api.OffsetRequest.LatestTime(), 1));
                    kafka.javaapi.OffsetRequest request = new kafka.javaapi.OffsetRequest(
                            requestInfo, kafka.api.OffsetRequest.CurrentVersion(),
                            Constant.groupId);
                    OffsetResponse response = consumer.getOffsetsBefore(request);
                    long[] offsets = response.offsets(topic, partitionId);
                    if (offsets.length > 0) {
                        kafkaTopicOffset.getOffsetList().put(partitionId, offsets[0]);
                    }
                }
            } catch (Exception ex) {
                ex.printStackTrace();
            } finally {
                if (consumer != null) {
                    consumer.close();
                }
            }
        }

        return kafkaTopicOffset;
    }

    public Map<String, KafkaTopicOffset> getKafkaOffsetByTopicList(String brokerList, List<String> topics) {
        Map<String, KafkaTopicOffset> map = new HashMap<String, KafkaTopicOffset>();
        for (int i = 0; i < topics.size(); i++) {
            map.put(topics.get(i), getLastOffsetByTopic(brokerList, topics.get(i)));
        }
        return map;
    }

    public static void main(String[] args) {
        try {
            // ConfigUtil is the project's own configuration helper (not shown in the post).
            System.out.println(KafkaUtil.getInstance().getKafkaOffsetByTopicList(
                    ConfigUtil.getInstance().getKafkaConf().get("brokerlist"),
                    Arrays.asList(new String[]{"pj_test_tmp", "test"})));
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }
}
```
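The Constant class referenced above is also the project's own and is not shown in the post; for a self-contained build, a stand-in along these lines would work (the values are illustrative assumptions, not from the original):

```java
// Illustrative stand-in for the project's Constant class; the values are
// typical SimpleConsumer settings, not taken from the original article.
public class Constant {
    public static final int TIMEOUT = 100000;              // socket timeout, ms
    public static final int BUFFERSIZE = 64 * 1024;        // receive buffer, bytes
    public static final String groupId = "offset-lookup";  // client id
}
```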
3. Then, when KafkaCluster reads the offset information from ZooKeeper, compare it with the offsets obtained from the brokers (assume the return value of KafkaUtil's getKafkaOffsetByTopicList has been stored in offsetMap):
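A minimal sketch of this comparison step, assuming zkOffsetMap holds the per-partition offsets read from ZooKeeper; both zkOffsetMap and the OffsetReconciler/reconcile names are illustrative, not from the original:

```java
import java.util.HashMap;
import java.util.Map;
import com.nsfocus.bsaips.model.KafkaTopicOffset;
import kafka.common.TopicAndPartition;

/** Sketch: reconcile the offsets read from ZooKeeper with the brokers' offsets. */
public class OffsetReconciler {
    /**
     * zkOffsetMap: topic -> (partition -> offset read from ZooKeeper); assumed name.
     * offsetMap:   return value of KafkaUtil.getKafkaOffsetByTopicList above.
     */
    public static Map<TopicAndPartition, Long> reconcile(
            Map<String, Map<Integer, Long>> zkOffsetMap,
            Map<String, KafkaTopicOffset> offsetMap) {
        Map<TopicAndPartition, Long> fromOffsets = new HashMap<TopicAndPartition, Long>();
        for (Map.Entry<String, Map<Integer, Long>> topicEntry : zkOffsetMap.entrySet()) {
            String topic = topicEntry.getKey();
            KafkaTopicOffset brokerOffsets = offsetMap.get(topic);
            for (Map.Entry<Integer, Long> e : topicEntry.getValue().entrySet()) {
                int partition = e.getKey();
                long zkOffset = e.getValue();
                long brokerOffset = brokerOffsets.getOffsetList().get(partition);
                // An offset in ZooKeeper beyond what the broker reports can only mean
                // the topic was deleted and recreated: fall back to the broker's offset.
                fromOffsets.put(new TopicAndPartition(topic, partition),
                        zkOffset > brokerOffset ? brokerOffset : zkOffset);
            }
        }
        return fromOffsets;
    }
}
```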

After some discussion, it became clear that the earliest offset is a partition's current start offset, while the last offset is its current end offset. To handle the case where the saved offsets have expired, consumption should therefore resume from the current start offset, which means an EarliestOffsetRequest should be sent rather than a LatestOffsetRequest (i.e., EarliestTime() instead of LatestTime() in the PartitionOffsetRequestInfo). The modified code is as follows:
Only getLastOffsetByTopic changes (the class now lives in package com.nsfocus.bsaips.util; the imports and all other members are exactly as in the listing above):

```java
    /**
     * For each partition, asks the partition's leader broker for its earliest
     * available offset (EarliestTime()) instead of the latest one, so that
     * consumption can restart from the beginning of a recreated topic.
     */
    public KafkaTopicOffset getLastOffsetByTopic(String brokerlist, String topic) {
        KafkaTopicOffset kafkaTopicOffset = topicMetadataRequest(brokerlist, topic);
        String[] seeds = getIpsFromBrokerList(brokerlist);
        Map<String, Integer> portMap = getPortFromBrokerList(brokerlist);

        for (int i = 0; i < seeds.length; i++) {
            SimpleConsumer consumer = null;
            try {
                consumer = new SimpleConsumer(seeds[i],
                        portMap.get(seeds[i]),
                        Constant.TIMEOUT,
                        Constant.BUFFERSIZE,
                        Constant.groupId);

                for (Map.Entry<Integer, Long> entry : kafkaTopicOffset.getOffsetList().entrySet()) {
                    int partitionId = entry.getKey();

                    // Only query the broker that leads this partition.
                    if (!kafkaTopicOffset.getLeaderList().get(partitionId).equals(seeds[i])) {
                        continue;
                    }

                    TopicAndPartition topicAndPartition = new TopicAndPartition(topic, partitionId);
                    Map<TopicAndPartition, PartitionOffsetRequestInfo> requestInfo =
                            new HashMap<TopicAndPartition, PartitionOffsetRequestInfo>();

                    // EarliestTime() instead of LatestTime(): ask for the first
                    // offset that is still available on the broker.
                    requestInfo.put(topicAndPartition,
                            new PartitionOffsetRequestInfo(kafka.api.OffsetRequest.EarliestTime(), 1));
                    kafka.javaapi.OffsetRequest request = new kafka.javaapi.OffsetRequest(
                            requestInfo, kafka.api.OffsetRequest.CurrentVersion(),
                            Constant.groupId);
                    OffsetResponse response = consumer.getOffsetsBefore(request);
                    long[] offsets = response.offsets(topic, partitionId);
                    if (offsets.length > 0) {
                        kafkaTopicOffset.getOffsetList().put(partitionId, offsets[0]);
                    }
                }
            } catch (Exception ex) {
                ex.printStackTrace();
            } finally {
                if (consumer != null) {
                    consumer.close();
                }
            }
        }

        return kafkaTopicOffset;
    }
```
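Taken together with step 3, the correction rule becomes: an offset outside the range [earliest, latest] is invalid, either because the data has expired or because the topic was recreated, and consumption should then restart from the earliest offset. In sketch form (variable names illustrative):

```java
// zkOffset is valid only inside [earliest, latest]; otherwise restart from
// the earliest offset, per the discussion above.
long corrected = (zkOffset < earliest || zkOffset > latest) ? earliest : zkOffset;
```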