CheckpointBarrierHandler
這個接口用於響應(react)從 input channel 過來的 checkpoint barrier。這裏可以通過不同的實現,來決定是僅僅簡單地 track barriers,還是要真正地 block inputs。
/**
 * The CheckpointBarrierHandler reacts to checkpoint barriers arriving from the input channels.
 * Different implementations may either simply track barriers, or block certain inputs on
 * barriers.
 */
public interface CheckpointBarrierHandler {

    /**
     * Returns the next {@link BufferOrEvent} that the operator may consume.
     * This call blocks until the next BufferOrEvent is available, or until the stream
     * has been determined to be finished.
     *
     * @return The next BufferOrEvent, or {@code null}, if the stream is finished.
     * @throws java.io.IOException Thrown, if the network or local disk I/O fails.
     * @throws java.lang.InterruptedException Thrown, if the thread is interrupted while blocking
     *                                        during waiting for the next BufferOrEvent to become
     *                                        available.
     */
    BufferOrEvent getNextNonBlocked() throws IOException, InterruptedException;

    /**
     * Registers the given event handler to be notified on successful checkpoints.
     *
     * @param checkpointHandler The handler to register.
     */
    void registerCheckpointEventHandler(EventListener<CheckpointBarrier> checkpointHandler);

    /**
     * Cleans up all internally held resources.
     *
     * @throws IOException Thrown, if the cleanup of I/O resources failed.
     */
    void cleanup() throws IOException;

    /**
     * Checks if the barrier handler has buffered any data internally.
     *
     * @return True, if no data is buffered internally, false otherwise.
     */
    boolean isEmpty();
}
BarrierBuffer
/**
 * The barrier buffer is a {@link CheckpointBarrierHandler} that blocks inputs with barriers until
 * all inputs have received the barrier for a given checkpoint.
 *
 * <p>To avoid back-pressuring the input streams (which may cause distributed deadlocks), the
 * BarrierBuffer continues receiving buffers from the blocked channels and stores them internally until
 * the blocks are released.</p>
 */
public class BarrierBuffer implements CheckpointBarrierHandler {

    /** The gate that the buffer draws its input from. */
    private final InputGate inputGate;

    /** Flags that indicate whether a channel is currently blocked/buffered. */
    private final boolean[] blockedChannels;

    /** The total number of channels that this buffer handles data from. */
    private final int totalNumberOfInputChannels;

    /**
     * The utility to write blocked data to a file channel. To avoid back-pressure, a "blocked"
     * channel is not actually stalled; its data is merely set aside through this spiller.
     */
    private final BufferSpiller bufferSpiller;

    /**
     * The pending blocked buffer/event sequences. Must be consumed before requesting
     * further data from the input gate. These are unblocked sequences that have not
     * been processed yet.
     */
    private final ArrayDeque<BufferSpiller.SpilledBufferOrEventSequence> queuedBuffered;

    /**
     * The sequence of buffers/events that has been unblocked and must now be consumed
     * before requesting further data from the input gate. It is produced by rolling over
     * the buffer spiller.
     */
    private BufferSpiller.SpilledBufferOrEventSequence currentBuffered;

    /** Handler that receives the checkpoint notifications (it carries out the checkpoint). */
    private EventListener<CheckpointBarrier> checkpointHandler;

    /** The ID of the checkpoint for which we expect barriers. */
    private long currentCheckpointId = -1L;

    /** The number of received barriers (= number of blocked/buffered channels). */
    private int numBarriersReceived;

    /** The number of already closed channels. */
    private int numClosedChannels;

    /** Flag to indicate whether we have drawn all available input. */
    private boolean endOfStream;
}
最關鍵的函數:
getNextNonBlocked
@Override public BufferOrEvent getNextNonBlocked() throws IOException, InterruptedException { while (true) { // process buffered BufferOrEvents before grabbing new ones BufferOrEvent next; if (currentBuffered == null) { //若是currentBuffered爲空,說明沒有unblock的buffer數據,直接從inputGate讀取 next = inputGate.getNextBufferOrEvent(); } else { next = currentBuffered.getNext(); //從currentBuffered讀 if (next == null) { //若是讀到的爲空,說明currentBuffered已經爲空 completeBufferedSequence(); //清空當前的currentBuffered,看看queuedBuffered中還有沒有須要處理的buffer return getNextNonBlocked(); } } if (next != null) { if (isBlocked(next.getChannelIndex())) { //若是這個channel仍然是被block的 // if the channel is blocked we, we just store the BufferOrEvent bufferSpiller.add(next); //那麼咱們只是把這個BufferOrEvent放到bufferSpiller裏面 } else if (next.isBuffer()) { //若是沒有被block,就處理該條數據,若是是buffer即真實數據,那麼直接返回該數據 return next; } else if (next.getEvent().getClass() == CheckpointBarrier.class) { //若是是CheckpointBarrier if (!endOfStream) { // process barriers only if there is a chance of the checkpoint completing processBarrier((CheckpointBarrier) next.getEvent(), next.getChannelIndex()); //那麼調用processBarrier,後面具體分析 } } else { if (next.getEvent().getClass() == EndOfPartitionEvent.class) { numClosedChannels++; // no chance to complete this checkpoint releaseBlocks(); //由於某個channel close了,那就永遠也沒法從這個channel獲取barrier了,因此releaseBlocks } return next; } } else if (!endOfStream) { // end of stream. we feed the data that is still buffered endOfStream = true; releaseBlocks();//流結束了,因此也須要releaseBlocks return getNextNonBlocked(); } else { return null; } } }
其中兩個函數比較重要:processBarrier 和 releaseBlocks
processBarrier函數
private void processBarrier(CheckpointBarrier receivedBarrier, int channelIndex) throws IOException { final long barrierId = receivedBarrier.getId(); //取出全局barrier id if (numBarriersReceived > 0) { //若是以前收到過barrier // subsequent barrier of a checkpoint. if (barrierId == currentCheckpointId) { //看下剛收到的和以前的barrierid是否同樣 // regular case onBarrier(channelIndex); //若是同樣調用onBarrier } else if (barrierId > currentCheckpointId) { //若是大於currentCheckpointId,說明這個id已通過期了,由於在一個channel上,barrier id應該是按序發送的 // we did not complete the current checkpoint LOG.warn("Received checkpoint barrier for checkpoint {} before completing current checkpoint {}. " + "Skipping current checkpoint.", barrierId, currentCheckpointId); releaseBlocks(); //既然這個barrier已通過期,因此releaseBlocks() currentCheckpointId = barrierId; //設置新的barrierId onBarrier(channelIndex); } else { //忽略已過時的barrier // ignore trailing barrier from aborted checkpoint return; } } else if (barrierId > currentCheckpointId) { //新的barrier // first barrier of a new checkpoint currentCheckpointId = barrierId; onBarrier(channelIndex); } else { // trailing barrier from previous (skipped) checkpoint return; } // check if we have all barriers if (numBarriersReceived + numClosedChannels == totalNumberOfInputChannels) { //若是咱們已經集齊全部的barrier if (LOG.isDebugEnabled()) { LOG.debug("Received all barrier, triggering checkpoint {} at {}", receivedBarrier.getId(), receivedBarrier.getTimestamp()); } if (checkpointHandler != null) { checkpointHandler.onEvent(receivedBarrier); //觸發生成checkpoint } releaseBlocks(); 調用releaseBlocks } }
這裏的onEvent,在StreamTask中定義:
protected final EventListener<CheckpointBarrier> getCheckpointBarrierListener() { return new EventListener<CheckpointBarrier>() { @Override public void onEvent(CheckpointBarrier barrier) { try { triggerCheckpoint(barrier.getId(), barrier.getTimestamp()); //作checkpoint } catch (Exception e) { throw new RuntimeException("Error triggering a checkpoint as the result of receiving checkpoint barrier", e); } } }; }
onBarrier,只是置標誌位和計數,比較簡單
/**
 * Marks the given channel as blocked and counts its barrier.
 *
 * @param channelIndex The index of the channel on which the barrier arrived.
 * @throws IOException Thrown, if the channel is already blocked, i.e. a second barrier for
 *                     the same checkpoint arrived on the same channel.
 */
private void onBarrier(int channelIndex) throws IOException {
    if (blockedChannels[channelIndex]) {
        throw new IOException("Stream corrupt: Repeated barrier for same checkpoint and input stream");
    }

    blockedChannels[channelIndex] = true;
    numBarriersReceived++;

    if (LOG.isDebugEnabled()) {
        LOG.debug("Received barrier from channel " + channelIndex);
    }
}
releaseBlocks
/** * Releases the blocks on all channels. Makes sure the just written data * is the next to be consumed. */ private void releaseBlocks() throws IOException { for (int i = 0; i < blockedChannels.length; i++) { blockedChannels[i] = false; } numBarriersReceived = 0; if (currentBuffered == null) { //理論上,在調用releaseBlocks前,全部channel都是處於blocked狀態,因此currentBuffered應該爲空 // common case: no more buffered data currentBuffered = bufferSpiller.rollOver(); //把block期間buffer的數據文件,設爲currentBuffered if (currentBuffered != null) { currentBuffered.open(); } } else { //不爲空,是uncommon的case // uncommon case: buffered data pending // push back the pending data, if we have any // since we did not fully drain the previous sequence, we need to allocate a new buffer for this one BufferSpiller.SpilledBufferOrEventSequence bufferedNow = bufferSpiller.rollOverWithNewBuffer(); if (bufferedNow != null) { bufferedNow.open(); queuedBuffered.addFirst(currentBuffered); //currentBuffered不爲空,因此先把當前的放到queuedBuffered裏面 currentBuffered = bufferedNow; } } }
看下BufferSpiller
/** * Creates a new buffer spiller, spilling to one of the I/O manager's temp directories. * * @param ioManager The I/O manager for access to teh temp directories. * @param pageSize The page size used to re-create spilled buffers. * @throws IOException Thrown if the temp files for spilling cannot be initialized. */ public BufferSpiller(IOManager ioManager, int pageSize) throws IOException { this.pageSize = pageSize; this.readBuffer = ByteBuffer.allocateDirect(READ_BUFFER_SIZE); this.readBuffer.order(ByteOrder.LITTLE_ENDIAN); this.headBuffer = ByteBuffer.allocateDirect(16); this.headBuffer.order(ByteOrder.LITTLE_ENDIAN); this.sources = new ByteBuffer[] { this.headBuffer, null }; //sources是由headBuffer和contents組成的 File[] tempDirs = ioManager.getSpillingDirectories(); this.tempDir = tempDirs[DIRECTORY_INDEX.getAndIncrement() % tempDirs.length]; byte[] rndBytes = new byte[32]; new Random().nextBytes(rndBytes); this.spillFilePrefix = StringUtils.byteToHexString(rndBytes) + '.'; // prepare for first contents createSpillingChannel(); } private void createSpillingChannel() throws IOException { //打開用於寫buffer的文件 currentSpillFile = new File(tempDir, spillFilePrefix + (fileCounter++) +".buffer"); currentChannel = new RandomAccessFile(currentSpillFile, "rw").getChannel(); }
主要的function,
add,加BufferOrEvent
/** * Adds a buffer or event to the sequence of spilled buffers and events. * * @param boe The buffer or event to add and spill. * @throws IOException Thrown, if the buffer of event could not be spilled. */ public void add(BufferOrEvent boe) throws IOException { hasWritten = true; try { ByteBuffer contents; if (boe.isBuffer()) { //分爲buffer或event來提取contents Buffer buf = boe.getBuffer(); contents = buf.getMemorySegment().wrap(0, buf.getSize()); } else { contents = EventSerializer.toSerializedEvent(boe.getEvent()); } headBuffer.clear(); //更新headBuffer headBuffer.putInt(boe.getChannelIndex()); headBuffer.putInt(contents.remaining()); headBuffer.put((byte) (boe.isBuffer() ? 0 : 1)); headBuffer.flip(); sources[1] = contents; //爲何加在1,由於0是headBuffer currentChannel.write(sources); //寫入文件 } finally { if (boe.isBuffer()) { boe.getBuffer().recycle(); } } }
rollOverInternal,把當前的spill文件返回, 生成新的spill文件
private SpilledBufferOrEventSequence rollOverInternal(boolean newBuffer) throws IOException { if (!hasWritten) { return null; } ByteBuffer buf; if (newBuffer) { //newBuffer的區別是,是否從新建立ByteBuffer仍是直接用readBuffer buf = ByteBuffer.allocateDirect(READ_BUFFER_SIZE); buf.order(ByteOrder.LITTLE_ENDIAN); } else { buf = readBuffer; } // create a reader for the spilled data currentChannel.position(0L); SpilledBufferOrEventSequence seq = new SpilledBufferOrEventSequence(currentSpillFile, currentChannel, buf, pageSize); //把當前的spill文件封裝成SpilledBufferOrEventSequence // create ourselves a new spill file createSpillingChannel(); //生成新的spill文件 hasWritten = false; return seq; }
對於SpilledBufferOrEventSequence,主要是提供讀取的api,因此關鍵的函數是getNext
/** * This class represents a sequence of spilled buffers and events, created by the * {@link BufferSpiller}. The sequence of buffers and events can be read back using the * method {@link #getNext()}. */ public static class SpilledBufferOrEventSequence { /** * Gets the next BufferOrEvent from the spilled sequence, or {@code null}, if the * sequence is exhausted. * * @return The next BufferOrEvent from the spilled sequence, or {@code null} (end of sequence). * @throws IOException Thrown, if the reads failed, of if the byte stream is corrupt. */ public BufferOrEvent getNext() throws IOException { if (buffer.remaining() < HEADER_LENGTH) { buffer.compact(); while (buffer.position() < HEADER_LENGTH) { if (fileChannel.read(buffer) == -1) { //從文件channel大家把數據讀到buffer中 if (buffer.position() == 0) { // no trailing data return null; } else { throw new IOException("Found trailing incomplete buffer or event"); } } } buffer.flip(); } final int channel = buffer.getInt(); final int length = buffer.getInt(); final boolean isBuffer = buffer.get() == 0; if (isBuffer) { //若是是buffer // deserialize buffer MemorySegment seg = MemorySegmentFactory.allocateUnpooledSegment(pageSize); //建立 MemorySegment,這裏是allocate unpooled的segment int segPos = 0; int bytesRemaining = length; while (true) { int toCopy = Math.min(buffer.remaining(), bytesRemaining); if (toCopy > 0) { seg.put(segPos, buffer, toCopy); //將buffer中的數據寫入MemorySegment segPos += toCopy; bytesRemaining -= toCopy; } if (bytesRemaining == 0) { break; } else { buffer.clear(); if (fileChannel.read(buffer) == -1) { throw new IOException("Found trailing incomplete buffer"); } buffer.flip(); } } Buffer buf = new Buffer(seg, FreeingBufferRecycler.INSTANCE); //將MemorySegment封裝成Buffer buf.setSize(length); return new BufferOrEvent(buf, channel); } else { //若是是event // deserialize event if (buffer.remaining() < length) { buffer.compact(); while (buffer.position() < length) { if (fileChannel.read(buffer) == -1) { throw new IOException("Found 
trailing incomplete event"); } } buffer.flip(); } int oldLimit = buffer.limit(); buffer.limit(buffer.position() + length); AbstractEvent evt = EventSerializer.fromSerializedEvent(buffer, getClass().getClassLoader()); //將buffer封裝成event buffer.limit(oldLimit); return new BufferOrEvent(evt, channel); } } }
BarrierTracker,這個比Barrier buffer的實現簡單的多,
由於不會去block input channel,因此沒法實現exactly once,只能實現at-least once
/** * The BarrierTracker keeps track of what checkpoint barriers have been received from * which input channels. Once it has observed all checkpoint barriers for a checkpoint ID, * it notifies its listener of a completed checkpoint. * * <p>Unlike the {@link BarrierBuffer}, the BarrierTracker does not block the input * channels that have sent barriers, so it cannot be used to gain "exactly-once" processing * guarantees. It can, however, be used to gain "at least once" processing guarantees.</p> * * <p>NOTE: This implementation strictly assumes that newer checkpoints have higher checkpoint IDs.</p> */ public class BarrierTracker implements CheckpointBarrierHandler { @Override public BufferOrEvent getNextNonBlocked() throws IOException, InterruptedException { while (true) { BufferOrEvent next = inputGate.getNextBufferOrEvent(); if (next == null) { return null; } else if (next.isBuffer() || next.getEvent().getClass() != CheckpointBarrier.class) { //若是是數據就直接返回 return next; } else { processBarrier((CheckpointBarrier) next.getEvent()); //若是是barrier就處理 } } } private void processBarrier(CheckpointBarrier receivedBarrier) { // general path for multiple input channels final long barrierId = receivedBarrier.getId(); // find the checkpoint barrier in the queue of bending barriers CheckpointBarrierCount cbc = null; int pos = 0; for (CheckpointBarrierCount next : pendingCheckpoints) { //找找看,這個barrier是否直接收到過 if (next.checkpointId == barrierId) { cbc = next; break; } pos++; } if (cbc != null) { //若是收到過 // add one to the count to that barrier and check for completion int numBarriersNew = cbc.incrementBarrierCount(); //計數加一 if (numBarriersNew == totalNumberOfInputChannels) { //判斷是否全部的barrier已經到全了 // checkpoint can be triggered // first, remove this checkpoint and all all prior pending // checkpoints (which are now subsumed) for (int i = 0; i <= pos; i++) { pendingCheckpoints.pollFirst(); //當一個checkpoint被觸發時,prior的全部checkpoint就已通過期了,也一塊兒remove掉 } // notify the listener if 
(checkpointHandler != null) { checkpointHandler.onEvent(receivedBarrier); //若是有checkpoint handler,就調用進行check point } } } else { //新的barrier // first barrier for that checkpoint ID // add it only if it is newer than the latest checkpoint. // if it is not newer than the latest checkpoint ID, then there cannot be a // successful checkpoint for that ID anyways if (barrierId > latestPendingCheckpointID) { latestPendingCheckpointID = barrierId; pendingCheckpoints.addLast(new CheckpointBarrierCount(barrierId)); // make sure we do not track too many checkpoints if (pendingCheckpoints.size() > MAX_CHECKPOINTS_TO_TRACK) { pendingCheckpoints.pollFirst(); //刪除過多的checkpoints } } } } }