A Detailed Walkthrough of the TensorFlow word2vec Demo

Reposted from https://blog.csdn.net/weixin_42001089/article/details/81224869

word2vec comes in two flavours: CBOW and Skip-Gram.

CBOW predicts the center word from its context; Skip-Gram does exactly the opposite.

This post first walks through the Skip-Gram model, based on the official TensorFlow demo. The second major part is a lightly modified CBOW model, mainly based on:

http://www.javashuo.com/article/p-wshmytwc-dt.html

The two parts are separated by a line of ###########################.

Alright, let's get started!

###################################################################################################

The official TensorFlow demo:

https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/tutorials/word2vec

(1) First, import the required packages; nothing much to say here:

  
  
    from __future__ import absolute_import
    from __future__ import division
    from __future__ import print_function

    import collections
    import math
    import os
    import sys
    import argparse
    import random
    from tempfile import gettempdir
    import zipfile

    import numpy as np
    from six.moves import urllib
    from six.moves import xrange  # pylint: disable=redefined-builtin
    import tensorflow as tf
    from tensorflow.contrib.tensorboard.plugins import projector

(2) Next, get the current path and create the log directory (used later for TensorBoard visualization). By default the log directory is placed under the current directory:
 

  
  
    current_path = os.path.dirname(os.path.realpath(sys.argv[0]))

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--log_dir',
        type=str,
        default=os.path.join(current_path, 'log'),
        help='The log directory for TensorBoard summaries.')
    FLAGS, unparsed = parser.parse_known_args()

    # Create the directory for TensorBoard variables if there is not.
    if not os.path.exists(FLAGS.log_dir):
      os.makedirs(FLAGS.log_dir)

sys.argv is the bridge for passing arguments into the program from the outside; sys.argv[0] is its first element, i.e. the path of the current script.

For more on its usage see: http://www.javashuo.com/article/p-cthlxfyn-n.html

os.path.realpath returns the absolute path of the script.

parser.parse_known_args() parses a command line of unknown length. It returns two values: the first holds the arguments that were defined on the parser, the second is a list of the arguments that were not defined.

To make this concrete, write a small test.py:

  
  
    import argparse
    import os
    import sys

    current_path = os.path.dirname(os.path.realpath(sys.argv[0]))

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--log_dir',
        type=str,
        default=os.path.join(current_path, 'log'),
        help='The log directory for TensorBoard summaries.')
    FLAGS, unparsed = parser.parse_known_args()
    print(FLAGS)
    print(unparsed)
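To see exactly how the two return values split, here is a small self-contained sketch; the `--foo` flag is a made-up, undefined argument used purely for illustration:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--log_dir', type=str, default='log')
    # Simulate a command line that mixes a defined flag with an unknown one.
    flags, unparsed = parser.parse_known_args(['--log_dir', './mylog', '--foo', 'bar'])
    print(flags)     # Namespace(log_dir='./mylog')
    print(unparsed)  # ['--foo', 'bar']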

(3) Next, download the dataset (slightly modified here):

  
  
    # Base URL of the dataset, as in the official demo.
    url = 'http://mattmahoney.net/dc/'

    def maybe_download(filename, expected_bytes):
      """Download a file if not present, and make sure it's the right size."""
      if not os.path.exists(filename):
        filename, _ = urllib.request.urlretrieve(url + filename, filename)
      # Get the file's attributes
      statinfo = os.stat(filename)
      # Check that the file size is what we expect
      if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
      else:
        print(statinfo.st_size)
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')
      return filename

    filename = maybe_download('text8.zip', 31344016)

After the download finishes there will be an archive named text8.zip in the current folder.

(4) Read the corpus into a list of words

  
  
    # Read the data into a list of strings.
    def read_data(filename):
      """Extract the first file enclosed in a zip file as a list of words."""
      with zipfile.ZipFile(filename) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
      return data

    vocabulary = read_data(filename)
    print('Data size', len(vocabulary))

f.namelist()[0] is the first file inside the archive (text8.zip contains exactly one file). Its contents are read as a string and split on whitespace, so vocabulary ends up being the list of all words in the corpus; the final print shows how many tokens there are (about 17 million for text8).

(5) Build a dictionary of the 50,000 most frequent words; any word not in the dictionary is replaced by UNK

  
  
    vocabulary_size = 50000

    def build_dataset(words, n_words):
      """Process raw inputs into a dataset."""
      count = [['UNK', -1]]
      count.extend(collections.Counter(words).most_common(n_words - 1))
      dictionary = dict()
      for word, _ in count:
        dictionary[word] = len(dictionary)
      data = list()
      unk_count = 0
      for word in words:
        index = dictionary.get(word, 0)
        if index == 0:  # dictionary['UNK']
          unk_count += 1
        data.append(index)
      count[0][1] = unk_count
      reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
      return data, count, dictionary, reversed_dictionary

    data, count, dictionary, reverse_dictionary = build_dataset(
        vocabulary, vocabulary_size)
    del vocabulary  # Hint to reduce memory.
    print('Most common words (+UNK)', count[:5])
    print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])

The line below counts the frequency of every word and keeps the most frequent n_words - 1 = 49,999 of them as dictionary entries (the remaining slot is reserved for UNK).

extend appends the elements of a list:

    count.extend(collections.Counter(words).most_common(n_words - 1))

data is the corpus with every word replaced by its index; words that are not in the dictionary get the UNK index (0). Like this:

    i    love   tensorflow   very   much   .........
    2    23     UNK(0)       3      45     .........

count records each word's frequency, e.g. [['UNK', -1], ['a', 200], ['i', 150], ...]

dictionary maps each word to its index, i.e. key: word, value: index (the smaller the index, the higher the frequency, but the first entry is always UNK).

reversed_dictionary maps each index back to its word, i.e. key: index, value: word (again, a smaller index means a higher frequency, except that index 0 is always UNK).

The first entry is always UNK because count starts as [['UNK', -1]] and extend only appends after it, so 'UNK' keeps position 0 and therefore always gets index 0.
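To make the four return values concrete, here is a toy run of the build_dataset defined above on a tiny made-up word list (assuming the function is already in scope):

    toy_words = 'the quick brown fox jumps over the lazy dog the fox'.split()
    toy_data, toy_count, toy_dict, toy_rev = build_dataset(toy_words, n_words=3)

    print(toy_data)   # [1, 0, 0, 2, 0, 0, 1, 0, 0, 1, 2] -- every rare word collapses to UNK (0)
    print(toy_count)  # [['UNK', 6], ('the', 3), ('fox', 2)]
    print(toy_dict)   # {'UNK': 0, 'the': 1, 'fox': 2}
    print(toy_rev)    # {0: 'UNK', 1: 'the', 2: 'fox'}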

(6) Generate batches and labels

  
  
    data_index = 0

    def generate_batch(batch_size, num_skips, skip_window):
      global data_index
      assert batch_size % num_skips == 0
      assert num_skips <= 2 * skip_window
      batch = np.ndarray(shape=(batch_size), dtype=np.int32)
      labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
      span = 2 * skip_window + 1  # [ skip_window target skip_window ]
      buffer = collections.deque(maxlen=span)  # pylint: disable=redefined-builtin
      if data_index + span > len(data):
        data_index = 0
      buffer.extend(data[data_index:data_index + span])
      data_index += span
      for i in range(batch_size // num_skips):
        context_words = [w for w in range(span) if w != skip_window]
        words_to_use = random.sample(context_words, num_skips)
        for j, context_word in enumerate(words_to_use):
          batch[i * num_skips + j] = buffer[skip_window]
          labels[i * num_skips + j, 0] = buffer[context_word]
        if data_index == len(data):
          buffer.extend(data[0:span])
          data_index = span
        else:
          buffer.append(data[data_index])
          data_index += 1
      # Backtrack a little bit to avoid skipping words in the end of a batch
      data_index = (data_index + len(data) - span) % len(data)
      return batch, labels

    batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
    for i in range(8):
      print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0],
            reverse_dictionary[labels[i, 0]])

batch_size: the size of one batch.

num_skips: how many times each center word is reused to generate labels. For example, with num_skips=2 and the sentence "i love tensorflow very much ...", when tensorflow is chosen as the center word it is used twice when producing labels:

    tensorflow -> love        tensorflow -> very

skip_window: how many context words to consider on each side. With skip_window=1 the context is one word to the left and one to the right; with skip_window=2 it is two words on each side.

span: batch generation can be viewed as a fixed-size window (a sliding window) moving over the data; span is the size of that window, and span = 2 * skip_window + 1.

buffer = collections.deque(maxlen=span): a queue that holds at most span words at a time, i.e. the current window (of size 3 here).

So the process goes like this. Suppose batch_size=6, num_skips=2, skip_window=1, and the data is:

    (I    am    looking    for    the    missing    glass-shoes    who    has    picked    it    up ...)
     2    23    56         3      45     84         123            45     23     12        1     14 ...

batch_size // num_skips = 3, so the loop runs 3 times.

i=0: 2, 23, 56 enter the buffer. context_words = [w for w in range(span) if w != skip_window] selects the window positions other than the center, i.e. the context; batch[i * num_skips + j] = buffer[skip_window] takes the middle of the window as the center word (skip_window=1), so batch gets 23; labels[i * num_skips + j, 0] = buffer[context_word] takes its context as labels, i.e. 2 and 56. At this point batch=[23,23], labels=[2,56] (it could also be [56,2], since the right context may be sampled first), and data_index=3, the position of "for".

i=1: data[data_index] is appended, so the buffer becomes 23, 56, 3. After the assignments, batch=[23,23,56,56], labels=[2,56,23,3] (possibly in a different order), and data_index=4, i.e. "the".

i=2: data[data_index] is appended, so the buffer becomes 56, 3, 45. After the assignments, batch=[23,23,56,56,3,3], labels=[2,56,23,3,56,45] (possibly in a different order), and data_index=5, i.e. "missing".

The loop then ends and one batch of size 6 has been produced:

    batch=[23,23,56,56,3,3]        labels=[2,56,23,3,56,45]

Finally, data_index = (data_index + len(data) - span) % len(data) moves data_index back by span positions, to "looking". Because of global data_index, data_index is a global variable, so when the next batch is generated the buffer starts loading from "looking", i.e. the next batch picks up exactly where this one left off. The sketch below replays the same mechanics on a toy index sequence.
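A minimal sketch of just the sliding-window mechanics, on a made-up index sequence (the deque plays the role of buffer, and the center of the window is always the input word):

    import collections
    import random

    toy_data = [2, 23, 56, 3, 45, 84]   # made-up word indices
    skip_window, num_skips, batch_size = 1, 2, 6
    span = 2 * skip_window + 1

    buffer = collections.deque(toy_data[:span], maxlen=span)
    data_index = span
    batch, labels = [], []
    for _ in range(batch_size // num_skips):
        context_positions = [w for w in range(span) if w != skip_window]
        for pos in random.sample(context_positions, num_skips):
            batch.append(buffer[skip_window])   # the center word of the window
            labels.append(buffer[pos])          # one of its context words
        buffer.append(toy_data[data_index % len(toy_data)])  # slide the window one step
        data_index += 1

    print(batch)   # [23, 23, 56, 56, 3, 3]
    print(labels)  # e.g. [2, 56, 3, 23, 45, 56] -- context order is random within each window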

(7) Set the hyperparameters:

  
  
    batch_size = 128
    embedding_size = 128  # Dimension of the embedding vector.
    skip_window = 1       # How many words to consider left and right.
    num_skips = 2         # How many times to reuse an input to generate a label.
    num_sampled = 64      # Number of negative examples to sample.

    graph = tf.Graph()

These are just the sizes of the parameters discussed above. Note that the graph in the next section also uses valid_examples (a small validation set), which the excerpt above omits; see the sketch right below.
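For completeness, the validation set omitted from the excerpt above is defined in the official demo roughly as follows: 16 ids drawn from the 100 most frequent words, used later to print nearest neighbours during training.

    # Validation set (as in the official demo): sample word ids among the most
    # frequent words so the printed nearest neighbours are easy to judge by eye.
    valid_size = 16      # Number of random words to evaluate similarity on.
    valid_window = 100   # Only pick validation samples from the most frequent words.
    valid_examples = np.random.choice(valid_window, valid_size, replace=False)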

(8) The neural-network graph (the model):

  
  
    with graph.as_default():

      # Input data.
      with tf.name_scope('inputs'):
        train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
        train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
        valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

      # Ops and variables pinned to the CPU because of missing GPU implementation
      with tf.device('/cpu:0'):
        # Look up embeddings for inputs.
        with tf.name_scope('embeddings'):
          embeddings = tf.Variable(
              tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
          embed = tf.nn.embedding_lookup(embeddings, train_inputs)

        # Construct the variables for the NCE loss
        with tf.name_scope('weights'):
          nce_weights = tf.Variable(
              tf.truncated_normal(
                  [vocabulary_size, embedding_size],
                  stddev=1.0 / math.sqrt(embedding_size)))
        with tf.name_scope('biases'):
          nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

      # Compute the average NCE loss for the batch.
      # tf.nce_loss automatically draws a new sample of the negative labels each
      # time we evaluate the loss.
      # Explanation of the meaning of NCE loss:
      #   http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
      with tf.name_scope('loss'):
        loss = tf.reduce_mean(
            tf.nn.nce_loss(
                weights=nce_weights,
                biases=nce_biases,
                labels=train_labels,
                inputs=embed,
                num_sampled=num_sampled,
                num_classes=vocabulary_size))

      # Add the loss value as a scalar to summary.
      tf.summary.scalar('loss', loss)

      # Construct the SGD optimizer using a learning rate of 1.0.
      with tf.name_scope('optimizer'):
        optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

      # Compute the cosine similarity between minibatch examples and all embeddings.
      norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
      normalized_embeddings = embeddings / norm
      valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
                                                valid_dataset)
      similarity = tf.matmul(
          valid_embeddings, normalized_embeddings, transpose_b=True)

      # Merge all summaries.
      merged = tf.summary.merge_all()

      # Add variable initializer.
      init = tf.global_variables_initializer()

      # Create a saver.
      saver = tf.train.Saver()

This graph is best viewed as two parts: one part trains the Skip-Gram word embeddings, the other computes cosine similarities between the validation words and every embedding. Let's take them in turn.

First, the API documentation of tf.nn.embedding_lookup:

https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/embedding_ops.py

  
  
    def embedding_lookup(
        params,
        ids,
        partition_strategy="mod",
        name=None,
        validate_indices=True,  # pylint: disable=unused-argument
        max_norm=None):
      """Looks up `ids` in a list of embedding tensors.

      This function is used to perform parallel lookups on the list of
      tensors in `params`.  It is a generalization of
      @{tf.gather}, where `params` is
      interpreted as a partitioning of a large embedding tensor.  `params` may be
      a `PartitionedVariable` as returned by using `tf.get_variable()` with a
      partitioner.

      If `len(params) > 1`, each element `id` of `ids` is partitioned between
      the elements of `params` according to the `partition_strategy`.
      In all strategies, if the id space does not evenly divide the number of
      partitions, each of the first `(max_id + 1) % len(params)` partitions will
      be assigned one more id.

      If `partition_strategy` is `"mod"`, we assign each id to partition
      `p = id % len(params)`. For instance,
      13 ids are split across 5 partitions as:
      `[[0, 5, 10], [1, 6, 11], [2, 7, 12], [3, 8], [4, 9]]`

      If `partition_strategy` is `"div"`, we assign ids to partitions in a
      contiguous manner. In this case, 13 ids are split across 5 partitions as:
      `[[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10], [11, 12]]`

      The results of the lookup are concatenated into a dense
      tensor. The returned tensor has shape `shape(ids) + shape(params)[1:]`.
      ...

Note the last sentence: "The results of the lookup are concatenated into a dense tensor. The returned tensor has shape `shape(ids) + shape(params)[1:]`." So if params is 100*28 and ids is [2, 56, 3], the result is 3*28, namely rows 2, 56 and 3 of params (the 3rd, 57th and 4th rows counting from 1).

Reading further into the source, the function mainly delegates to _embedding_lookup_and_transform.
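A tiny sketch of this row-gather behaviour (TF 1.x style; the 4x3 table and the ids are made up):

    import numpy as np
    import tensorflow as tf

    params = tf.constant(np.arange(12, dtype=np.float32).reshape(4, 3))  # toy 4x3 "embedding table"
    ids = tf.constant([2, 0, 2])
    looked_up = tf.nn.embedding_lookup(params, ids)  # shape (3, 3): rows 2, 0 and 2 of params

    with tf.Session() as sess:
        print(sess.run(looked_up))
        # [[6. 7. 8.]
        #  [0. 1. 2.]
        #  [6. 7. 8.]]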

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Now for the core of this demo: the source of tf.nn.nce_loss.

Source: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/nn_impl.py

  
  
    def nce_loss(weights,
                 biases,
                 labels,
                 inputs,
                 num_sampled,
                 num_classes,
                 num_true=1,
                 sampled_values=None,
                 remove_accidental_hits=False,
                 partition_strategy="mod",
                 name="nce_loss"):
      """Computes and returns the noise-contrastive estimation training loss.

      See [Noise-contrastive estimation: A new estimation principle for
      unnormalized statistical
      models](http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf).
      Also see our [Candidate Sampling Algorithms
      Reference](https://www.tensorflow.org/extras/candidate_sampling.pdf)

      A common use case is to use this method for training, and calculate the full
      sigmoid loss for evaluation or inference. In this case, you must set
      `partition_strategy="div"` for the two losses to be consistent, as in the
      following example:

      ```python
      if mode == "train":
        loss = tf.nn.nce_loss(
            weights=weights,
            biases=biases,
            labels=labels,
            inputs=inputs,
            ...,
            partition_strategy="div")
      elif mode == "eval":
        logits = tf.matmul(inputs, tf.transpose(weights))
        logits = tf.nn.bias_add(logits, biases)
        labels_one_hot = tf.one_hot(labels, n_classes)
        loss = tf.nn.sigmoid_cross_entropy_with_logits(
            labels=labels_one_hot,
            logits=logits)
        loss = tf.reduce_sum(loss, axis=1)
      ```

      Note: By default this uses a log-uniform (Zipfian) distribution for sampling,
      so your labels must be sorted in order of decreasing frequency to achieve
      good results.  For more details, see
      @{tf.nn.log_uniform_candidate_sampler}.

      Note: In the case where `num_true` > 1, we assign to each target class
      the target probability 1 / `num_true` so that the target probabilities
      sum to 1 per-example.

      Note: It would be useful to allow a variable number of target classes per
      example.  We hope to provide this functionality in a future release.
      For now, if you have a variable number of target classes, you can pad them
      out to a constant number by either repeating them or by padding
      with an otherwise unused class.

      Args:
        weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
          objects whose concatenation along dimension 0 has shape
          [num_classes, dim].  The (possibly-partitioned) class embeddings.
        biases: A `Tensor` of shape `[num_classes]`.  The class biases.
        labels: A `Tensor` of type `int64` and shape `[batch_size,
          num_true]`. The target classes.
        inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward
          activations of the input network.
        num_sampled: An `int`.  The number of classes to randomly sample per batch.
        num_classes: An `int`. The number of possible classes.
        num_true: An `int`.  The number of target classes per training example.
        sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
          `sampled_expected_count`) returned by a `*_candidate_sampler` function.
          (if None, we default to `log_uniform_candidate_sampler`)
        remove_accidental_hits:  A `bool`.  Whether to remove "accidental hits"
          where a sampled class equals one of the target classes.  If set to
          `True`, this is a "Sampled Logistic" loss instead of NCE, and we are
          learning to generate log-odds instead of log probabilities.  See
          our [Candidate Sampling Algorithms Reference]
          (https://www.tensorflow.org/extras/candidate_sampling.pdf).
          Default is False.
        partition_strategy: A string specifying the partitioning strategy, relevant
          if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
          Default is `"mod"`. See `tf.nn.embedding_lookup` for more details.
        name: A name for the operation (optional).

      Returns:
        A `batch_size` 1-D tensor of per-example NCE losses.
      """
      logits, labels = _compute_sampled_logits(
          weights=weights,
          biases=biases,
          labels=labels,
          inputs=inputs,
          num_sampled=num_sampled,
          num_classes=num_classes,
          num_true=num_true,
          sampled_values=sampled_values,
          subtract_log_q=True,
          remove_accidental_hits=remove_accidental_hits,
          partition_strategy=partition_strategy,
          name=name)
      sampled_losses = sigmoid_cross_entropy_with_logits(
          labels=labels, logits=logits, name="sampled_losses")
      # sampled_losses is batch_size x {true_loss, sampled_losses...}
      # We sum out true and sampled losses.
      return _sum_rows(sampled_losses)

First, the signature:

  
  
    def nce_loss(weights,
                 biases,
                 labels,
                 inputs,
                 num_sampled,
                 num_classes,
                 num_true=1,
                 sampled_values=None,
                 remove_accidental_hits=False,
                 partition_strategy="mod",
                 name="nce_loss"):

Suppose M = 50000 (the vocabulary size in this demo) and N = 128 (the embedding dimension). Then:

weights: M*N

biases: M (one bias per class, i.e. per vocabulary word)

labels: batch_size * num_true (num_true is the number of positive classes per example; 1 in this demo)

inputs: batch_size * N

num_sampled: the number of negative samples to draw

num_classes: M

sampled_values: an optional custom sampler, i.e. a tuple (`sampled_candidates`, `true_expected_count`, `sampled_expected_count`); if None, log_uniform_candidate_sampler is used

remove_accidental_hits: whether to discard negative samples that accidentally equal the target

partition_strategy: the partitioning strategy for parallel lookups.

The return value is a 1-D tensor of length batch_size containing the NCE loss of every example in the batch.

Now to the implementation, which consists of three parts:

_compute_sampled_logits — draw the negative samples and compute the logits

sigmoid_cross_entropy_with_logits — logistic regression on each logit

_sum_rows — sum the per-logit losses of each example.
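Before diving into the source, here is a rough numpy sketch of what those three steps amount to for one batch. It uses uniform negative sampling and skips the log-Q correction, so it is a simplification of the real nce_loss, meant only to show the shapes and the flow:

    import numpy as np

    rng = np.random.default_rng(0)
    M, N, B, K = 50, 8, 4, 5           # vocab size, dim, batch size, negatives (toy numbers)
    weights = rng.normal(size=(M, N))  # output embeddings
    biases = np.zeros(M)
    inputs = rng.normal(size=(B, N))   # input embeddings of the batch
    labels = rng.integers(0, M, size=B)

    # 1) "_compute_sampled_logits": score the true class and K sampled classes per example.
    sampled = rng.choice(M, size=K, replace=False)
    true_logits = np.sum(inputs * weights[labels], axis=1) + biases[labels]      # (B,)
    sampled_logits = inputs @ weights[sampled].T + biases[sampled]               # (B, K)
    logits = np.concatenate([true_logits[:, None], sampled_logits], axis=1)      # (B, 1+K)
    targets = np.concatenate([np.ones((B, 1)), np.zeros((B, K))], axis=1)        # (B, 1+K)

    # 2) "sigmoid_cross_entropy_with_logits": element-wise logistic loss.
    per_logit_loss = np.maximum(logits, 0) - logits * targets + np.log1p(np.exp(-np.abs(logits)))

    # 3) "_sum_rows": one scalar loss per example.
    per_example_loss = per_logit_loss.sum(axis=1)   # shape (B,)
    print(per_example_loss)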

(1) A look at _compute_sampled_logits:

  
  
    def _compute_sampled_logits(weights,
                                biases,
                                labels,
                                inputs,
                                num_sampled,
                                num_classes,
                                num_true=1,
                                sampled_values=None,
                                subtract_log_q=True,
                                remove_accidental_hits=False,
                                partition_strategy="mod",
                                name=None,
                                seed=None):
      """Helper function for nce_loss and sampled_softmax_loss functions.

      Computes sampled output training logits and labels suitable for implementing
      e.g. noise-contrastive estimation (see nce_loss) or sampled softmax (see
      sampled_softmax_loss).

      Note: In the case where num_true > 1, we assign to each target class
      the target probability 1 / num_true so that the target probabilities
      sum to 1 per-example.

      Args:
        weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
          objects whose concatenation along dimension 0 has shape
          `[num_classes, dim]`.  The (possibly-partitioned) class embeddings.
        biases: A `Tensor` of shape `[num_classes]`.  The (possibly-partitioned)
          class biases.
        labels: A `Tensor` of type `int64` and shape `[batch_size,
          num_true]`. The target classes.  Note that this format differs from
          the `labels` argument of `nn.softmax_cross_entropy_with_logits_v2`.
        inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward
          activations of the input network.
        num_sampled: An `int`.  The number of classes to randomly sample per batch.
        num_classes: An `int`. The number of possible classes.
        num_true: An `int`.  The number of target classes per training example.
        sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
          `sampled_expected_count`) returned by a `*_candidate_sampler` function.
          (if None, we default to `log_uniform_candidate_sampler`)
        subtract_log_q: A `bool`.  whether to subtract the log expected count of
          the labels in the sample to get the logits of the true labels.
          Default is True.  Turn off for Negative Sampling.
        remove_accidental_hits:  A `bool`.  whether to remove "accidental hits"
          where a sampled class equals one of the target classes.  Default is
          False.
        partition_strategy: A string specifying the partitioning strategy, relevant
          if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
          Default is `"mod"`. See `tf.nn.embedding_lookup` for more details.
        name: A name for the operation (optional).
        seed: random seed for candidate sampling. Default to None, which doesn't set
          the op-level random seed for candidate sampling.

      Returns:
        out_logits: `Tensor` object with shape
            `[batch_size, num_true + num_sampled]`, for passing to either
            `nn.sigmoid_cross_entropy_with_logits` (NCE) or
            `nn.softmax_cross_entropy_with_logits_v2` (sampled softmax).
        out_labels: A Tensor object with the same shape as `out_logits`.
      """
      if isinstance(weights, variables.PartitionedVariable):
        weights = list(weights)
      if not isinstance(weights, list):
        weights = [weights]

      with ops.name_scope(name, "compute_sampled_logits",
                          weights + [biases, inputs, labels]):
        if labels.dtype != dtypes.int64:
          labels = math_ops.cast(labels, dtypes.int64)
        labels_flat = array_ops.reshape(labels, [-1])

        # Sample the negative labels.
        #   sampled shape: [num_sampled] tensor
        #   true_expected_count shape = [batch_size, 1] tensor
        #   sampled_expected_count shape = [num_sampled] tensor
        if sampled_values is None:
          sampled_values = candidate_sampling_ops.log_uniform_candidate_sampler(
              true_classes=labels,
              num_true=num_true,
              num_sampled=num_sampled,
              unique=True,
              range_max=num_classes,
              seed=seed)
        # NOTE: pylint cannot tell that 'sampled_values' is a sequence
        # pylint: disable=unpacking-non-sequence
        sampled, true_expected_count, sampled_expected_count = (
            array_ops.stop_gradient(s) for s in sampled_values)
        # pylint: enable=unpacking-non-sequence
        sampled = math_ops.cast(sampled, dtypes.int64)

        # labels_flat is a [batch_size * num_true] tensor
        # sampled is a [num_sampled] int tensor
        all_ids = array_ops.concat([labels_flat, sampled], 0)

        # Retrieve the true weights and the logits of the sampled weights.

        # weights shape is [num_classes, dim]
        all_w = embedding_ops.embedding_lookup(
            weights, all_ids, partition_strategy=partition_strategy)

        # true_w shape is [batch_size * num_true, dim]
        true_w = array_ops.slice(all_w, [0, 0],
                                 array_ops.stack(
                                     [array_ops.shape(labels_flat)[0], -1]))

        sampled_w = array_ops.slice(
            all_w, array_ops.stack([array_ops.shape(labels_flat)[0], 0]), [-1, -1])
        # inputs has shape [batch_size, dim]
        # sampled_w has shape [num_sampled, dim]
        # Apply X*W', which yields [batch_size, num_sampled]
        sampled_logits = math_ops.matmul(inputs, sampled_w, transpose_b=True)

        # Retrieve the true and sampled biases, compute the true logits, and
        # add the biases to the true and sampled logits.
        all_b = embedding_ops.embedding_lookup(
            biases, all_ids, partition_strategy=partition_strategy)
        # true_b is a [batch_size * num_true] tensor
        # sampled_b is a [num_sampled] float tensor
        true_b = array_ops.slice(all_b, [0], array_ops.shape(labels_flat))
        sampled_b = array_ops.slice(all_b, array_ops.shape(labels_flat), [-1])

        # inputs shape is [batch_size, dim]
        # true_w shape is [batch_size * num_true, dim]
        # row_wise_dots is [batch_size, num_true, dim]
        dim = array_ops.shape(true_w)[1:2]
        new_true_w_shape = array_ops.concat([[-1, num_true], dim], 0)
        row_wise_dots = math_ops.multiply(
            array_ops.expand_dims(inputs, 1),
            array_ops.reshape(true_w, new_true_w_shape))
        # We want the row-wise dot plus biases which yields a
        # [batch_size, num_true] tensor of true_logits.
        dots_as_matrix = array_ops.reshape(row_wise_dots,
                                           array_ops.concat([[-1], dim], 0))
        true_logits = array_ops.reshape(_sum_rows(dots_as_matrix), [-1, num_true])
        true_b = array_ops.reshape(true_b, [-1, num_true])
        true_logits += true_b
        sampled_logits += sampled_b

        if remove_accidental_hits:
          acc_hits = candidate_sampling_ops.compute_accidental_hits(
              labels, sampled, num_true=num_true)
          acc_indices, acc_ids, acc_weights = acc_hits

          # This is how SparseToDense expects the indices.
          acc_indices_2d = array_ops.reshape(acc_indices, [-1, 1])
          acc_ids_2d_int32 = array_ops.reshape(
              math_ops.cast(acc_ids, dtypes.int32), [-1, 1])
          sparse_indices = array_ops.concat([acc_indices_2d, acc_ids_2d_int32], 1,
                                            "sparse_indices")
          # Create sampled_logits_shape = [batch_size, num_sampled]
          sampled_logits_shape = array_ops.concat(
              [array_ops.shape(labels)[:1],
               array_ops.expand_dims(num_sampled, 0)], 0)
          if sampled_logits.dtype != acc_weights.dtype:
            acc_weights = math_ops.cast(acc_weights, sampled_logits.dtype)
          sampled_logits += sparse_ops.sparse_to_dense(
              sparse_indices,
              sampled_logits_shape,
              acc_weights,
              default_value=0.0,
              validate_indices=False)

        if subtract_log_q:
          # Subtract log of Q(l), prior probability that l appears in sampled.
          true_logits -= math_ops.log(true_expected_count)
          sampled_logits -= math_ops.log(sampled_expected_count)

        # Construct output logits and labels. The true labels/logits start at col 0.
        out_logits = array_ops.concat([true_logits, sampled_logits], 1)

        # true_logits is a float tensor, ones_like(true_logits) is a float
        # tensor of ones. We then divide by num_true to ensure the per-example
        # labels sum to 1.0, i.e. form a proper probability distribution.
        out_labels = array_ops.concat([
            array_ops.ones_like(true_logits) / num_true,
            array_ops.zeros_like(sampled_logits)
        ], 1)

      return out_logits, out_labels

First note the return shapes documented at the top:

  
  
    Returns:
      out_logits: `Tensor` object with shape
          `[batch_size, num_true + num_sampled]`, for passing to either
          `nn.sigmoid_cross_entropy_with_logits` (NCE) or
          `nn.softmax_cross_entropy_with_logits_v2` (sampled softmax).
      out_labels: A Tensor object with the same shape as `out_logits`.

So both out_logits and out_labels have shape [batch_size, num_true + num_sampled], where num_true + num_sampled is the number of positive plus negative samples.

Now look at the very end of the function:

  
  
    out_labels = array_ops.concat([
        array_ops.ones_like(true_logits) / num_true,
        array_ops.zeros_like(sampled_logits)
    ], 1)

array_ops.ones_like and array_ops.zeros_like simply build tensors of all ones and all zeros, as can be seen in the source:

https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/array_ops.py

  
  
    @tf_export("ones_like")
    def ones_like(tensor, dtype=None, name=None, optimize=True):
      """Creates a tensor with all elements set to 1.

      Given a single tensor (`tensor`), this operation returns a tensor of the same
      type and shape as `tensor` with all elements set to 1. Optionally, you can
      specify a new type (`dtype`) for the returned tensor.

      For example:

      ```python
      tensor = tf.constant([[1, 2, 3], [4, 5, 6]])
      tf.ones_like(tensor)  # [[1, 1, 1], [1, 1, 1]]
      ```

      Args:
        tensor: A `Tensor`.
        dtype: A type for the returned `Tensor`. Must be `float32`, `float64`,
          `int8`, `uint8`, `int16`, `uint16`, `int32`, `int64`,
          `complex64`, `complex128` or `bool`.
        name: A name for the operation (optional).
        optimize: if true, attempt to statically determine the shape of 'tensor'
          and encode it as a constant.

      Returns:
        A `Tensor` with all elements set to 1.
      """
      with ops.name_scope(name, "ones_like", [tensor]) as name:
        tensor = ops.convert_to_tensor(tensor, name="tensor")
        ones_shape = shape_internal(tensor, optimize=optimize)
        if dtype is None:
          dtype = tensor.dtype
        ret = ones(ones_shape, dtype=dtype, name=name)
        if not context.executing_eagerly():
          ret.set_shape(tensor.get_shape())
        return ret

To summarize:

out_logits holds, for every example, the scores of its true target word and of the sampled negative words.

out_labels marks those positions: positive samples are labeled 1 and negative samples are labeled 0.

This is the essence of negative sampling: each position only has to be classified as "true target" or "noise", so the model solves a handful of binary classifications instead of predicting over the whole 50,000-word vocabulary, which greatly reduces the computation. In this demo that is 1 + 64 = 65 logits per example instead of 50,000.

Both tensors have shape [batch_size, num_true + num_sampled], i.e. [128, 65] here.

Also, because sampled_values=None in this demo, the following branch is taken:

  
  
    if sampled_values is None:
      sampled_values = candidate_sampling_ops.log_uniform_candidate_sampler(
          true_classes=labels,
          num_true=num_true,
          num_sampled=num_sampled,
          unique=True,
          range_max=num_classes,
          seed=seed)

So the sampler used is candidate_sampling_ops.log_uniform_candidate_sampler:

https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/candidate_sampling_ops.py

  
  
    def log_uniform_candidate_sampler(true_classes, num_true, num_sampled, unique,
                                      range_max, seed=None, name=None):
      """Samples a set of classes using a log-uniform (Zipfian) base distribution.

      This operation randomly samples a tensor of sampled classes
      (`sampled_candidates`) from the range of integers `[0, range_max)`.

      The elements of `sampled_candidates` are drawn without replacement
      (if `unique=True`) or with replacement (if `unique=False`) from
      the base distribution.

      The base distribution for this operation is an approximately log-uniform
      or Zipfian distribution:

      `P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)`

      This sampler is useful when the target classes approximately follow such
      a distribution - for example, if the classes represent words in a lexicon
      sorted in decreasing order of frequency. If your classes are not ordered by
      decreasing frequency, do not use this op.

      In addition, this operation returns tensors `true_expected_count`
      and `sampled_expected_count` representing the number of times each
      of the target classes (`true_classes`) and the sampled
      classes (`sampled_candidates`) is expected to occur in an average
      tensor of sampled classes.  These values correspond to `Q(y|x)`
      defined in [this
      document](http://www.tensorflow.org/extras/candidate_sampling.pdf).
      If `unique=True`, then these are post-rejection probabilities and we
      compute them approximately.

      Args:
        true_classes: A `Tensor` of type `int64` and shape `[batch_size,
          num_true]`. The target classes.
        num_true: An `int`.  The number of target classes per training example.
        num_sampled: An `int`.  The number of classes to randomly sample.
        unique: A `bool`. Determines whether all sampled classes in a batch are
          unique.
        range_max: An `int`. The number of possible classes.
        seed: An `int`. An operation-specific seed. Default is 0.
        name: A name for the operation (optional).

      Returns:
        sampled_candidates: A tensor of type `int64` and shape `[num_sampled]`.
          The sampled classes.
        true_expected_count: A tensor of type `float`.  Same shape as
          `true_classes`. The expected counts under the sampling distribution
          of each of `true_classes`.
        sampled_expected_count: A tensor of type `float`. Same shape as
          `sampled_candidates`. The expected counts under the sampling distribution
          of each of `sampled_candidates`.
      """
      seed1, seed2 = random_seed.get_seed(seed)
      return gen_candidate_sampling_ops.log_uniform_candidate_sampler(
          true_classes, num_true, num_sampled, unique, range_max, seed=seed1,
          seed2=seed2, name=name)

So negative samples are drawn according to the distribution below. Raw word frequency is not used directly as the sampling probability, because then the negatives would almost always be very high-frequency words such as "and", "of", "i", which is clearly not useful. The other extreme, the inverse of the frequency, is not representative either. According to Mikolov's paper, the empirically best compromise is

P(w_i) = \frac{f(w_i)^{3/4}}{\sum_{j=1}^{50000} f(w_j)^{3/4}}

This demo does not use that formula, but its sampler also sits between the two extremes: P(class) below is a decreasing function of class, i.e. the smaller the class id, the larger P(class). Here the class id is the word's index, and from section (5) we know that higher frequency means a smaller index (UNK aside), so frequent words are still the most likely to be drawn as negative samples.

P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)
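A quick numeric check of that distribution (vocabulary size 50000, a few made-up class ids) shows how strongly it favours small ids, i.e. frequent words:

    import math

    range_max = 50000  # vocabulary size in this demo
    for c in [0, 1, 10, 100, 1000, 49999]:
        p = (math.log(c + 2) - math.log(c + 1)) / math.log(range_max + 1)
        print(c, p)
    # class id 0     -> P ≈ 0.064   (most frequent word, most likely negative)
    # class id 1     -> P ≈ 0.037
    # class id 10    -> P ≈ 0.008
    # class id 100   -> P ≈ 0.0009
    # class id 1000  -> P ≈ 0.00009
    # class id 49999 -> P ≈ 0.000002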

(2) Next, sigmoid_cross_entropy_with_logits:

  
  
    def sigmoid_cross_entropy_with_logits(  # pylint: disable=invalid-name
        _sentinel=None,
        labels=None,
        logits=None,
        name=None):
      """Computes sigmoid cross entropy given `logits`.

      Measures the probability error in discrete classification tasks in which each
      class is independent and not mutually exclusive.  For instance, one could
      perform multilabel classification where a picture can contain both an elephant
      and a dog at the same time.

      For brevity, let `x = logits`, `z = labels`.  The logistic loss is

            z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
          = z * -log(1 / (1 + exp(-x))) + (1 - z) * -log(exp(-x) / (1 + exp(-x)))
          = z * log(1 + exp(-x)) + (1 - z) * (-log(exp(-x)) + log(1 + exp(-x)))
          = z * log(1 + exp(-x)) + (1 - z) * (x + log(1 + exp(-x))
          = (1 - z) * x + log(1 + exp(-x))
          = x - x * z + log(1 + exp(-x))

      For x < 0, to avoid overflow in exp(-x), we reformulate the above

            x - x * z + log(1 + exp(-x))
          = log(exp(x)) - x * z + log(1 + exp(-x))
          = - x * z + log(1 + exp(x))

      Hence, to ensure stability and avoid overflow, the implementation uses this
      equivalent formulation

          max(x, 0) - x * z + log(1 + exp(-abs(x)))

      `logits` and `labels` must have the same type and shape.

      Args:
        _sentinel: Used to prevent positional parameters. Internal, do not use.
        labels: A `Tensor` of the same type and shape as `logits`.
        logits: A `Tensor` of type `float32` or `float64`.
        name: A name for the operation (optional).

      Returns:
        A `Tensor` of the same shape as `logits` with the componentwise
        logistic losses.

      Raises:
        ValueError: If `logits` and `labels` do not have the same shape.
      """
      # pylint: disable=protected-access
      nn_ops._ensure_xent_args("sigmoid_cross_entropy_with_logits", _sentinel,
                               labels, logits)
      # pylint: enable=protected-access

      with ops.name_scope(name, "logistic_loss", [logits, labels]) as name:
        logits = ops.convert_to_tensor(logits, name="logits")
        labels = ops.convert_to_tensor(labels, name="labels")
        try:
          labels.get_shape().merge_with(logits.get_shape())
        except ValueError:
          raise ValueError("logits and labels must have the same shape (%s vs %s)" %
                           (logits.get_shape(), labels.get_shape()))

        # The logistic loss formula from above is
        #   x - x * z + log(1 + exp(-x))
        # For x < 0, a more numerically stable formula is
        #   -x * z + log(1 + exp(x))
        # Note that these two expressions can be combined into the following:
        #   max(x, 0) - x * z + log(1 + exp(-abs(x)))
        # To allow computing gradients at zero, we define custom versions of max and
        # abs functions.
        zeros = array_ops.zeros_like(logits, dtype=logits.dtype)
        cond = (logits >= zeros)
        relu_logits = array_ops.where(cond, logits, zeros)
        neg_abs_logits = array_ops.where(cond, -logits, logits)
        return math_ops.add(
            relu_logits - logits * labels,
            math_ops.log1p(math_ops.exp(neg_abs_logits)),
            name=name)

The key formula is:

z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))

Note that z * -log(x) + (1 - z) * -log(1 - x) is just the cross entropy; in other words, this function applies a sigmoid to its input and then computes the cross entropy.

As derived above, this simplifies to: x - x * z + log(1 + exp(-x))

Since exp(-x) can overflow when x < 0, the implementation instead uses the equivalent, numerically stable form max(x, 0) - x * z + log(1 + exp(-abs(x))); a small sketch follows.
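A minimal numpy sketch of why that reformulation matters (the logit value is chosen arbitrarily to force the overflow):

    import numpy as np

    def naive_loss(x, z):
        # x - x*z + log(1 + exp(-x)): exp(-x) overflows for very negative x
        return x - x * z + np.log1p(np.exp(-x))

    def stable_loss(x, z):
        # max(x, 0) - x*z + log(1 + exp(-|x|)): the formulation used by the TF implementation
        return np.maximum(x, 0) - x * z + np.log1p(np.exp(-np.abs(x)))

    x, z = np.float64(-1000.0), np.float64(0.0)
    print(naive_loss(x, z))   # inf (plus an overflow warning): exp(1000) blows up
    print(stable_loss(x, z))  # 0.0 -- the correct loss for a very negative logit with label 0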
