腳本註釋3

train_mono.sh

#!/bin/bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0


# To be run from ..
# Flat start and monophone training, with delta-delta features.
# This script applies cepstral mean normalization (per speaker).

# Begin configuration section.
# Every variable below is a default; the script sources parse_options.sh right
# after this section, before reading its positional arguments.
# Number of parallel jobs (used as JOB=1:$nj when invoking $cmd).
nj=4
cmd=run.pl
# Scaling options handed to gmm-align-compiled when realigning.
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
# Upper bound of the training loop (it runs while the pass counter is < num_iters).
num_iters=40    # Number of iterations of training
# Last pass on which the Gaussian target is still increased.
max_iter_inc=30 # Last iter to increase #Gauss on.
# Target total number of Gaussians.
totgauss=1000 # Target #Gaussians.
# Forwarded as --careful to gmm-align-compiled.
careful=false
# Forwarded as --boost to gmm-boost-silence before each realignment.
boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment
# Passes on which the data is realigned.
realign_iters="1 2 3 4 5 6 7 8 9 10 12 14 16 18 20 23 26 29 32 35 38";
# Name of an optional config file.
config= # name of config file.
# Steps guarded by "[ $stage -le N ]" are skipped when stage is greater than N.
stage=-4
# Forwarded as --power to gmm-est.
power=0.25 # exponent to determine number of gaussians from occurrence counts
# Deprecated switch; when true, "--norm-vars=true" is prepended to cmvn_opts.
norm_vars=false # deprecated, prefer --cmvn-opts "--norm-vars=false"
# Extra options for apply-cmvn.
cmvn_opts=  # can be used to add extra options to cmvn.
# End configuration section.

# Log the exact invocation.
echo "$0 $@"

# Source the environment setup when it exists, then let parse_options.sh
# consume the --option arguments.
[ -f path.sh ] && . ./path.sh
. parse_options.sh || exit 1

# Exactly three positional arguments must remain after option parsing.
if [ $# -ne 3 ]; then
  cat <<'EOF'
Usage: steps/train_mono.sh [options] <data-dir> <lang-dir> <exp-dir>
 e.g.: steps/train_mono.sh data/train.1k data/lang exp/mono
main options (for others, see top of script file)
  --config <config-file>                           # config containing options
  --nj <nj>                                        # number of parallel jobs
  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.
EOF
  exit 1
fi

# Positional arguments: data directory, lang directory, experiment directory.
data=$1
lang=$2
dir=$3

# Contents of $lang/oov.int; later passed to sym2int.pl --map-oov.
oov_sym=$(cat $lang/oov.int) || exit 1

# Create the log directory and record the number of jobs.
mkdir -p $dir/log
echo $nj > $dir/num_jobs

# Per-job split of the data directory.  It is (re)generated unless the split
# directory already exists and feats.scp is older than it.
sdata=$data/split$nj
if ! [[ -d $sdata && $data/feats.scp -ot $sdata ]]; then
  split_data.sh $data $nj || exit 1
fi

# Keep a copy of the phone symbol table next to the models.
cp $lang/phones.txt $dir || exit 1

# Deprecated --norm-vars switch: fold it into the CMVN options.
if $norm_vars; then
  cmvn_opts="--norm-vars=true $cmvn_opts"
fi
# Keep track of the options given to CMVN.
echo $cmvn_opts > $dir/cmvn_opts


# Feature rspecifier, with the literal word JOB standing in for the job number:
#   apply-cmvn reads utt2spk, cmvn.scp and feats.scp from $sdata/JOB and writes
#   to ark:-, which is piped into add-deltas (ark:- in, ark:- out).
# NOTE(review): per the original annotations, apply-cmvn performs cepstral
# mean/variance normalisation and add-deltas appends delta features (e.g.
# 13-dim MFCC becomes 39-dim) -- confirm against the Kaldi tool docs.
feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |"
# The same pipeline with every JOB replaced by 1 (via sed).
example_feats="`echo $feats | sed s/JOB/1/g`";


echo "$0: Initializing monophone system."
# The phone-set list is required for --shared-phones below.  Report the
# missing file instead of exiting silently, so the failure can be diagnosed.
if [ ! -f $lang/phones/sets.int ]; then
  echo "$0: expected file $lang/phones/sets.int to exist" >&2
  exit 1
fi
# Option passed to gmm-init-mono.
shared_phones_opt="--shared-phones=$lang/phones/sets.int"

# Stage -3: create the initial model with gmm-init-mono.
#   Usage:  gmm-init-mono <topology-in> <dim> <model-out> <tree-out>
# Here the topology is $lang/topo, the dimension comes from feat-to-dim, and
# the outputs are $dir/0.mdl and $dir/tree.
# NOTE(review): per the original annotations, gmm-init-mono computes the global
# per-dimension mean and variance of the supplied features, uses them for
# every GMM, and builds the context dependency from topo and sets.int --
# confirm against the Kaldi tool docs.
if [ $stage -le -3 ]; then
  # Note: JOB=1 just uses the 1st part of the features-- we only need a subset anyway.
  # Query the feature dimension with stderr discarded; a failing command or an
  # empty result is treated as an error.
  if ! feat_dim=`feat-to-dim "$example_feats" - 2>/dev/null` || [ -z $feat_dim ]; then
    # Run it again without hiding stderr so the actual error is shown.
    feat-to-dim "$example_feats" -
    echo "error getting feature dimension"
    exit 1;
  fi
  # The training features are cut down with "subset-feats --n=10" before being
  # given to gmm-init-mono via --train-feats.
  $cmd JOB=1 $dir/log/init.log \
    gmm-init-mono $shared_phones_opt "--train-feats=$feats subset-feats --n=10 ark:- ark:-|" $lang/topo $feat_dim \
    $dir/0.mdl $dir/tree || exit 1;
fi

# Current Gaussian count of the initial model: the last field of the
# "gaussians" line printed by gmm-info.
numgauss=$(gmm-info --print-args=false $dir/0.mdl | awk '/gaussians/ {print $NF}')
# Per-pass increment (integer division) that moves the count from numgauss
# towards totgauss over max_iter_inc passes.
incgauss=$(( ($totgauss - $numgauss) / $max_iter_inc ))
echo "目標總高斯數: $totgauss!"
echo "初始化高斯數: $numgauss!"


if [ $stage -le -2 ]; then
  # Compile the training graphs from $dir/tree, $dir/0.mdl and $lang/L.fst.
  #   Usage: compile-train-graphs [options] <tree-in> <model-in> <lexicon-fst-in>  <transcriptions-rspecifier> <graphs-wspecifier>
  #   e.g.: compile-train-graphs tree 0.mdl lex.fst ark:train.tra ark:graphs.fsts
  # The transcripts in $sdata/JOB/text are converted to integers by sym2int.pl
  # using $lang/words.txt (fields 2 onwards), with --map-oov $oov_sym.
  # Each job's graphs are gzip-compressed into $dir/fsts.JOB.gz.
  # NOTE(review): the original annotation describes each per-utterance FST as an
  # HCLG without transition probabilities -- confirm against the Kaldi docs.
  echo "$0: Compiling training graphs"
  $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
    compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/0.mdl  $lang/L.fst \
    "ark:sym2int.pl --map-oov $oov_sym -f 2- $lang/words.txt < $sdata/JOB/text|" \
    "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
fi

# Pass 0: equal alignment.
# align-equal-compiled reads the compiled graphs and the features; its output
# (ark,t:-) is piped into gmm-acc-stats-ali, which accumulates statistics
# against 0.mdl and writes them to $dir/0.JOB.acc.
# NOTE(review): per the original annotations, no model good enough for
# alignment exists yet, so the feature sequence is simply divided into equal
# segments, one per label (e.g. 100 frames with 5 labels: frames 1-20 go to
# the first label, 21-40 to the second, ...); the alignment is then refined by
# the realignment passes of the training loop.  Confirm against the Kaldi docs.
if [ $stage -le -1 ]; then
  echo "$0: Aligning data equally (pass 0)"
  $cmd JOB=1:$nj $dir/log/align.0.JOB.log \
    align-equal-compiled "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" ark,t:-  \| \
    gmm-acc-stats-ali --binary=true $dir/0.mdl "$feats" ark:- \
    $dir/0.JOB.acc || exit 1;
fi

# In the following steps, the --min-gaussian-occupancy=3 option is important, otherwise
# we fail to est "rare" phones and later on, they never align properly.
# NOTE(review): per the original annotations, a Gaussian component whose
# occupancy is below this threshold is not updated, and it is removed when
# --remove-low-count-gaussians=true -- confirm against the gmm-est docs.
#
# First model update.
#   Usage:  gmm-est [options] <model-in> <stats-in> <model-out>
#   e.g.: gmm-est 1.mdl 1.acc 2.mdl
# The pass-0 accumulators ($dir/0.*.acc) are summed with gmm-sum-accs, the
# result is written to $dir/1.mdl, and the accumulators are deleted afterwards.
if [ $stage -le 0 ]; then
  gmm-est --min-gaussian-occupancy=3  --mix-up=$numgauss --power=$power \
    $dir/0.mdl "gmm-sum-accs - $dir/0.*.acc|" $dir/1.mdl 2> $dir/log/update.0.log || exit 1;
  rm $dir/0.*.acc
fi


beam=6 # will change to 10 below after 1st pass
# note: using slightly wider beams for WSJ vs. RM.
x=1

# Main training loop; x runs from 1 while x < num_iters.  For every pass that
# is not skipped by $stage:
#   1. if x is listed in realign_iters, realign the data with
#      gmm-align-compiled, reading the model through gmm-boost-silence, and
#      store the alignments in $dir/ali.JOB.gz;
#   2. accumulate statistics from the current alignments (gmm-acc-stats-ali);
#   3. re-estimate the model (gmm-est), writing $dir/<x+1>.mdl and
#      $dir/<x+1>.occs, then delete the model, accumulators and occs of pass x.
# The --mix-up target (numgauss) grows by incgauss after each pass with
# x <= max_iter_inc, and the beam is widened to 10 after the first pass.
#
# NOTE(review): the remarks below come from the original annotations and
# describe the Kaldi tools rather than this script; confirm against the docs.
#  - gmm-boost-silence multiplies the pdf weights of the given phones by
#    --boost, raising (>1) or lowering (<1) their likelihood.
#  - --acoustic-scale balances GMM likelihoods against HMM transition
#    probabilities.
#  - --beam is the pruning threshold for low-likelihood tokens; a smaller value
#    is faster but may prune the correct path early on.
#  - --retry-beam is the wider beam tried when no path is found with --beam;
#    if that also fails a warning is issued (bad transcript or retry-beam too
#    small).
while [ $x -lt $num_iters ]; do
  echo "$0: Pass $x"
  if [ $stage -le $x ]; then
    # grep -w: match the pass number as a whole word in the list.
    if echo $realign_iters | grep -w $x >/dev/null; then
      echo "$0: Aligning data"
      mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/$x.mdl - |"
      $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
        gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$[$beam*4] --careful=$careful "$mdl" \
        "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" "ark,t:|gzip -c >$dir/ali.JOB.gz" \
        || exit 1;
    fi
    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
      gmm-acc-stats-ali  $dir/$x.mdl "$feats" "ark:gunzip -c $dir/ali.JOB.gz|" \
      $dir/$x.JOB.acc || exit 1;

    $cmd $dir/log/update.$x.log \
      gmm-est --write-occs=$dir/$[$x+1].occs --mix-up=$numgauss --power=$power $dir/$x.mdl \
      "gmm-sum-accs - $dir/$x.*.acc|" $dir/$[$x+1].mdl || exit 1;
    # rm errors are hidden: e.g. 1.occs is never written, because the stage-0
    # gmm-est call is made without --write-occs.
    rm $dir/$x.mdl $dir/$x.*.acc $dir/$x.occs 2>/dev/null
  fi  
  if [ $x -le $max_iter_inc ]; then
     numgauss=$[$numgauss+$incgauss];
  fi
  beam=10
  x=$[$x+1]
done

# Point final.mdl / final.occs at the last model and occupation counts.
# Any old links are removed first (errors from rm are hidden).
(
  cd $dir
  rm final.mdl final.occs 2>/dev/null
  ln -s $x.mdl final.mdl
  ln -s $x.occs final.occs
)

# Analysis and diagnostics on the finished experiment directory.
steps/diagnostic/analyze_alignments.sh --cmd "$cmd" $lang $dir
utils/summarize_warnings.pl $dir/log
steps/info/gmm_dir_info.pl $dir

echo "$0: Done training monophone system in $dir"

exit 0

# example of showing the alignments:
# show-alignments data/lang/phones.txt $dir/30.mdl "ark:gunzip -c $dir/ali.0.gz|" | head -4

 

prepare_lang.sh

#!/bin/bash
# Copyright 2012-2013  Johns Hopkins University (Author: Daniel Povey);
#                      Arnab Ghoshal
#                2014  Guoguo Chen
#                2015  Hainan Xu
#                2016  FAU Erlangen (Author: Axel Horndasch)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This script prepares a directory such as data/lang/, in the standard format,
# given a source directory containing a dictionary lexicon.txt in a form like:
# word phone1 phone2 ... phoneN
# per line (alternate prons would be separate lines), or a dictionary with probabilities
# called lexiconp.txt in a form:
# word pron-prob phone1 phone2 ... phoneN
# (with 0.0 < pron-prob <= 1.0); note: if lexiconp.txt exists, we use it even if
# lexicon.txt exists.
# and also files silence_phones.txt, nonsilence_phones.txt, optional_silence.txt
# and extra_questions.txt
# Here, silence_phones.txt and nonsilence_phones.txt are lists of silence and
# non-silence phones respectively (where silence includes various kinds of
# noise, laugh, cough, filled pauses etc., and nonsilence phones includes the
# "real" phones.)
# In each line of those files is a list of phones, and the phones on each line
# are assumed to correspond to the same "base phone", i.e. they will be
# different stress or tone variations of the same basic phone.
# The file "optional_silence.txt" contains just a single phone (typically SIL)
# which is used for optional silence in the lexicon.
# extra_questions.txt might be empty; typically will consist of lists of phones,
# all members of each list with the same stress or tone; and also possibly a
# list for the silence phones.  This will augment the automatically generated
# questions (note: the automatically generated ones will treat all the
# stress/tone versions of a phone the same, so will not "get to ask" about
# stress or tone).
#這個腳本準備一個目錄例如 data/lang
#假定一個原文件夾包含一個詞典 lexicon.txt,每行內容格式: 詞 音素 音素 ...
#或者一個包含詞出現機率的詞典    lexiconp.txt,每行內容格式: 詞 機率 音素 音素 ...
#注意,若是lexiconp.txt存在,咱們會使用這個機率詞典,即使lexicon.txt存在.
#silence_phones.txt  靜音音素(各類噪聲、笑聲、咳嗽、有聲停頓); nonsilence_phones.txt 非靜音音素(正常音素); 
#上述這些文件的每行表明一組相同的base phone,包含各類不一樣的重音或者聲調。

#optional_silence.txt 僅僅包含一個單獨的音素,典型的是SIL;
#extra_questions.txt 多是空的,典型的是它包含一些音素以及每一個音素對應的相同的重音或者音調和靜音音速列表.
#它能夠增長自動生成問題的數量(注意:自動產生問題對一個音素的全部變體都同等對待)

# This script adds word-position-dependent phones and constructs a host of other
# derived files, that go in data/lang/.


# Begin configuration section.
# Number of states in silence models, number of states in non-silence models,
# and whether to use word-position-dependent phones.
num_sil_states=5
num_nonsil_states=3
position_dependent_phones=true
# position_dependent_phones is false also when position dependent phones and word_boundary.txt
# have been generated by another source
# When true, the lexicon phones get _B/_E/_I/_S (begin/end/internal/singleton)
# suffixes according to their position in the word, and extra questions about
# these positions are appended to phones/extra_questions.txt.


share_silence_phones=false  # if true, then share pdfs of different silence
                            # phones together.
# When true, all silence phones are written on a single line of roots.txt,
# marked "not-shared not-split"; otherwise every phone set gets its own
# "shared split" line.
# NOTE(review): per the original annotations, phones on the same roots.txt line
# share pdfs while their transition probabilities stay separate, and
# split/not-split decides whether the tree root may be split by questions --
# confirm against the Kaldi docs.


sil_prob=0.5

# Optional text-form FST modelling the unknown word at phone level.
unk_fst=        # if you want to model the unknown-word (<oov-dict-entry>)
                # with a phone-level LM as created by make_unk_lm.sh,
                # provide the text-form FST via this flag, e.g. <work-dir>/unk_fst.txt
                # where <work-dir> was the 2nd argument of make_unk_lm.sh.

# Optional existing phone symbol table (phones.txt: phone, integer id).
phone_symbol_table=              # if set, use a specified phones.txt file.

# Optional file with additional word-level disambiguation symbols.
extra_word_disambig_syms=        # if set, add disambiguation symbols from this file (one per line)
                                 # to phones/disambig.txt, phones/wdisambig.txt and words.txt

# Number of phone-level disambiguation symbols added on top of those counted
# by add_lex_disambig.pl.
num_extra_phone_disambig_syms=1 # Standard one phone disambiguation symbol is used for optional silence.
                                # Increasing this number does not harm, but is only useful if you later
                                # want to introduce this labels to L_disambig.fst
# end configuration sections

echo "$0 $@"  # Print the command line for logging

# Let parse_options.sh consume the --option arguments.
. utils/parse_options.sh

# Exactly four positional arguments must remain; otherwise print the usage
# text and exit with status 1.
if [ $# -ne 4 ]; then
  echo "usage: utils/prepare_lang.sh <dict-src-dir> <oov-dict-entry> <tmp-dir> <lang-dir>"
  echo "e.g.: utils/prepare_lang.sh data/local/dict <SPOKEN_NOISE> data/local/lang data/lang"
  echo "<dict-src-dir> should contain the following files:"
  echo " extra_questions.txt  lexicon.txt nonsilence_phones.txt  optional_silence.txt  silence_phones.txt"
  echo "See http://kaldi-asr.org/doc/data_prep.html#data_prep_lang_creating for more info."
  echo "options: "
  echo "     --num-sil-states <number of states>             # default: 5, #states in silence models."
  echo "     --num-nonsil-states <number of states>          # default: 3, #states in non-silence models."
  echo "     --position-dependent-phones (true|false)        # default: true; if true, use _B, _E, _S & _I"
  echo "                                                     # markers on phones to indicate word-internal positions. "
  # The option shares the pdfs of the *silence* phones (see the
  # share_silence_phones default and the roots.txt generation); the help text
  # used to say "non-silence".
  echo "     --share-silence-phones (true|false)             # default: false; if true, share pdfs of "
  echo "                                                     # all silence phones. "
  echo "     --sil-prob <probability of silence>             # default: 0.5 [must have 0 <= silprob < 1]"
  echo "     --phone-symbol-table <filename>                 # default: \"\"; if not empty, use the provided "
  echo "                                                     # phones.txt as phone symbol table. This is useful "
  echo "                                                     # if you use a new dictionary for the existing setup."
  echo "     --unk-fst <text-fst>                            # default: none.  e.g. exp/make_unk_lm/unk_fst.txt."
  echo "                                                     # This is for if you want to model the unknown word"
  echo "                                                     # via a phone-level LM rather than a special phone"
  echo "                                                     # (this should be more useful for test-time than train-time)."
  echo "     --extra-word-disambig-syms <filename>           # default: \"\"; if not empty, add disambiguation symbols"
  echo "                                                     # from this file (one per line) to phones/disambig.txt,"
  echo "                                                     # phones/wdisambig.txt and words.txt"
  exit 1;
fi

# Positional arguments:
#   srcdir   - dictionary source directory (e.g. data/local/dict)
#   oov_word - lexicon entry that OOV words are mapped to
#   tmpdir   - directory for intermediate files (e.g. data/local/lang)
#   dir      - output lang directory (e.g. data/lang)
srcdir=$1
oov_word=$2
tmpdir=$3
dir=$4
mkdir -p $dir $tmpdir $dir/phones

# Switch to the silence-probability lexicon when the dict dir provides one.
silprob=false
[ -f $srcdir/lexiconp_silprob.txt ] && silprob=true

[ -f path.sh ] && . ./path.sh

# First validation of the dictionary directory.
! utils/validate_dict_dir.pl $srcdir && \
  echo "*Error validating directory $srcdir*" && exit 1;

# Derive whichever of lexicon.txt / lexiconp.txt is missing from the other.
if [[ ! -f $srcdir/lexicon.txt ]]; then
  # Both files live in $srcdir (not $dir), so report those paths.
  echo "**Creating $srcdir/lexicon.txt from $srcdir/lexiconp.txt"
  # Drop the second field (the pronunciation probability).
  perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' < $srcdir/lexiconp.txt > $srcdir/lexicon.txt || exit 1;
fi
if [[ ! -f $srcdir/lexiconp.txt ]]; then
  echo "**Creating $srcdir/lexiconp.txt from $srcdir/lexicon.txt"
  # Insert a probability of 1.0 after the word.
  perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $srcdir/lexiconp.txt || exit 1;
fi

# --unk-fst, when given, must name an existing file.
if [ ! -z "$unk_fst" ] && [ ! -f "$unk_fst" ]; then
  echo "$0: expected --unk-fst $unk_fst to exist as a file"
  exit 1
fi

# Validate again now that the derived lexicon files exist; the output is
# only shown when the check fails.
if ! utils/validate_dict_dir.pl $srcdir >&/dev/null; then
  utils/validate_dict_dir.pl $srcdir  # show the output.
  echo "Validation failed (second time)"
  exit 1;
fi

# phones.txt file provided, we will do some sanity check here.
# ($phone_symbol_table is the user-supplied phones.txt; it is checked against
# silence_phones.txt and nonsilence_phones.txt.)
if [[ ! -z $phone_symbol_table ]]; then
  # Checks if we have position dependent phones
  # n1 = number of distinct symbols (lines matching ^#[0-9]+$ are excluded);
  # n2 = the same count after stripping a trailing _B/_I/_E/_S.
  # n1 == n2 is taken to mean that no position-dependent phones are present.
  n1=`cat $phone_symbol_table | grep -v -E "^#[0-9]+$" | cut -d' ' -f1 | sort -u | wc -l`
  n2=`cat $phone_symbol_table | grep -v -E "^#[0-9]+$" | cut -d' ' -f1 | sed 's/_[BIES]$//g' | sort -u | wc -l`
  $position_dependent_phones && [ $n1 -eq $n2 ] &&\
    echo "$0: Position dependent phones requested, but not in provided phone symbols" && exit 1;
  ! $position_dependent_phones && [ $n1 -ne $n2 ] &&\
      echo "$0: Position dependent phones not requested, but appear in the provided phones.txt" && exit 1;

  # Checks if the phone sets match.
  # Every phone listed in {,non}silence_phones.txt must occur in the symbol
  # table once position suffixes have been stripped from the table entries.
  cat $srcdir/{,non}silence_phones.txt | awk -v f=$phone_symbol_table '
  BEGIN { while ((getline < f) > 0) { sub(/_[BEIS]$/, "", $1); phones[$1] = 1; }}
  { for (x = 1; x <= NF; ++x) { if (!($x in phones)) {
      print "Phone appears in the lexicon but not in the provided phones.txt: "$x; exit 1; }}}' || exit 1;
fi

# In case there are extra word-level disambiguation symbols we need
# to make sure that all symbols in the provided file are valid.
if [ ! -z "$extra_word_disambig_syms" ]; then
  if ! utils/lang/validate_disambig_sym_file.pl --allow-numeric "false" $extra_word_disambig_syms; then
    echo "$0: Validation of disambiguation file \"$extra_word_disambig_syms\" failed."
    exit 1;
  fi
fi

# Build the working lexicon in $tmpdir and the phone map.
# With position-dependent phones the lexicon phones get _B, _E, _S, _I
# suffixes and phone_map.txt lists every variant of each phone, e.g.
#   AA AA_B AA_E AA_I AA_S
# Without them the lexicon is copied and every phone maps to itself.
if $position_dependent_phones; then
  # Create $tmpdir/lexiconp.txt from $srcdir/lexiconp.txt (or
  # $tmpdir/lexiconp_silprob.txt from $srcdir/lexiconp_silprob.txt) by
  # adding the markers _B, _E, _S, _I depending on word position.
  # In this recipe, these markers apply to silence also.
  # Do this starting from lexiconp.txt only.
  if "$silprob"; then
    # Abort when perl fails (it dies on an entry without phones), exactly as
    # the non-silprob branch below does; otherwise a truncated lexicon would
    # silently be used.
    perl -ane '@A=split(" ",$_); $w = shift @A; $p = shift @A; $silword_p = shift @A;
              $wordsil_f = shift @A; $wordnonsil_f = shift @A; @A>0||die;
         if(@A==1) { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_S\n"; }
         else { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_B ";
         for($n=1;$n<@A-1;$n++) { print "$A[$n]_I "; } print "$A[$n]_E\n"; } ' \
                < $srcdir/lexiconp_silprob.txt > $tmpdir/lexiconp_silprob.txt || exit 1;
  else
    perl -ane '@A=split(" ",$_); $w = shift @A; $p = shift @A; @A>0||die;
         if(@A==1) { print "$w $p $A[0]_S\n"; } else { print "$w $p $A[0]_B ";
         for($n=1;$n<@A-1;$n++) { print "$A[$n]_I "; } print "$A[$n]_E\n"; } ' \
         < $srcdir/lexiconp.txt > $tmpdir/lexiconp.txt || exit 1;
  fi

  # create $tmpdir/phone_map.txt
  # this has the format (on each line)
  # <original phone> <version 1 of original phone> <version 2> ...
  # where the versions depend on the position of the phone within a word.
  # For instance, we'd have:
  # AA AA_B AA_E AA_I AA_S
  # for (B)egin, (E)nd, (I)nternal and (S)ingleton
  # and in the case of silence
  # SIL SIL SIL_B SIL_E SIL_I SIL_S
  # [because SIL on its own is one of the variants; this is for when it doesn't
  #  occur inside a word but as an option in the lexicon.]

  # This phone map expands the phone lists into all the word-position-dependent
  # versions of the phone lists.
  # (For silence the empty suffix is listed twice: once for the original-phone
  # column and once for the bare variant.  set -f disables globbing while the
  # phone list is word-split.)
  cat <(set -f; for x in `cat $srcdir/silence_phones.txt`; do for y in "" "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \
    <(set -f; for x in `cat $srcdir/nonsilence_phones.txt`; do for y in "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \
    > $tmpdir/phone_map.txt
else
  if "$silprob"; then
    cp $srcdir/lexiconp_silprob.txt $tmpdir/lexiconp_silprob.txt
  else
    cp $srcdir/lexiconp.txt $tmpdir/lexiconp.txt
  fi

  # Identity map: one phone per line, pasted next to itself.
  cat $srcdir/silence_phones.txt $srcdir/nonsilence_phones.txt | \
    awk '{for(n=1;n<=NF;n++) print $n; }' > $tmpdir/phones
  paste -d' ' $tmpdir/phones $tmpdir/phones > $tmpdir/phone_map.txt
fi

# Directory holding the various phone lists.
mkdir -p $dir/phones

# sets.txt / roots.txt: sets of phones for use in clustering and for making
# monophone systems.
if $share_silence_phones; then
  # Force all silence phones to share the same pdfs: they are joined onto one
  # line, which becomes the first line of sets.txt and is the only root marked
  # "not-shared not-split" (separate roots for the states, never split, so
  # every phone on the line corresponds to the same model).  Sharing across
  # phones comes from putting several phones on one line of roots.txt;
  # shared/not-shared does not affect that.
  awk '{ printf("%s ", $0) } END { printf("\n") }' < $srcdir/silence_phones.txt \
    | cat - $srcdir/nonsilence_phones.txt \
    | utils/apply_map.pl $tmpdir/phone_map.txt > $dir/phones/sets.txt
  awk 'NR == 1 { print "not-shared", "not-split", $0; next }
       { print "shared", "split", $0 }' < $dir/phones/sets.txt > $dir/phones/roots.txt
else
  # Different silence phones will have different GMMs.  Here "shared split"
  # only means we may have one GMM for all the states or split on states; as
  # these are context-independent phones, they don't see the context.
  cat $srcdir/{,non}silence_phones.txt \
    | utils/apply_map.pl $tmpdir/phone_map.txt > $dir/phones/sets.txt
  awk '{ print "shared", "split", $0 }' < $dir/phones/sets.txt > $dir/phones/roots.txt
fi

# Mapped silence / non-silence phone lists, one phone per line.
utils/apply_map.pl $tmpdir/phone_map.txt < $srcdir/silence_phones.txt \
  | awk '{ for (i = 1; i <= NF; i++) print $i }' > $dir/phones/silence.txt
utils/apply_map.pl $tmpdir/phone_map.txt < $srcdir/nonsilence_phones.txt \
  | awk '{ for (i = 1; i <= NF; i++) print $i }' > $dir/phones/nonsilence.txt

# The optional-silence phone is copied as is; the context-independent list is
# a copy of the silence list.
cp $srcdir/optional_silence.txt $dir/phones/optional_silence.txt
cp $dir/phones/silence.txt $dir/phones/context_indep.txt

# if extra_questions.txt is empty, it's OK.
# (A missing file is tolerated too: the error output of cat is discarded.)
cat $srcdir/extra_questions.txt 2>/dev/null | utils/apply_map.pl $tmpdir/phone_map.txt \
  >$dir/phones/extra_questions.txt

# Want extra questions about the word-start/word-end stuff. Make it separate for
# silence and non-silence. Probably doesn't matter, as silence will rarely
# be inside a word.
# Each appended line lists every phone with one particular suffix; the silence
# phones additionally get a line without any suffix.  set -f (inside the
# subshell) disables globbing while the phone list is word-split.
if $position_dependent_phones; then
  for suffix in _B _E _I _S; do
    (set -f; for x in `cat $srcdir/nonsilence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt
  done
  for suffix in "" _B _E _I _S; do
    (set -f; for x in `cat $srcdir/silence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt
  done
fi

# add_lex_disambig.pl adds disambiguation symbols to the lexicon and reports
# how many it used.  When --unk-fst was provided, the pronunciation of the
# unknown word is first changed to the sequence "#1 #2 #3" and those symbols
# are reserved for that purpose.  The #2 is replaced later by the actual unk
# model; #1 and #3 are there for disambiguation and to keep the FST compact.
# Without #1 there might be a separate copy of the unk-model FST (or of some
# of its arcs) for each start-state from which an <unk> transition comes,
# instead of one per end-state; #3 avoids a potential second copy caused by
# the optional silence [the last phone of any word gets 2 arcs].

unk_opt=
if [ -n "$unk_fst" ]; then  # the --unk-fst option was provided
  if "$silprob"; then
    utils/lang/internal/modify_unk_pron.py $tmpdir/lexiconp_silprob.txt "$oov_word" || exit 1
  else
    utils/lang/internal/modify_unk_pron.py $tmpdir/lexiconp.txt "$oov_word" || exit 1
  fi
  unk_opt="--first-allowed-disambig 4"
fi

# The script prints the number of disambiguation symbols it used.
if "$silprob"; then
  ndisambig=$(utils/add_lex_disambig.pl $unk_opt --pron-probs --sil-probs \
    $tmpdir/lexiconp_silprob.txt $tmpdir/lexiconp_silprob_disambig.txt)
else
  ndisambig=$(utils/add_lex_disambig.pl $unk_opt --pron-probs \
    $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt)
fi
# Add (at least) one disambig symbol for silence in the lexicon FST, and store
# the total in $tmpdir/lex_ndisambig.
ndisambig=$(( $ndisambig + $num_extra_phone_disambig_syms ))
echo $ndisambig > $tmpdir/lex_ndisambig

# Format of lexiconp_disambig.txt:
# !SIL    1.0   SIL_S
# <SPOKEN_NOISE>    1.0   SPN_S #1
# <UNK>    1.0  SPN_S #2
# <NOISE>    1.0  NSN_S
# !EXCLAMATION-POINT    1.0  EH2_B K_I S_I K_I L_I AH0_I M_I EY1_I SH_I AH0_I N_I P_I OY2_I N_I T_E

# Phone-level disambiguation symbols #0 .. #$ndisambig, one per line.
( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) >$dir/phones/disambig.txt

# In case there are extra word-level disambiguation symbols they also
# need to be added to the list of phone-level disambiguation symbols.
if [ ! -z "$extra_word_disambig_syms" ]; then
  # We expect a file containing valid word-level disambiguation symbols.
  cat $extra_word_disambig_syms | awk '{ print $1 }' >> $dir/phones/disambig.txt
fi

# Create phone symbol table.
if [[ ! -z $phone_symbol_table ]]; then
  # Reuse the ids of the provided table for <eps> and the phones (sorted by
  # id); the disambiguation symbols are numbered consecutively, starting at
  # the id that the provided table gives to #0.
  start_symbol=`grep \#0 $phone_symbol_table | awk '{print $2}'`
  echo "<eps>" | cat - $dir/phones/{silence,nonsilence}.txt | awk -v f=$phone_symbol_table '
  BEGIN { while ((getline < f) > 0) { phones[$1] = $2; }} { print $1" "phones[$1]; }' | sort -k2 -g |\
    cat - <(cat $dir/phones/disambig.txt | awk -v x=$start_symbol '{n=x+NR-1; print $1, n;}') > $dir/phones.txt
else
  # Number <eps>, the silence phones, the non-silence phones and the
  # disambiguation symbols consecutively from 0.
  echo "<eps>" | cat - $dir/phones/{silence,nonsilence,disambig}.txt | \
    awk '{n=NR-1; print $1, n;}' > $dir/phones.txt
fi

# Create a file that describes the word-boundary information for
# each phone.  5 categories.
# The category follows from the phone's suffix: _I internal, _B begin,
# _S singleton, _E end; any other phone is "nonword".
if $position_dependent_phones; then
  cat $dir/phones/{silence,nonsilence}.txt | \
    awk '/_I$/{print $1, "internal"; next;} /_B$/{print $1, "begin"; next; }
         /_S$/{print $1, "singleton"; next;} /_E$/{print $1, "end"; next; }
         {print $1, "nonword";} ' > $dir/phones/word_boundary.txt
else
  # word_boundary.txt might have been generated by another source
  [ -f $srcdir/word_boundary.txt ] && cp $srcdir/word_boundary.txt $dir/phones/word_boundary.txt
fi

# Create word symbol table.
# <s> and </s> are only needed due to the need to rescore lattices with
# ConstArpaLm format language model. They do not normally appear in G.fst or
# L.fst.

if "$silprob"; then
  # remove the silprob
  # (fields 3, 4 and 5 are dropped, which yields a plain lexiconp.txt)
  cat $tmpdir/lexiconp_silprob.txt |\
    awk '{
      for(i=1; i<=NF; i++) {
        if(i!=3 && i!=4 && i!=5) printf("%s\t", $i); if(i==NF) print "";
      }
    }' > $tmpdir/lexiconp.txt
fi

# Ids: <eps> is 0, the sorted unique words get their line number (1..N), then
# #0, <s> and </s> get N+1, N+2 and N+3.  The script aborts if <s> or </s>
# occurs as a word in the lexicon.
cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq  | awk '
  BEGIN {
    print "<eps> 0";
  }
  {
    if ($1 == "<s>") {
      print "<s> is in the vocabulary!" | "cat 1>&2"
      exit 1;
    }
    if ($1 == "</s>") {
      print "</s> is in the vocabulary!" | "cat 1>&2"
      exit 1;
    }
    printf("%s %d\n", $1, NR);
  }
  END {
    printf("#0 %d\n", NR+1);
    printf("<s> %d\n", NR+2);
    printf("</s> %d\n", NR+3);
  }' > $dir/words.txt || exit 1;


#
# In case there are extra word-level disambiguation symbols they also
# need to be added to words.txt
if [ ! -z "$extra_word_disambig_syms" ]; then
  # Since words.txt already exists, we need to extract the current word count.
  # (taken from the id on the last line of words.txt)
  word_count=`tail -n 1 $dir/words.txt | awk '{ print $2 }'`

  # We expect a file containing valid word-level disambiguation symbols.
  # The list of symbols is attached to the current words.txt (including
  # a numeric identifier for each symbol).
  cat $extra_word_disambig_syms | \
    awk -v WC=$word_count '{ printf("%s %d\n", $1, ++WC); }' >> $dir/words.txt || exit 1;
fi

# format of $dir/words.txt:
#<eps> 0
#!EXCLAMATION-POINT 1
#!SIL 2
#"CLOSE-QUOTE 3
#...

# The optional-silence phone is mandatory; abort when the file is empty.
silphone=`cat $srcdir/optional_silence.txt` || exit 1;
[ -z "$silphone" ] && \
  ( echo "You have no optional-silence phone; it is required in the current scripts"
    echo "but you may use the option --sil-prob 0.0 to stop it being used." ) && \
   exit 1;

# create $dir/phones/align_lexicon.{txt,int}.
# This is the method we use for lattice word alignment if we are not
# using word-position-dependent phones.

# First remove pron-probs from the lexicon.
perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' <$tmpdir/lexiconp.txt >$tmpdir/align_lexicon.txt

# Note: here, $silphone will have no suffix e.g. _S because it occurs as optional-silence,
# and is not part of a word.
[ ! -z "$silphone" ] && echo "<eps> $silphone" >> $tmpdir/align_lexicon.txt

# Repeat the word as an extra first column, then sort and remove duplicates.
cat $tmpdir/align_lexicon.txt | \
 perl -ane '@A = split; print $A[0], " ", join(" ", @A), "\n";' | sort | uniq > $dir/phones/align_lexicon.txt


# create phones/align_lexicon.int
# Fields 3 onwards are mapped with phones.txt, fields 1-2 with words.txt.
cat $dir/phones/align_lexicon.txt | utils/sym2int.pl -f 3- $dir/phones.txt | \
  utils/sym2int.pl -f 1-2 $dir/words.txt > $dir/phones/align_lexicon.int

# Create the basic L.fst without disambiguation symbols, for use
# in training.
# The text FST produced by the make_lexicon_fst*.pl script is compiled with
# phones.txt as input symbols and words.txt as output symbols, then arc-sorted
# on the output label.
if $silprob; then
  # Add silence probabilities (modlels the prob. of silence before and after each
  # word).  On some setups this helps a bit.  See utils/dict_dir_add_pronprobs.sh
  # and where it's called in the example scripts (run.sh).
  utils/make_lexicon_fst_silprob.pl $tmpdir/lexiconp_silprob.txt $srcdir/silprob.txt $silphone "<eps>" | \
     fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
     --keep_isymbols=false --keep_osymbols=false |   \
     fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;
else
  utils/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp.txt $sil_prob $silphone | \
    fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
    --keep_isymbols=false --keep_osymbols=false | \
     fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;
fi

# The file oov.txt contains a word that we will map any OOVs to during
# training.
echo "$oov_word" > $dir/oov.txt || exit 1;
cat $dir/oov.txt | utils/sym2int.pl $dir/words.txt >$dir/oov.int || exit 1;
# integer version of oov symbol, used in some scripts.
# integer version of oov symbol, used in some scripts.


# the file wdisambig.txt contains a (line-by-line) list of the text-form of the
# disambiguation symbols that are used in the grammar and passed through by the
# lexicon.  At this stage it's hardcoded as '#0', but we're laying the groundwork
# for more generality (which probably would be added by another script).
# wdisambig_words.int contains the corresponding list interpreted by the
# symbol table words.txt, and wdisambig_phones.int contains the corresponding
# list interpreted by the symbol table phones.txt.
echo '#0' >$dir/phones/wdisambig.txt || exit 1;

# In case there are extra word-level disambiguation symbols they need
# to be added to the existing word-level disambiguation symbols file.
if [ -n "$extra_word_disambig_syms" ]; then
  # We expect a file containing valid word-level disambiguation symbols.
  # Only the first field of each line is kept, and blank lines are skipped
  # (NF > 0) so that no empty entry ends up in the symbol list.
  awk 'NF > 0 { print $1 }' $extra_word_disambig_syms >> $dir/phones/wdisambig.txt || exit 1;
fi

# Integer versions of the list, once per symbol table.  Stop if a symbol
# cannot be mapped rather than leaving an incomplete .int file behind.
utils/sym2int.pl $dir/phones.txt <$dir/phones/wdisambig.txt >$dir/phones/wdisambig_phones.int || exit 1;
utils/sym2int.pl $dir/words.txt <$dir/phones/wdisambig.txt >$dir/phones/wdisambig_words.int || exit 1;

# Create these lists of phones in integer form (.int), and in colon-separated
# integer list form (.csl) too, for purposes of being given to programs as
# command-line options.
for f in silence nonsilence optional_silence disambig context_indep; do
  # Map the symbols once and stop if that fails; the .csl file is then
  # derived from the .int file just written instead of mapping a second time.
  utils/sym2int.pl $dir/phones.txt <$dir/phones/$f.txt >$dir/phones/$f.int || exit 1;
  # Prefix every id with ':' on a single line, then strip the leading ':'.
  awk '{printf(":%d", $1);} END{printf "\n"}' $dir/phones/$f.int | \
   sed s/:// > $dir/phones/$f.csl || exit 1;
done

# Integer versions of the phone sets and the extra questions.
for x in sets extra_questions; do
  utils/sym2int.pl $dir/phones.txt <$dir/phones/$x.txt > $dir/phones/$x.int || exit 1;
done

# In roots.txt only fields 3 onward are mapped through phones.txt.
utils/sym2int.pl -f 3- $dir/phones.txt <$dir/phones/roots.txt \
   > $dir/phones/roots.int || exit 1;

# word_boundary.txt is optional; when present, its first field is mapped.
if [ -f $dir/phones/word_boundary.txt ]; then
  utils/sym2int.pl -f 1 $dir/phones.txt <$dir/phones/word_boundary.txt \
    > $dir/phones/word_boundary.int || exit 1;
fi

# Read back the colon-separated integer lists of the non-silence and silence
# phones; they are passed to gen_topo.pl below.
nonsilphonelist=$(cat $dir/phones/nonsilence.csl)
silphonelist=$(cat $dir/phones/silence.csl)

# Note: it's OK, after generating the 'lang' directory, to overwrite the topo file
# with another one of your choice if the 'topo' file you want can't be generated by
# utils/gen_topo.pl.  We do this in the 'chain' recipes.  Of course, the 'topo' file
# should cover all the phones.  Try running utils/validate_lang.pl to check that
# everything is OK after modifying the topo file.
utils/gen_topo.pl $num_nonsil_states $num_sil_states $nonsilphonelist $silphonelist >$dir/topo


# Create the lexicon FST with disambiguation symbols and write it to
# $dir/L_disambig.fst.  There is an extra step where we create a loop to
# "pass through" the disambiguation symbols from G.fst.
#
# Stages of the pipeline:
#   * the lexicon generator (the silprob variant when $silprob is true,
#     otherwise make_lexicon_fst.pl --pron-probs) emits the text-form FST,
#     with '#'$ndisambig passed as its last argument;
#   * fstcompile turns the text description of the FST into binary form;
#   * fstaddselfloops adds the self-loops for the symbols listed in
#     wdisambig_phones.int / wdisambig_words.int;
#   * fstarcsort sorts the arcs, here on the output label.
# (fstdraw, which is not used here, can render a binary FST as a graph.)
if $silprob; then
  utils/make_lexicon_fst_silprob.pl $tmpdir/lexiconp_silprob_disambig.txt $srcdir/silprob.txt $silphone '#'$ndisambig
else
  utils/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt $sil_prob $silphone '#'$ndisambig
fi \
  | fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
      --keep_isymbols=false --keep_osymbols=false \
  | fstaddselfloops $dir/phones/wdisambig_phones.int $dir/phones/wdisambig_words.int \
  | fstarcsort --sort_type=olabel >$dir/L_disambig.fst || exit 1;


# If $unk_fst is non-empty (the warning below refers to this as the --unk-lm
# option), run apply_unk_lm.sh on the lang directory and stop if it fails.
if [ -n "$unk_fst" ]; then
  utils/lang/internal/apply_unk_lm.sh $unk_fst $dir || exit 1

  # Combining this with position-independent phones is allowed, but the user
  # is warned and given a few seconds to read the message.
  if ! $position_dependent_phones; then
    printf '%s\n' \
      "$0: warning: you are using the --unk-lm option and setting --position-dependent-phones false." \
      " ... this will make it impossible to properly work out the word boundaries after" \
      " ... decoding; quite a few scripts will not work as a result, and many scoring scripts" \
      " ... will die."
    sleep 4
  fi
fi

# Final sanity check of the generated lang directory.  The failure branch is
# a plain if-statement so that the non-zero exit does not depend on the
# preceding echo succeeding (with "a && echo && exit 1", a failed echo would
# let the script fall through to "exit 0").
echo "$(basename $0): validating output directory"
if ! utils/validate_lang.pl $dir; then
  echo "$(basename $0): error validating output"
  exit 1;
fi

exit 0;

 

run.sh

#!/bin/bash

# Job-launcher commands; they are handed to the training and decoding steps
# below through their --cmd option.
train_cmd="utils/run.pl"
decode_cmd="utils/run.pl"

# The data is already available locally, so the download step is disabled.
#if [ ! -d waves_yesno ]; then
#  wget http://www.openslr.org/resources/1/waves_yesno.tar.gz || exit 1;
  # was:
  # wget http://sourceforge.net/projects/kaldi/files/waves_yesno.tar.gz || exit 1;
#  tar -xvzf waves_yesno.tar.gz || exit 1;
#fi


# Names of the training and test data sets.
# NOTE(review): neither variable is referenced in the rest of this script,
# which spells the names out literally -- confirm whether they are still needed.
train_yesno=waves_train
test_base_name=waves_test

# Start from scratch: remove the data, exp and mfcc directories.
rm -rf data exp mfcc

# Data preparation
# The local/ scripts below have to be rewritten for this data set:
#   - prepare_data.sh: the structure of the directories and the file names
#     are different;
#   - prepare_dict.sh: the dictionary contains 10 words, not 2.
# Each step depends on the previous one, so stop at the first failure instead
# of carrying on and reporting success.
local/prepare_data.sh Nestle || exit 1;
local/prepare_dict.sh || exit 1;
# Build the lang directory data/lang from the dictionary in data/local/dict
# (data/local/lang is passed as the third positional argument).
utils/prepare_lang.sh --position-dependent-phones false  --sil-prob 0.8  --share-silence-phones true  --num-sil-states 5 --num-nonsil-states 4 data/local/dict "<SIL>" data/local/lang data/lang || exit 1;
# Language-model step (the original note describes it as testing and
# diagnosing the language model).
local/prepare_lm.sh || exit 1;
echo "Data Prepraration finish!"

# Feature extraction
# For each data set: run make_mfcc.sh (8 jobs), then compute_cmvn_stats.sh,
# then fix_data_dir.sh on the data directory.  Stop as soon as one step
# fails, so that "finish!" is only printed after every step has succeeded.
for x in waves_test waves_train; do
  steps/make_mfcc.sh --nj 8 data/$x exp/make_mfcc/$x mfcc || exit 1;
  steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x mfcc || exit 1;
  utils/fix_data_dir.sh data/$x || exit 1;
done
echo "Feature extraction finish!"

# Mono training
# Monophone training on data/waves_train with 8 parallel jobs and a target
# of 800 Gaussians in total; the output goes to exp/mono0.  Stop if training
# fails, since the following steps use exp/mono0.
steps/train_mono.sh --nj 8 --cmd "$train_cmd" \
  --totgauss  800 \
  data/waves_train data/lang exp/mono0 || exit 1;
echo "Mono training finish!"

  
# Graph compilation: combine the language-model FST with the acoustic model
# to build the complete recognition network.  Stop if this fails, because
# decoding uses exp/mono0/graph_tgpr.
utils/mkgraph.sh data/lang_test_tg exp/mono0 exp/mono0/graph_tgpr || exit 1;
echo "Graph compilation finish!"


# Decoding: decode data/waves_test with the compiled graph (a single job).
steps/decode.sh --nj 1 --cmd "$decode_cmd" \
  exp/mono0/graph_tgpr data/waves_test exp/mono0/decode_waves_test

# For every decode directory under exp/ (e.g. exp/mono0/decode_waves_test),
# print its name and let utils/best_wer.sh select the best result among the
# WER lines of its wer_* files.
for decode_dir in exp/*/decode*; do
  [ -d $decode_dir ] && echo $decode_dir \
    && grep WER $decode_dir/wer_* | utils/best_wer.sh
done
相關文章
相關標籤/搜索