{
"segmenter":"segment_main.opt",
"base_arpa":"data/base_arpa/base.arpa",
"dict":"data/dictionary/mandarin_cut.dict",
"tools": {
"kenlm": {
"train_arpa_params":{
"bin":"lmplz",
"order":3,
"tmp_dir":"data/tmp",
"mem_rate":"20%",
"discount_fallback":"0.5 1 1.5"
}
},
"srilm": {
"mix_arpa_params":{
"bin":"ngram",
"order":3,
"contain_unk":"-unk"
},
"prune_arpa_params":{
"bin":"ngram",
"order":3,
"contain_unk":"-unk",
"renormalize":"-renorm"
},
"eval_ppl_params":{
"order":3,
"contain_unk":"-unk -renorm",
"debug":2
}
},
"opengram": {
"arpa2fst_params":{
"bin":"ngramread",
"epsilon_symbal":"<eps>",
"vocab_symtbl":"data/vocab/vocab_symtbl.txt",
"contain_unk":"-unk"
}
}
},
"pipeline":[
{
"step":1,
"bypass":0,
"dep_list":[],
"operation":"segment",
"description":"segment corpus file to segmented file",
"dict":"",
"file_info":[
{
"input_file":"data/segment/test.cor",
"output_file":"data/segment/test.seg"
}
]
},
{
"step":2,
"bypass":0,
"dep_list":[1],
"operation":"train_arpa",
"description":"train segmented files to arpa file",
"file_info":[
{
"input_files":"data/segment/test.seg",
"output_file":"data/arpa/test.arpa"
}
],
"tool_params": {
"tool":"kenlm",
"params":"train_arpa_params"
}
},
{
"step":3,
"bypass":0,
"dep_list":[2],
"operation":"mix_arpa",
"description":"mix arpa files to one arpa file",
"base_arpa":"data/base_arpa/base.arpa",
"output_file":"data/arpa/mixed.arpa",
"file_info":[
{
"input_files":"data/arpa/test.arpa",
"input_lamda":"0.5"
}
],
"tool":"srilm",
"params":"mix_arpa_params"
},
{
"step":4,
"bypass":0,
"dep_list":[3],
"operation":"prune_arpa",
"description":"prune arpa file smaller by threshold",
"file_info":[
{
"input_file":"data/base_arpa/base.arpa",
"output_file":"data/base_arpa/pruned.arpa"
}
],
"tool":"srilm",
"params":"prune_arpa_params"
},
{
"step":5,
"bypass":0,
"dep_list":[3],
"operation":"arpa_to_fst",
"description":"convert arpa file to fst file",
"file_info":[
{
"input_file":"data/arpa/mixed.arpa",
"output_file":"data/fst/final.fst"
}
],
"tool":"opengram",
"params":"arpa2fst_params"
},
{
"step":6,
"bypass":0,
"dep_list":[5],
"operation":"eval-ppl",
"description":"evaluate arpa file by ppl",
"dev_set":"data/dev_set/test.seg",
"file_info":[
{
"input_file":"data/arpa/mixed.arpa",
"output_file":"data/dev_set/test.seg.ppl"
}
],
"tool":"srilm",
"params":"eval_ppl_params"
}
]
}