目錄
廢話少說,轉換後的數據格式和thchs30相同,如下:
A11_0 lv4 shi4 yang2 chun1 yan1 jing3 da4 kuai4 wen2 zhang1 de5 di3 se4 si4 yue4 de5 lin2 luan2 geng4 shi4 lv4 de5 xian1 huo2 xiu4 mei4 shi1 yi4 ang4 ran2
A11_0 data_thchs30/train/A11_0.wav
BAC009S0002W0122 er2 dui4 lou2 shi4 cheng2 jiao1 yi4 zhi4 zuo4 yong4 zui4 da4 de5 xian4 gou4
BAC009S0002W0122 data_aishell/wav/train/S0002/BAC009S0002W0122.wav
一言不合,直接粘貼。
該腳本和生成的數據也可以從我的GitHub上下載:代碼地址
# -----------------------------------------------------------------------------------------------------
'''
&usage:     Process the AISHELL data: convert the Chinese-character transcripts
            to pinyin and generate files in the THCHS-30 data layout.
@author:    hongwen sun
'''
# -----------------------------------------------------------------------------------------------------
import os
import re

import numpy as np

# Matches any digit; used to detect syllables that already carry a tone number.
_DIGIT_RE = re.compile(r'\d')


# -----------------------------------------------------------------------------------------------------
# usage:  convert the AISHELL Chinese-character labels to pinyin
# env:    pip install pypinyin
# -----------------------------------------------------------------------------------------------------
def trans_aishell_to_pinyin(word_path, pinyin_path):
    """Convert a transcript file to tone-numbered pinyin, line by line.

    Every input line is split on single spaces and handed to
    ``pypinyin.pinyin`` with ``Style.TONE3``.  Each resulting syllable that
    contains no digit gets ``5`` appended (e.g. ``de`` -> ``de5``); syllables
    that already contain a digit are written unchanged.  The syllables of a
    line are joined with single spaces and written as one output line.

    Args:
        word_path: Path of the UTF-8 text file to read.
        pinyin_path: Path of the UTF-8 text file to write; an existing file
            is overwritten.
    """
    # Imported here so the other helpers in this file can be used on a
    # machine where pypinyin is not installed.
    from pypinyin import Style, pinyin

    with open(word_path, 'r', encoding='UTF-8') as textobj, \
            open(pinyin_path, 'w', encoding='UTF-8') as savefile:
        for line in textobj:
            tokens = line.strip('\n').split(' ')
            syllables = []
            for candidates in pinyin(tokens, style=Style.TONE3):
                syllable = " ".join(candidates)
                if not _DIGIT_RE.search(syllable):
                    # No tone digit present: tag the syllable with tone 5.
                    syllable += '5'
                syllables.append(syllable)
            savefile.write(" ".join(syllables) + "\n")


# -----------------------------------------------------------------------------------------------------
# usage:  generate the audio file lists for train, dev and test
# -----------------------------------------------------------------------------------------------------
def gen_wavlist(wavpath, savefile):
    """Write one ``<file id> <file path>`` line per ``.wav`` file under *wavpath*.

    The directory tree is traversed with ``os.walk``.  The file id is the
    file name without its extension; the path is the walked directory and
    the file name joined with ``os.sep``.

    Args:
        wavpath: Root directory to search recursively.
        savefile: Path of the UTF-8 list file to write; an existing file is
            overwritten.
    """
    with open(savefile, 'w', encoding='UTF-8') as list_file:
        for dirpath, _dirnames, filenames in os.walk(wavpath):
            for filename in filenames:
                if not filename.endswith('.wav'):
                    continue
                # splitext removes only the extension.  str.strip('.wav')
                # strips the characters '.', 'w', 'a', 'v' from both ends
                # and would turn "data.wav" into "dat".
                fileid = os.path.splitext(filename)[0]
                filepath = os.sep.join([dirpath, filename])
                list_file.write(fileid + ' ' + filepath + '\n')


# -----------------------------------------------------------------------------------------------------
# usage:  generate the label files that correspond to the train, dev and test audio lists
# -----------------------------------------------------------------------------------------------------
def _read_id_map(path):
    """Return ``{id: remainder}`` for a UTF-8 file of ``<id> <remainder>`` lines.

    The id is everything before the first space.  Lines without a space
    (for example blank lines) are skipped.  Later duplicates of an id
    overwrite earlier ones.
    """
    mapping = {}
    with open(path, 'r', encoding='UTF-8') as handle:
        for line in handle:
            key, sep, remainder = line.strip('\n').partition(' ')
            if not sep:
                continue
            mapping[key] = remainder
    return mapping


def gen_label(readfile, writefile, transcript_path='aishell_transcript.txt'):
    """Write the label file for the audio files listed in *readfile*.

    For every line of *readfile* the id (text before the first space) is
    looked up in the transcript; when found, ``<id> <transcript text>`` is
    written to *writefile*.  Ids that are missing from the transcript are
    left out.

    Args:
        readfile: Path of the ``<id> <wav path>`` list file to read.
        writefile: Path of the label file to write; an existing file is
            overwritten.
        transcript_path: Path of the ``<id> <text>`` transcript file.
            Defaults to ``'aishell_transcript.txt'`` in the working directory.
    """
    transcripts = _read_id_map(transcript_path)
    with open(readfile, 'r', encoding='UTF-8') as listobj, \
            open(writefile, 'w', encoding='UTF-8') as labelobj:
        for line in listobj:
            content_id = line.strip('\n').split(' ', 1)[0]
            if content_id in transcripts:
                labelobj.write(content_id + ' ' + transcripts[content_id] + '\n')


# -----------------------------------------------------------------------------------------------------
# usage:  fix the train, dev and test audio lists by removing files that have no label
# -----------------------------------------------------------------------------------------------------
def fix_list(listfile, labelfile):
    """Rewrite *listfile* so that it only contains ids present in *labelfile*.

    The rewritten list follows the line order of *labelfile*.  Label ids
    that have no entry in the list are skipped.

    Args:
        listfile: Path of the ``<id> <wav path>`` list file; rewritten in place.
        labelfile: Path of the ``<id> <text>`` label file to read.
    """
    wav_paths = _read_id_map(listfile)

    # Build the complete output before reopening listfile for writing, so a
    # failure while reading labelfile cannot leave a half-written list.
    kept_lines = []
    with open(labelfile, 'r', encoding='UTF-8') as labelobj:
        for line in labelobj:
            content_id = line.strip('\n').split(' ', 1)[0]
            if content_id in wav_paths:
                kept_lines.append(content_id + ' ' + wav_paths[content_id] + '\n')

    with open(listfile, 'w', encoding='UTF-8') as listobj:
        listobj.writelines(kept_lines)


if __name__ == "__main__":
    # Convert the Chinese-character labels to pinyin labels.
    # Run this script from the directory that contains data_aishell.
    trans_aishell_to_pinyin('E:\\aishell_transcript_v0.8.txt', 'E:\\aishell_transcript1.txt')

    # NOTE(review): gen_label reads 'aishell_transcript.txt' from the working
    # directory by default, which is not the file written above. Confirm that
    # the pinyin transcript is copied there, or pass transcript_path explicitly.

    # Generate the audio file lists for train, dev and test.
    gen_wavlist('data_aishell/wav/train', 'train.wav.lst')
    gen_wavlist('data_aishell/wav/test', 'test.wav.lst')
    gen_wavlist('data_aishell/wav/dev', 'dev.wav.lst')

    # Generate the label files that correspond to the audio lists.
    gen_label('train.wav.lst', 'train.syllable.txt')
    gen_label('test.wav.lst', 'test.syllable.txt')
    gen_label('dev.wav.lst', 'dev.syllable.txt')

    # Fix the audio lists: drop files that are missing from the labels.
    fix_list('train.wav.lst', 'train.syllable.txt')
    fix_list('test.wav.lst', 'test.syllable.txt')
    fix_list('dev.wav.lst', 'dev.syllable.txt')
通過這個方法可以得到和thchs30相同的數據格式,也可以用這個轉換方法將其他的中文數據集都整合成統一格式,這樣就能擴充訓練數據集啦。
當然這只是我的拋磚引玉,希望大家也能有更多的方法將不同的數據集整合起來。
轉載請註明出處:hongwen 的博客