Extending the aishell training data
The aishell s5 recipe in kaldi comes with about 178 hours of training data. With the appropriate tweaks you can train a model out of it, but the recognition quality is not that good, so I did some work to extend the training data.
The data format aishell needs
1. Audio
Directories are organized by speaker: all audio files recorded by the same person go into the same directory.
2. Transcripts
aishell's transcripts are simple: they all live in transcript/aishell_transcript_v0.8.txt, one line per utterance, in the format: filename (without extension), a space, then the word-segmented sentence.
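For example, one transcript line looks like this (the utterance id and text here are illustrative):
BAC009S0002W0122 广州 市 房地产 中介 协会 分析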
Other data sources
Both also come from OpenSLR:
Free ST Chinese Mandarin Corpus (SLR38, https://www.openslr.org/38/)
Primewords Chinese Corpus Set 1 (SLR47, https://www.openslr.org/47/)
Preparation
1. Word-segmentation code:
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
#
# Copyright © 2018 xiaominfc <xiaominfc@xiaominfc-MBP.local>
#
# Distributed under terms of the MIT license.

"""
Convert a sentence into words and mark the pinyin (phones).
"""

import os
import os.path


class LexiconDict:
    """Word -> phone-string dictionary loaded from a lexicon file."""

    def __init__(self, path):
        self.dict = {}
        with open(path, 'r') as f:
            for line in f:
                line = line.replace('\n', '')
                words = line.split(' ')
                # first field is the word, the rest is its phone sequence
                self.dict[words[0]] = ' '.join(words[1:])

    def findKey(self, key):
        return key in self.dict

    def markerForKey(self, key):
        return self.dict[key]


class SentenceToken:
    """Greedy longest-match tokenizer driven by the lexicon."""

    def __init__(self, lexicon):
        self.lexicon = lexicon

    def cut(self, sentence, result):
        # grow the prefix while it is still a word in the lexicon
        offset = 1
        max_len = len(sentence)
        while self.lexicon.findKey(sentence[:offset]):
            offset = offset + 1
            if offset > max_len:
                break
        sub = sentence[:offset - 1]
        if len(sub) > 0:
            result['words'].append(sub)
            result['phones'].append(self.lexicon.markerForKey(sub))
        else:
            # first character is not in the lexicon: give up on the rest
            return
        if offset > max_len:
            return
        # recurse on the remainder of the sentence
        self.cut(sentence[offset - 1:], result)

    def parse(self, sentence):
        result = {'words': [], 'phones': []}
        self.cut(sentence, result)
        result['words'] = ' '.join(result['words'])
        result['phones'] = ' '.join(result['phones'])
        return result


if __name__ == '__main__':
    lexiconpath = '{aishell_data}/lexicon.txt'
    work_path = './ST-CMDS-20170001_1-OS'
    lexicon = LexiconDict(lexiconpath)
    tokener = SentenceToken(lexicon)
    # result = tokener.parse('除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外'.replace(' ', '', 100))
    # print(result)
    for parent, dirnames, filenames in os.walk(work_path):
        for filename in filenames:
            if filename.endswith('.txt'):
                txt_path = os.path.join(parent, filename)
                f = open(txt_path, 'r')
                line = f.readline()
                result = tokener.parse(line)
                # write words + phones next to the wav as a .wav.trn label file
                out_path = txt_path.replace('.txt', '.wav.trn')
                out_f = open(out_path, 'w')
                out_f.write(result['words'] + '\n')
                out_f.write(result['phones'] + '\n')
                out_f.write(result['phones'])
                f.close()
                out_f.close()
This is fairly basic code: it segments words by greedy longest match against the lexicon, and at the same time emits the standard phones for each matched word.
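For a quick sanity check, a usage sketch (the input sentence is hypothetical; the actual segmentation depends on what lexicon.txt contains):

lexicon = LexiconDict('{aishell_data}/lexicon.txt')
tokener = SentenceToken(lexicon)
result = tokener.parse('北京欢迎你')  # hypothetical input sentence
print(result['words'])   # e.g. '北京 欢迎 你' if those words are in the lexicon
print(result['phones'])  # the matching phone strings looked up from lexicon.txt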
2. Syncing the data
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
#
# Copyright © 2018 xiaominfc <xiaominfc@xiaominfc-MBP.local>
#
# Distributed under terms of the MIT license.

"""
Convert ST-CMDS-20170001_1-OS into the data layout aishell training expects.
"""

import os
import os.path
import shutil

workpath = './ST-CMDS-20170001_1-OS'
targetpath = '{aishell_data}/wav/train'
sentence_f = open('./sentence.txt', 'w')

for parent, dirs, files in os.walk(workpath):
    for filename in files:
        if filename.endswith('.wav'):
            item = {}
            key = filename.replace('.wav', '')
            path = os.path.join(parent, filename)
            item['key'] = key
            item['path'] = path
            # read back the .wav.trn label written by the tokenizer script
            labelpath = os.path.join(parent, filename.replace('.wav', '.wav.trn'))
            f = open(labelpath, 'r')
            item['sentence'] = f.readline().replace('\n', '')
            item['pinyin'] = f.readline().replace('\n', '')
            f.close()
            sentence_f.write(key + ' ' + item['sentence'] + '\n')
            # the speaker id is a fixed slice of the ST-CMDS filename
            item['spk'] = filename[7:14]
            # one directory per speaker, as aishell expects
            if not os.path.exists(targetpath + '/' + item['spk']):
                os.makedirs(targetpath + '/' + item['spk'])
            shutil.copy(path, targetpath + '/' + item['spk'] + '/' + filename)

sentence_f.close()
In plain terms, this just copies the wav files into per-speaker directories and writes out the transcript text.
3. Syncing the other data source
import json
import os
import os.path
import shutil

from sentence_mark import LexiconDict, SentenceToken

lexiconpath = '{aishell_datapath}/resource_aishell/lexicon.txt'
lexicon = LexiconDict(lexiconpath)
tokener = SentenceToken(lexicon)

source_dir = './primewords_md_2018_set1'
targetpath = '{aishell_datapath}/data_aishell/wav/train'
sentence_f = open('{aishell_datapath}/data_aishell/transcript/sentence_primeword.txt', 'w')
json_f = open(source_dir + '/set1_transcript.json', 'r')
sentence_list = json.load(json_f)

count = 0
for item in sentence_list:
    filename = item['file']
    userid = item['user_id']
    # build a unique utterance id: 'PW' + zero-padded user id + zero-padded utterance id
    fileId = 'PW' + ('%06d' % int(userid)) + '%08d' % int(item['id'])
    text = item['text']
    # drop the pre-existing spaces, then re-segment with the lexicon tokenizer
    result = tokener.parse(text.replace(' ', '', 100))
    print(result['words'])
    # primewords nests its audio under the first one and first two characters of the filename
    audio_path = source_dir + '/audio_files/' + filename[0:1] + '/' + filename[0:2] + '/' + filename
    print(fileId)
    count = count + 1
    sentence_f.write(fileId + ' ' + result['words'] + '\n')
    # one directory per speaker, prefixed with 'TPW'
    p_path = targetpath + '/TPW' + ('%06d' % int(userid))
    if not os.path.exists(p_path):
        os.makedirs(p_path)
    shutil.copy(audio_path, p_path + '/' + fileId + '.wav')

sentence_f.close()
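To make the layout concrete with a made-up entry: for a JSON item with file 'abc001.wav', user_id 42 and id 7, the script reads ./primewords_md_2018_set1/audio_files/a/ab/abc001.wav, copies it to .../wav/train/TPW000042/PW00004200000007.wav, and writes the line 'PW00004200000007 <segmented words>' to sentence_primeword.txt.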
4. Merge the transcript text with aishell_transcript_v0.8.txt
Append the transcript lines generated above to that file; a sketch follows.
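A minimal sketch of this merge, assuming the sentence.txt and sentence_primeword.txt files produced by the scripts above and the same {aishell_data} placeholder used earlier:

# append the generated transcripts to aishell's master transcript file
transcript_path = '{aishell_data}/transcript/aishell_transcript_v0.8.txt'
extra_files = ['./sentence.txt',
               '{aishell_data}/transcript/sentence_primeword.txt']
with open(transcript_path, 'a') as out_f:
    for extra in extra_files:
        with open(extra, 'r') as in_f:
            out_f.write(in_f.read())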
Conclusion
Not much more to say, the code speaks for itself, haha. The dataset now totals close to 400 hours; let's train and see how it goes.