Training with extended aishell data

The aishell s5 recipe in Kaldi ships with about 178 hours of training data. With a few modifications it will train a model, but the recognition accuracy is not that good, so I did some work to extend the training data.

Data format required by aishell

1. Audio

Directories are organized per speaker: all audio files recorded by the same person go into the same directory, for example:
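
A hypothetical layout (speaker and utterance ids here are only for illustration):

wav/train/S0002/BAC009S0002W0122.wav
wav/train/S0002/BAC009S0002W0123.wav
wav/train/S0003/BAC009S0003W0121.wav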

2. Transcripts

The aishell transcripts are simple: everything is in transcript/aishell_transcript_v0.8.txt. The format is the file name (without extension), a space, and the word-segmented sentence, for example:
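
Each line looks roughly like this (the utterance id and sentence here are illustrative):

BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购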

Other data sources

These also come from OpenSLR:

Free ST Chinese Mandarin Corpus (SLR38)

Primewords Chinese Corpus Set 1 (SLR47)

Preparation

1. Word segmentation code:

#! /usr/bin/env python
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
#
# Copyright © 2018 xiaominfc <xiaominfc@xiaominfc-MBP.local>
#
# Distributed under terms of the MIT license.

"""
convert sentece to wrods and mark pingyin
"""


import os
import os.path


class LexiconDict:
    """Load a word -> phone-sequence mapping from a Kaldi-style lexicon.txt."""

    def __init__(self, path):
        self.dict = {}
        with open(path, 'r') as f:
            for line in f:
                line = line.replace('\n', '')
                words = line.split(' ')
                # the first field is the word, the rest are its phones
                self.dict[words[0]] = ' '.join(words[1:])

    def findKey(self, key):
        return key in self.dict

    def markerForKey(self, key):
        return self.dict[key]


class SentenceToken:
    """Greedy maximum-length (longest-prefix) matching against the lexicon."""

    def __init__(self, lexicon):
        self.lexicon = lexicon

    def cut(self, sentence, result):
        # grow the prefix while it is still a word in the lexicon
        offset = 1
        max_len = len(sentence)
        while self.lexicon.findKey(sentence[:offset]):
            offset = offset + 1
            if offset > max_len:
                break
        sub = sentence[:offset-1]
        if len(sub) > 0:
            result['words'].append(sub)
            result['phones'].append(self.lexicon.markerForKey(sub))
        else:
            # the leading character is not in the lexicon; give up on the rest
            return
        if offset > max_len:
            return
        # recurse on the remainder of the sentence
        self.cut(sentence[offset-1:], result)

    def parse(self, sentence):
        result = {'words': [], 'phones': []}
        self.cut(sentence, result)
        result['words'] = ' '.join(result['words'])
        result['phones'] = ' '.join(result['phones'])
        return result

if __name__ == '__main__':
    # placeholder path to the aishell lexicon, plus the unpacked Free ST corpus
    lexiconpath = '{aishell_data}/lexicon.txt'
    work_path = './ST-CMDS-20170001_1-OS'
    lexicon = LexiconDict(lexiconpath)
    tokener = SentenceToken(lexicon)
    #result = tokener.parse('除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外'.replace(' ','',100))
    #print(result)
    for parent, dirnames, filenames in os.walk(work_path):
        for filename in filenames:
            if filename.endswith('.txt'):
                txt_path = os.path.join(parent, filename)
                f = open(txt_path, 'r')
                line = f.readline().strip()
                result = tokener.parse(line)
                # write a .wav.trn file next to the audio: the words on the
                # first line, the phone sequence on the next two
                out_path = txt_path.replace('.txt', '.wav.trn')
                out_f = open(out_path, 'w')
                out_f.write(result['words'] + '\n')
                out_f.write(result['phones'] + '\n')
                out_f.write(result['phones'])
                f.close()
                out_f.close()




This is basic code: it segments sentences with a dictionary plus maximum-length (longest-prefix) matching, and also emits the standard phones looked up from the lexicon.
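
A small usage sketch, based on the commented-out example inside the script above (it assumes every word of the sentence appears in lexicon.txt):

lexicon = LexiconDict('{aishell_data}/lexicon.txt')
tokener = SentenceToken(lexicon)
result = tokener.parse('除了北京上海广州深圳四个一线城市和三亚之外')
# result['words'] should come out roughly as
# '除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外'
# result['phones'] holds the matching phone sequences from lexicon.txt, joined with spaces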

2. Syncing the data

#! /usr/bin/env python
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
#
# Copyright © 2018 xiaominfc <xiaominfc@xiaominfc-MBP.local>
#
# Distributed under terms of the MIT license.

"""
Convert ST-CMDS-20170001_1-OS into the data format needed for aishell training
"""

import os
import os.path
import shutil
workpath = './ST-CMDS-20170001_1-OS'
# placeholder: the aishell training wav directory
targetpath = '{aishell_data}/wav/train'
sentence_f = open('./sentence.txt', 'w')
for parent, dirs, files in os.walk(workpath):
    for filename in files:
        if filename.endswith('.wav'):
            item = {}
            key = filename.replace('.wav', '')
            path = os.path.join(parent, filename)
            item['key'] = key
            item['path'] = path
            # read the .wav.trn file generated by the segmentation script above
            labelpath = os.path.join(parent, filename.replace('.wav', '.wav.trn'))
            f = open(labelpath, 'r')
            item['sentence'] = f.readline().replace("\n", '')
            item['pinyin'] = f.readline().replace("\n", '')
            f.close()
            sentence_f.write(key + " " + item['sentence'] + '\n')
            # derive a per-speaker directory name from the file name
            item['spk'] = filename[7:14]
            if not os.path.exists(targetpath + "/" + item['spk']):
                os.makedirs(targetpath + "/" + item['spk'])
            shutil.copy(path, targetpath + "/" + item['spk'] + '/' + filename)



sentence_f.close()

In plain terms, this just copies the audio files and generates the transcript text.

3. Syncing the other data source


import json
import os
import os.path
import shutil

from sentence_mark import LexiconDict,SentenceToken 



# placeholder: the lexicon shipped with resource_aishell
lexiconpath = '{aishell_datapath}/resource_aishell/lexicon.txt'

lexicon = LexiconDict(lexiconpath)
tokener = SentenceToken(lexicon)

source_dir = './primewords_md_2018_set1'
# placeholders: the aishell training wav directory and the transcript output file
targetpath = '{aishell_datapath}/data_aishell/wav/train'
sentence_f = open('{aishell_datapath}/data_aishell/transcript/sentence_primeword.txt', 'w')
json_f = open(source_dir + '/set1_transcript.json', 'r')
sentence_list = json.load(json_f)

count = 0
for item in sentence_list:
    filename = item['file']
    userid = item['user_id']
    # build an aishell-style utterance id: PW + speaker id + utterance id
    fileId = 'PW' + ("%06d" % int(userid)) + "%08d" % int(item['id'])
    text = item['text']
    # drop the existing spaces and re-segment with the lexicon-based tokenizer
    result = tokener.parse(text.replace(' ', '', 100))
    print(result['words'])
    # the audio path is built from the first one and two characters of the file name
    audio_path = source_dir + '/audio_files/' + filename[0:1] + '/' + filename[0:2] + '/' + filename
    print(fileId)
    count = count + 1
    sentence_f.write(fileId + " " + result['words'] + '\n')
    # one directory per speaker, prefixed with TPW
    p_path = targetpath + "/TPW" + ("%06d" % int(userid))
    if not os.path.exists(p_path):
        os.makedirs(p_path)
    shutil.copy(audio_path, p_path + "/" + fileId + '.wav')

sentence_f.close()
json_f.close()
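
For reference, the script only relies on the file, user_id, id, and text fields of each entry in set1_transcript.json. A hypothetical entry (all values made up) looks roughly like this:

{
    "file": "hx87riah.wav",
    "user_id": 123,
    "id": 45678,
    "text": "今天 天气 不错"
}

With these values the audio would be copied to {aishell_datapath}/data_aishell/wav/train/TPW000123/PW00012300045678.wav, and the line "PW00012300045678 今天 天气 不错" (after re-segmentation) would be written to sentence_primeword.txt.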


4. Merge the generated transcripts with aishell_transcript_v0.8.txt

Append the transcripts generated above to it.
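
A minimal sketch of this step in Python, assuming the sentence.txt and sentence_primeword.txt files produced in steps 2 and 3 and the same placeholder paths as above:

# append the newly generated transcripts to the aishell transcript file
transcript = '{aishell_datapath}/data_aishell/transcript/aishell_transcript_v0.8.txt'
extras = ['./sentence.txt',
          '{aishell_datapath}/data_aishell/transcript/sentence_primeword.txt']
with open(transcript, 'a') as out_f:
    for extra in extras:
        with open(extra, 'r') as in_f:
            out_f.write(in_f.read())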

Conclusion

I don't want to say too much, so I'll just post the code, haha. The dataset now adds up to nearly 400 hours; let's train and see.

PS: the results are in. The error rate is 0.5% lower than before. Okay, not very noticeable; it needs more speech data to train on.

