Training with extended aishell data

The aishell s5 recipe in Kaldi ships with about 178 hours of training data. With a few modifications it will train a model, but the recognition accuracy is not that good, so I did some work to extend the training data.

Data format required by aishell

1. Audio

Directories are organized per speaker: all audio files recorded by the same person go into the same directory, for example:
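
A hypothetical layout (speaker and utterance ids here are only for illustration):

wav/train/S0002/BAC009S0002W0122.wav
wav/train/S0002/BAC009S0002W0123.wav
wav/train/S0003/BAC009S0003W0121.wav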

2. Transcripts

The aishell transcripts are simple: everything is in transcript/aishell_transcript_v0.8.txt. The format is the file name (without extension), a space, and the word-segmented sentence, for example:
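
Each line looks roughly like this (the utterance id and sentence here are illustrative):

BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购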

Other data sources

These also come from OpenSLR:

Free ST Chinese Mandarin Corpus (SLR38)

Primewords Chinese Corpus Set 1 (SLR47)

Preparation

1. Word segmentation code:

#! /usr/bin/env python
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
#
# Copyright © 2018 xiaominfc <xiaominfc@xiaominfc-MBP.local>
#
# Distributed under terms of the MIT license.

"""
convert sentece to wrods and mark pingyin
"""


import os
import os.path


class LexiconDict:
    """Load a word -> phone-sequence mapping from a Kaldi-style lexicon.txt."""

    def __init__(self, path):
        self.dict = {}
        with open(path, 'r') as f:
            for line in f:
                line = line.replace('\n', '')
                words = line.split(' ')
                # the first field is the word, the rest are its phones
                self.dict[words[0]] = ' '.join(words[1:])

    def findKey(self, key):
        return key in self.dict

    def markerForKey(self, key):
        return self.dict[key]


class SentenceToken:
    """Greedy maximum-length (longest-prefix) matching against the lexicon."""

    def __init__(self, lexicon):
        self.lexicon = lexicon

    def cut(self, sentence, result):
        # grow the prefix while it is still a word in the lexicon
        offset = 1
        max_len = len(sentence)
        while self.lexicon.findKey(sentence[:offset]):
            offset = offset + 1
            if offset > max_len:
                break
        sub = sentence[:offset-1]
        if len(sub) > 0:
            result['words'].append(sub)
            result['phones'].append(self.lexicon.markerForKey(sub))
        else:
            # the leading character is not in the lexicon; give up on the rest
            return
        if offset > max_len:
            return
        # recurse on the remainder of the sentence
        self.cut(sentence[offset-1:], result)

    def parse(self, sentence):
        result = {'words': [], 'phones': []}
        self.cut(sentence, result)
        result['words'] = ' '.join(result['words'])
        result['phones'] = ' '.join(result['phones'])
        return result

if __name__ == '__main__':
    # placeholder path to the aishell lexicon, plus the unpacked Free ST corpus
    lexiconpath = '{aishell_data}/lexicon.txt'
    work_path = './ST-CMDS-20170001_1-OS'
    lexicon = LexiconDict(lexiconpath)
    tokener = SentenceToken(lexicon)
    #result = tokener.parse('除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外'.replace(' ','',100))
    #print(result)
    for parent, dirnames, filenames in os.walk(work_path):
        for filename in filenames:
            if filename.endswith('.txt'):
                txt_path = os.path.join(parent, filename)
                f = open(txt_path, 'r')
                line = f.readline().strip()
                result = tokener.parse(line)
                # write a .wav.trn file next to the audio: the words on the
                # first line, the phone sequence on the next two
                out_path = txt_path.replace('.txt', '.wav.trn')
                out_f = open(out_path, 'w')
                out_f.write(result['words'] + '\n')
                out_f.write(result['phones'] + '\n')
                out_f.write(result['phones'])
                f.close()
                out_f.close()




This is basic code: it segments sentences with a dictionary plus maximum-length (longest-prefix) matching, and also emits the standard phones looked up from the lexicon.
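
A small usage sketch, based on the commented-out example inside the script above (it assumes every word of the sentence appears in lexicon.txt):

lexicon = LexiconDict('{aishell_data}/lexicon.txt')
tokener = SentenceToken(lexicon)
result = tokener.parse('除了北京上海广州深圳四个一线城市和三亚之外')
# result['words'] should come out roughly as
# '除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外'
# result['phones'] holds the matching phone sequences from lexicon.txt, joined with spaces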

2. Syncing the data

#! /usr/bin/env python
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
#
# Copyright © 2018 xiaominfc <xiaominfc@xiaominfc-MBP.local>
#
# Distributed under terms of the MIT license.

"""
Convert ST-CMDS-20170001_1-OS into the data format needed for aishell training
"""

import os
import os.path
import shutil
workpath = './ST-CMDS-20170001_1-OS'
# placeholder: the aishell training wav directory
targetpath = '{aishell_data}/wav/train'
sentence_f = open('./sentence.txt', 'w')
for parent, dirs, files in os.walk(workpath):
    for filename in files:
        if filename.endswith('.wav'):
            item = {}
            key = filename.replace('.wav', '')
            path = os.path.join(parent, filename)
            item['key'] = key
            item['path'] = path
            # read the .wav.trn file generated by the segmentation script above
            labelpath = os.path.join(parent, filename.replace('.wav', '.wav.trn'))
            f = open(labelpath, 'r')
            item['sentence'] = f.readline().replace("\n", '')
            item['pinyin'] = f.readline().replace("\n", '')
            f.close()
            sentence_f.write(key + " " + item['sentence'] + '\n')
            # derive a per-speaker directory name from the file name
            item['spk'] = filename[7:14]
            if not os.path.exists(targetpath + "/" + item['spk']):
                os.makedirs(targetpath + "/" + item['spk'])
            shutil.copy(path, targetpath + "/" + item['spk'] + '/' + filename)



sentence_f.close()

In plain terms, this just copies the audio files and generates the transcript text.

3. Syncing the other data source


import json
import os
import os.path
import shutil

from sentence_mark import LexiconDict,SentenceToken 



# placeholder: the lexicon shipped with resource_aishell
lexiconpath = '{aishell_datapath}/resource_aishell/lexicon.txt'

lexicon = LexiconDict(lexiconpath)
tokener = SentenceToken(lexicon)

source_dir = './primewords_md_2018_set1'
# placeholders: the aishell training wav directory and the transcript output file
targetpath = '{aishell_datapath}/data_aishell/wav/train'
sentence_f = open('{aishell_datapath}/data_aishell/transcript/sentence_primeword.txt', 'w')
json_f = open(source_dir + '/set1_transcript.json', 'r')
sentence_list = json.load(json_f)

count = 0
for item in sentence_list:
    filename = item['file']
    userid = item['user_id']
    # build an aishell-style utterance id: PW + speaker id + utterance id
    fileId = 'PW' + ("%06d" % int(userid)) + "%08d" % int(item['id'])
    text = item['text']
    # drop the existing spaces and re-segment with the lexicon-based tokenizer
    result = tokener.parse(text.replace(' ', '', 100))
    print(result['words'])
    # the audio path is built from the first one and two characters of the file name
    audio_path = source_dir + '/audio_files/' + filename[0:1] + '/' + filename[0:2] + '/' + filename
    print(fileId)
    count = count + 1
    sentence_f.write(fileId + " " + result['words'] + '\n')
    # one directory per speaker, prefixed with TPW
    p_path = targetpath + "/TPW" + ("%06d" % int(userid))
    if not os.path.exists(p_path):
        os.makedirs(p_path)
    shutil.copy(audio_path, p_path + "/" + fileId + '.wav')

sentence_f.close()
json_f.close()
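
For reference, the script only relies on the file, user_id, id, and text fields of each entry in set1_transcript.json. A hypothetical entry (all values made up) looks roughly like this:

{
    "file": "hx87riah.wav",
    "user_id": 123,
    "id": 45678,
    "text": "今天 天气 不错"
}

With these values the audio would be copied to {aishell_datapath}/data_aishell/wav/train/TPW000123/PW00012300045678.wav, and the line "PW00012300045678 今天 天气 不错" (after re-segmentation) would be written to sentence_primeword.txt.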


4. Merge the generated transcripts with aishell_transcript_v0.8.txt

Append the transcripts generated above to it.
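
A minimal sketch of this step in Python, assuming the sentence.txt and sentence_primeword.txt files produced in steps 2 and 3 and the same placeholder paths as above:

# append the newly generated transcripts to the aishell transcript file
transcript = '{aishell_datapath}/data_aishell/transcript/aishell_transcript_v0.8.txt'
extras = ['./sentence.txt',
          '{aishell_datapath}/data_aishell/transcript/sentence_primeword.txt']
with open(transcript, 'a') as out_f:
    for extra in extras:
        with open(extra, 'r') as in_f:
            out_f.write(in_f.read())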

Conclusion

I don't want to say too much, so I'll just post the code, haha. The dataset now adds up to nearly 400 hours; let's train and see.

PS: the results are in. The error rate is 0.5% lower than before. Okay, not very noticeable; it needs more speech data to train on.

