基于Tacotron模型实现中文语音合成

# 基于Tacotron模型实现中文语音合成

1.数据准备

1.1 单个女生语音数据10000句

# https://www.data-baker.com/open_source.html
mkdir tts_tacotron
cd tts_tacotron
wget "https://weixinxcxdb.oss-cn-beijing.aliyuncs.com/gwYinPinKu/BZNSYP.rar"
# 解压之

./BZNSYP
├── PhoneLabeling
├── ProsodyLabeling
└── Wave

1.1.1 标注(ProsodyLabeling/000001-010000.txt)

head ProsodyLabeling/000001-010000.txt

000001	卡尔普#2陪外孙#1玩滑梯#4。
	ka2 er2 pu3 pei2 wai4 sun1 wan2 hua2 ti1
000002	假语村言#2别再#1拥抱我#4。
	jia2 yu3 cun1 yan2 bie2 zai4 yong1 bao4 wo3
000003	宝马#1配挂#1跛骡鞍#3，貂蝉#1怨枕#2董翁榻#4。
	bao2 ma3 pei4 gua4 bo3 luo2 an1 diao1 chan2 yuan4 zhen3 dong3 weng1 ta4
000004	邓小平#2与#1撒切尔#2会晤#4。
	deng4 xiao3 ping2 yu3 sa4 qie4 er3 hui4 wu4
000005	老虎#1幼崽#2与#1宠物犬#1玩耍#4。
	lao2 hu3 you4 zai3 yu2 chong3 wu4 quan3 wan2 shua3

1.1.2 语音(Wave/*.wav)

ls Wave | head
000001.wav
000002.wav
000003.wav
000004.wav
000005.wav
000006.wav
000007.wav
000008.wav
000009.wav
000010.wav

1.2 中文字典素材的准备

# DaCiDian (https://github.com/aishell-foundation/DaCiDian)
这是我在 Kaldi 项目中接触到的一个比较不错的词典，用于分词以及拼音的标注。
这里不介绍准备的过程了，到时候直接用转化好的文件。

2 模型的准备

# Tacotron 模型的实现 (https://github.com/keithito/tacotron)
git clone https://github.com/keithito/tacotron.git

# 这是个默认合成英文的模型，有空可以试试，效果还行；作者也提供了训练好的模型用于测试

2.1 关于中文合成的相关修改

2.1.1 输入特征的相关准备

我们这里的思路是：先把一个句子用拼音标注好，再通过“拼音转声母韵母”的字典转成声母韵母的序列。

# file: text/symbols.py 追加

# Load the phone inventory from the `zh_lang/phones` file located next to this
# module: one symbol per line, surrounding whitespace stripped. The `with`
# block guarantees the file is closed even if reading fails.
with open(os.path.split(os.path.realpath(__file__))[0] + '/zh_lang/phones', 'r', encoding='utf-8') as _phones_f:
    _phones = [_line.strip() for _line in _phones_f]

# Pinyin symbol table: `_pad` and `_eos` first, followed by the phones.
pinyin_symbols = [_pad, _eos] + _phones
# `symbols` is rebound to the pinyin table.
symbols = pinyin_symbols

# 上述文件包含在我上传的代码中，这样符号表就建立好了

另外多写了一个文件，把分好词的语句转成拼音，做合成的时候要用。

# file: text/pinyinconvert.py

"""

"""

import os

# Word -> pronunciation lookup table, filled once at import time from the
# tab-separated `zh_lang/align_lexicon.txt` file located next to this module.
wordsDict = {}
# The lexicon contains Chinese text, so the encoding is stated explicitly
# instead of relying on the locale default; `with` closes the file handle.
with open(os.path.split(os.path.realpath(__file__))[0] + '/zh_lang/align_lexicon.txt', 'r', encoding='utf-8') as _align_f:
    for _line in _align_f:
        _parts = _line.strip().split('\t')
        # Only well-formed "word<TAB>pronunciation" lines are used, and the
        # first entry seen for a word wins.
        if len(_parts) == 2 and _parts[0] not in wordsDict:
            wordsDict[_parts[0]] = _parts[1]


def sentence_to_pinyin(sentence):
    """Translate a pre-segmented sentence into its lexicon pronunciation.

    ``sentence`` is split on single spaces. Every token found in
    ``wordsDict`` contributes its dictionary entry to the result; tokens
    that are not in the dictionary are dropped silently. A space is
    inserted before an entry whenever the result is already non-empty.

    Returns:
        The concatenated pronunciation string (``""`` if nothing matched).
    """
    pinyin = ""
    for token in sentence.split(' '):
        if token not in wordsDict:
            continue
        if pinyin:
            pinyin += ' '
        pinyin += wordsDict[token]
    return pinyin

2.1.2 数据集的预处理

在dataset中多加入一种新的数据集处理方式

# file: datasets/bznsyp.py

from concurrent.futures import ProcessPoolExecutor
from functools import partial
import numpy as np
import os
from util import audio
from text.pinyinconvert import sentence_to_pinyin
from text.symbols import pinyin_dict


# 声母韵母化bznsyp中的拼音 对一些做特殊处理
def _format_pinyin(value):
    if len(value) == 1:
        return value.lower()

    if value == 'IY1':
        return 'i_1'
    key = value[:-1]
    end = value[-1]
    if key in pinyin_dict:
        fpinyin = pinyin_dict[key]
        if end == '5':
            end = '0'
        fpinyin = fpinyin + '_' + end
        return fpinyin
    elif key[-1] == 'r':
        return _format_pinyin(key[:-1] + end) + ' er_0'
    else :
        print('not',value)





# Convert a pinyin label into a phone sequence.
def _convet_pinyin(label):
    """Convert every token of a space-separated pinyin label.

    ``label`` is split on single spaces and each token is passed through
    ``_format_pinyin``. A space is inserted before a converted token
    whenever the result built so far is non-empty.

    Returns:
        The converted tokens concatenated into one string.
    """
    converted = ''
    for token in label.split(' '):
        separator = ' ' if converted else ''
        converted += separator + _format_pinyin(token)
    return converted


def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
    """Preprocess the BZNSYP corpus located under ``in_dir``.

    The label file ``ProsodyLabeling/000001-010000.txt`` is read two lines
    at a time: the first line of a pair is tab-separated and starts with the
    utterance id, the second line holds the pinyin for that utterance. The
    pinyin is converted with ``_convet_pinyin`` and each utterance is handed
    to ``_process_utterance`` in a process pool.

    Args:
        in_dir: Dataset root containing ``ProsodyLabeling`` and ``Wave``.
        out_dir: Output directory, passed through to ``_process_utterance``.
        num_workers: ``max_workers`` of the process pool.
        tqdm: Callable wrapped around the list of futures while results are
            collected (e.g. a progress bar); identity by default.

    Returns:
        The ``_process_utterance`` results, in label-file order.

    Raises:
        ValueError: If an utterance id is not followed by a pinyin line.
    """
    label_path = os.path.join(in_dir, 'ProsodyLabeling/000001-010000.txt')
    futures = []
    # Both context managers guarantee cleanup (file handle, worker
    # processes) even when parsing or a worker raises.
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        with open(label_path, encoding='utf-8') as f:
            for header in f:
                name = header.strip('\n').split('\t')[0]
                if not name.strip():
                    # Blank line (e.g. at the end of the file): no utterance.
                    continue
                # The pinyin sits on the line right after the id line.
                label = next(f, '').strip()
                if not label:
                    raise ValueError('missing pinyin line for utterance %r' % (name,))
                text = _convet_pinyin(label)
                wav_path = os.path.join(in_dir, 'Wave/' + name + '.wav')
                futures.append(executor.submit(_process_utterance, out_dir, name, wav_path, text))
        return [future.result() for future in tqdm(futures)]




def _process_utterance(out_dir, name, wav_path, text):
    """Compute and store the spectrogram features of one utterance.

    The wav file is loaded through ``audio.load_wav``; the results of
    ``audio.spectrogram`` and ``audio.melspectrogram`` are cast to float32
    and written, transposed, to ``out_dir`` as ``.npy`` files.

    Args:
        out_dir: Directory that receives the ``.npy`` files.
        name: Utterance id, used to build the output file names.
        wav_path: Path of the audio file to load.
        text: Label passed through unchanged into the returned tuple.

    Returns:
        ``(spectrogram_filename, mel_filename, n_frames, text)`` where
        ``n_frames`` is ``shape[1]`` of the (untransposed) spectrogram.
    """
    samples = audio.load_wav(wav_path)
    linear = audio.spectrogram(samples).astype(np.float32)
    frame_count = linear.shape[1]
    mel = audio.melspectrogram(samples).astype(np.float32)

    linear_name = 'bznsyp-spec-%s.npy' % name
    mel_name = 'bznsyp-mel-%s.npy' % name
    for file_name, features in ((linear_name, linear), (mel_name, mel)):
        np.save(os.path.join(out_dir, file_name), features.T, allow_pickle=False)
    return (linear_name, mel_name, frame_count, text)

# file: preprocess.py

# Preprocessing entry point added for the BZNSYP dataset.
def preprocess_bznsyp(args):
  """Run BZNSYP preprocessing and write the resulting metadata.

  The dataset root is taken from the ``BZNSYP_DIR`` environment variable
  when it is set; otherwise the original hard-coded path is used, so
  existing setups keep working.

  Args:
    args: Parsed command-line arguments; ``base_dir``, ``output`` and
      ``num_workers`` are read from it.
  """
  # Default is the author's local path; set BZNSYP_DIR to point at your own copy.
  in_dir = os.environ.get('BZNSYP_DIR', '/mnt/ml_data/download/BZNSYP')
  out_dir = os.path.join(args.base_dir, args.output)
  os.makedirs(out_dir, exist_ok=True)
  metadata = bznsyp.build_from_path(in_dir, out_dir, args.num_workers, tqdm=tqdm)
  write_metadata(metadata, out_dir)


# 类似 ljspeech，在 main 函数中添加相应的处理分支，使之生效

这样代码基本就准备完成了，然后参考 README 执行相应的步骤来训练。

# 数据预处理

python3 preprocess.py --dataset bznsyp 
# 等进度完成

python3 train.py

# 等!!!!!!

# 或者

nohup  python3 train.py &

# 等!!!!!
# 默认~/tacotron/logs-tacotron 每1000步就会保存一次训练的模型

python3 demo_server.py --checkpoint {model_path}

# 1. 访问http://127.0.0.1:9000 
# 2. 先输入分好词的中文语句，点击 convert 转成声母韵母序列
# 3. 点击 Speak，等一下就有结果了（有点麻烦，不过是为了有再次校正拼音的机会）
# 4. 可以听到声音了

感谢你的阅读，下面是我改的源码：

https://github.com/xiaominfc/tacotron

1.数据准备

1.1 单个女生语音数据10000句

1.1.1 标注(ProsodyLabeling/000001-010000.txt)

1.1.2 语音(Wave/*.wav)

1.2 中文字典素材的准备

2 模型的准备

2.1 关于中文合成的相关修改

2.1.1 输入特征的相关准备

2.1.2 数据集的预处理

感谢你的阅读 下面是我改的源码

感谢你的阅读下面是我改的源码