# Create the host directory that will be shared with the container.
# NOTE(review): this path is relative to the current directory, while the -v
# option below mounts the absolute path /home/root/tensorflowtts; the two only
# refer to the same directory when this is run from /home/root - confirm.
mkdir tensorflowtts
# Start an interactive (-it) container named "tensorflowtts" from the
# langjingxiang/tensorflowtts:latest image, publishing ports 7000 and 7001 and
# mounting /home/root/tensorflowtts from the host at /workspace in the container.
nvidia-docker run --name tensorflowtts -p 7000:7000 -p 7001:7001 -v /home/root/tensorflowtts:/workspace -it langjingxiang/tensorflowtts:latest
# Next, attach a shell to the running container:
# Open an interactive bash shell inside a running container.
# "containerID" is a placeholder: substitute the real container ID or name
# (as shown by `docker ps`).
docker exec -it containerID bash
# Inside the container, run the following Python script to serve the models over HTTP:
"""Small Flask service that turns text into a WAV file with TensorFlowTTS.

At import time the script loads two text-to-mel models (Tacotron2 and
FastSpeech2), a Multi-band MelGAN vocoder and the text processor. The
``/api`` endpoint synthesizes the posted text, writes the audio under
``./static`` and returns the URL of the generated file.
"""
import json
import os
import uuid

# CUDA_VISIBLE_DEVICES takes device indices (or UUIDs), not TensorFlow device
# strings such as '/device:GPU:0'. It is set before TensorFlow is imported so
# that it is in place before any GPU initialisation can happen.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import tensorflow as tf  # noqa: E402
import yaml  # noqa: E402,F401
import numpy as np  # noqa: E402,F401
import matplotlib.pyplot as plt  # noqa: E402,F401
import IPython.display as ipd  # noqa: E402,F401
import librosa  # noqa: E402,F401
import soundfile  # noqa: E402
from flask import Flask, request, jsonify  # noqa: E402
from tensorflow_tts.inference import AutoConfig  # noqa: E402
from tensorflow_tts.inference import TFAutoModel  # noqa: E402
from tensorflow_tts.inference import AutoProcessor  # noqa: E402

# Directory the generated WAV files are written to.
STATIC_DIR = "./static"
# Base URL under which STATIC_DIR is reachable by clients.
# NOTE(review): hard-coded host address; adjust for the deployment.
STATIC_BASE_URL = "http://172.31.18.90:7001/static"
# Sample rate passed to soundfile when writing the audio.
# NOTE(review): presumably the rate the Baker models were trained at - confirm.
SAMPLE_RATE = 24000

tacotron2_config = AutoConfig.from_pretrained(
    './examples/tacotron2/conf/tacotron2.baker.v1.yaml'
)
tacotron2 = TFAutoModel.from_pretrained(
    config=tacotron2_config,
    pretrained_path="tacotron2-100k.h5",
    name="tacotron2"
)

fastspeech2_config = AutoConfig.from_pretrained(
    './examples/fastspeech2/conf/fastspeech2.baker.v2.yaml'
)
fastspeech2 = TFAutoModel.from_pretrained(
    config=fastspeech2_config,
    pretrained_path="fastspeech2-200k.h5",
    name="fastspeech2"
)

mb_melgan_config = AutoConfig.from_pretrained(
    './examples/multiband_melgan/conf/multiband_melgan.baker.v1.yaml'
)
mb_melgan = TFAutoModel.from_pretrained(
    config=mb_melgan_config,
    pretrained_path="mb.melgan-920k.h5",
    name="mb_melgan"
)

processor = AutoProcessor.from_pretrained(pretrained_path="./baker_mapper.json")


def do_synthesis(input_text, text2mel_name):
    """Synthesize ``input_text`` and return the audio samples as a NumPy array.

    Args:
        input_text: Text to convert; it is passed to the processor's
            ``text_to_sequence`` to obtain the model input ids.
        text2mel_name: Which text-to-mel model to use, ``"tacotron2"`` or
            ``"fastspeech2"``. Matching is case-insensitive.

    Returns:
        The result of calling ``.numpy()`` on the sliced vocoder output.

    Raises:
        ValueError: If ``text2mel_name`` names neither supported model.
    """
    model_key = str(text2mel_name).lower()
    input_ids = processor.text_to_sequence(input_text, inference=True)

    if model_key == "tacotron2":
        _, mel_outputs, stop_token_prediction, alignment_history = tacotron2.inference(
            # expand_dims adds a leading axis of size 1 in front of the ids.
            tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0),
            tf.convert_to_tensor([len(input_ids)], tf.int32),
            tf.convert_to_tensor([0], dtype=tf.int32)
        )
        # Number of trailing samples dropped from the vocoder output.
        # NOTE(review): presumably trims a Tacotron2 end-of-utterance artifact.
        remove_end = 1024
    elif model_key == "fastspeech2":
        mel_before, mel_outputs, duration_outputs, _, _ = fastspeech2.inference(
            tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0),
            speaker_ids=tf.convert_to_tensor([0], dtype=tf.int32),
            speed_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
            f0_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
            energy_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
        )
        remove_end = 1
    else:
        raise ValueError("Only TACOTRON, FASTSPEECH2 are supported on text2mel_name")

    # First item along axis 0, index 0 along the last axis, minus the last
    # `remove_end` entries along axis 1.
    audio = mb_melgan.inference(mel_outputs)[0, :-remove_end, 0]
    return audio.numpy()


app = Flask(__name__)


@app.route('/')
def hello_world():
    """Return a fixed greeting; usable as a simple liveness check."""
    return 'Hello, World! '


@app.route('/api', methods=['POST'])
def text2wav():
    """Synthesize the posted text and return the URL of the written WAV file.

    The request body must be a JSON object with the keys ``text`` and
    ``mode_name`` (``"tacotron2"`` or ``"fastspeech2"``).

    Returns:
        The URL string of the generated file on success, or a JSON error
        payload with HTTP status 400 when the request is malformed or names
        an unsupported model.
    """
    try:
        data = json.loads(request.get_data(as_text=True))
        text = data['text']
        mode_name = data['mode_name']
    except (ValueError, KeyError, TypeError):
        # Invalid JSON (JSONDecodeError is a ValueError), a missing key, or a
        # body that is not a JSON object.
        return jsonify(
            error='Request body must be a JSON object with "text" and "mode_name"'
        ), 400

    # Reject unknown models up front so that only client mistakes map to 400;
    # errors raised during inference still surface as server errors.
    if str(mode_name).lower() not in ("tacotron2", "fastspeech2"):
        return jsonify(
            error="Only TACOTRON, FASTSPEECH2 are supported on text2mel_name"
        ), 400

    audios = do_synthesis(text, mode_name)

    # A random UUID keeps file names unique across requests.
    wav_id = uuid.uuid4()
    os.makedirs(STATIC_DIR, exist_ok=True)
    soundfile.write("{0}/{1}.wav".format(STATIC_DIR, wav_id), audios, SAMPLE_RATE)
    return "{0}/{1}.wav".format(STATIC_BASE_URL, wav_id)


if __name__ == "__main__":
    app.run(port=7001, host='0.0.0.0')
{"text": " Hello ", "mode_name": "fastspeech2"}