Spaces:

akhaliq
/

Real-Time-Voice-Cloning

Runtime error

Real-Time-Voice-Cloning / synthesizer /synthesize.py

akhaliq3

spaces demo

24829a1 about 4 years ago

4.32 kB

	import torch
	from torch.utils.data import DataLoader
	from synthesizer.hparams import hparams_debug_string
	from synthesizer.synthesizer_dataset import SynthesizerDataset, collate_synthesizer
	from synthesizer.models.tacotron import Tacotron
	from synthesizer.utils.text import text_to_sequence
	from synthesizer.utils.symbols import symbols
	import numpy as np
	from pathlib import Path
	from tqdm import tqdm
	import platform

	def run_synthesis(in_dir, out_dir, model_dir, hparams):
	    """Generate ground truth-aligned (GTA) mels for vocoder training.

	    Runs the Tacotron synthesizer over the dataset described by
	    ``<in_dir>/train.txt`` and saves one generated mel spectrogram per entry
	    under ``<out_dir>/mels_gta``, together with a pipe-delimited metadata file
	    ``<out_dir>/synthesized.txt``.

	    Args:
	        in_dir: Directory containing ``train.txt`` plus the ``mels`` and
	            ``embeds`` sub-directories passed to ``SynthesizerDataset``.
	        out_dir: Directory receiving ``mels_gta/`` and ``synthesized.txt``.
	            It is created if it does not exist.
	        model_dir: Directory holding the weights. The file loaded is
	            ``<model_dir>/<model_dir.stem>.pt``.
	        hparams: Hyperparameter object; the ``tts_*``, ``num_mels``,
	            ``speaker_embedding_size`` and ``synthesis_batch_size``
	            attributes are read here.

	    Raises:
	        ValueError: If CUDA is available and ``hparams.synthesis_batch_size``
	            is not a multiple of the number of visible GPUs.
	    """
	    out_dir = Path(out_dir)
	    synth_dir = out_dir.joinpath("mels_gta")
	    # parents=True so that a not-yet-existing out_dir does not abort the run
	    synth_dir.mkdir(parents=True, exist_ok=True)
	    print(hparams_debug_string())

	    # Check for GPU
	    if torch.cuda.is_available():
	        device = torch.device("cuda")
	        if hparams.synthesis_batch_size % torch.cuda.device_count() != 0:
	            raise ValueError("`hparams.synthesis_batch_size` must be evenly divisible by n_gpus!")
	    else:
	        device = torch.device("cpu")
	    print("Synthesizer using device:", device)

	    # Instantiate Tacotron model
	    model = Tacotron(embed_dims=hparams.tts_embed_dims,
	                     num_chars=len(symbols),
	                     encoder_dims=hparams.tts_encoder_dims,
	                     decoder_dims=hparams.tts_decoder_dims,
	                     n_mels=hparams.num_mels,
	                     fft_bins=hparams.num_mels,
	                     postnet_dims=hparams.tts_postnet_dims,
	                     encoder_K=hparams.tts_encoder_K,
	                     lstm_dims=hparams.tts_lstm_dims,
	                     postnet_K=hparams.tts_postnet_K,
	                     num_highways=hparams.tts_num_highways,
	                     dropout=0.,  # Use zero dropout for gta mels
	                     stop_threshold=hparams.tts_stop_threshold,
	                     speaker_embedding_size=hparams.speaker_embedding_size).to(device)

	    # Load the weights
	    model_dir = Path(model_dir)
	    model_fpath = model_dir.joinpath(model_dir.stem).with_suffix(".pt")
	    print("\nLoading weights at %s" % model_fpath)
	    model.load(model_fpath)
	    print("Tacotron weights loaded from step %d" % model.step)

	    # Synthesize using same reduction factor as the model is currently trained
	    r = np.int32(model.r)

	    # Set model to eval mode. This only switches train-time layers to their
	    # inference behaviour; gradient tracking is disabled separately with
	    # torch.no_grad() around the forward pass below.
	    model.eval()

	    # Initialize the dataset
	    in_dir = Path(in_dir)
	    metadata_fpath = in_dir.joinpath("train.txt")
	    mel_dir = in_dir.joinpath("mels")
	    embed_dir = in_dir.joinpath("embeds")

	    dataset = SynthesizerDataset(metadata_fpath, mel_dir, embed_dir, hparams)
	    data_loader = DataLoader(dataset,
	                             collate_fn=lambda batch: collate_synthesizer(batch, r, hparams),
	                             batch_size=hparams.synthesis_batch_size,
	                             num_workers=2 if platform.system() != "Windows" else 0,
	                             shuffle=False,
	                             pin_memory=True)

	    # Generate GTA mels
	    meta_out_fpath = out_dir.joinpath("synthesized.txt")
	    with open(meta_out_fpath, "w") as file:
	        for i, (texts, mels, embeds, idx) in tqdm(enumerate(data_loader), total=len(data_loader)):
	            texts = texts.to(device)
	            mels = mels.to(device)
	            embeds = embeds.to(device)

	            # Inference only: skip building the autograd graph for each batch
	            with torch.no_grad():
	                # Parallelize model onto GPUS using workaround due to python bug
	                if device.type == "cuda" and torch.cuda.device_count() > 1:
	                    # NOTE(review): data_parallel_workaround is not imported in the
	                    # visible part of this file, so this branch would raise
	                    # NameError. Presumably it lives in synthesizer.utils - confirm
	                    # and add the import.
	                    _, mels_out, _ = data_parallel_workaround(model, texts, mels, embeds)
	                else:
	                    _, mels_out, _, _ = model(texts, mels, embeds)

	            for j, k in enumerate(idx):
	                # Note: outputs mel-spectrogram files and target ones have same names, just different folders
	                mel_filename = synth_dir.joinpath(dataset.metadata[k][1])
	                mel_out = mels_out[j].detach().cpu().numpy().T

	                # Use the length of the ground truth mel to remove padding from the generated mels
	                mel_out = mel_out[:int(dataset.metadata[k][4])]

	                # Write the spectrogram to disk
	                np.save(mel_filename, mel_out, allow_pickle=False)

	                # Write metadata into the synthesized file, using the plain "|"
	                # field separator (the previous "\|" literal emitted a stray
	                # backslash before every pipe).
	                # NOTE(review): no newline is appended; assumes the last metadata
	                # field still carries the line terminator from train.txt - confirm.
	                file.write("|".join(dataset.metadata[k]))