Sync from GitHub repo

This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions to the Space there.
- README_REPO.md +7 -1
- app.py +1 -2
- inference-cli.py +35 -21
- model/utils.py +6 -7
- requirements.txt +2 -8
- requirements_eval.txt +5 -0
README_REPO.md
CHANGED

@@ -62,7 +62,7 @@ An initial guidance on Finetuning [#57](https://github.com/SWivid/F5-TTS/discuss
 
 ## Inference
 
-
+The pretrained model checkpoints can be reached at [🤗 Hugging Face](https://huggingface.co/SWivid/F5-TTS) and [⭐ Model Scope](https://www.modelscope.cn/models/SWivid/F5-TTS_Emilia-ZH-EN), or automatically downloaded with `inference-cli` and `gradio_app`.
 
 Currently support 30s for a single generation, which is the **TOTAL** length of prompt audio and the generated. Batch inference with chunks is supported by `inference-cli` and `gradio_app`.
 
 - To avoid possible inference failures, make sure you have seen through the following instructions.

@@ -148,6 +148,12 @@ bash scripts/eval_infer_batch.sh
 
 ### Objective Evaluation
 
+Install packages for evaluation:
+
+```bash
+pip install -r requirements_eval.txt
+```
+
 **Some Notes**
 
 For faster-whisper with CUDA 11:
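The "automatically downloaded" path mentioned in the README hunk above is implemented in this same commit, in the `load_model` change to `inference-cli.py` below: the script first looks for a local checkpoint and only then pulls the `.safetensors` file from the Hub via `cached_path`. A minimal sketch of that resolution logic follows; the function wrapper and the idea of passing `exp_name`/`ckpt_step` as arguments are illustrative, only the path pattern and the `hf://SWivid/...` URL come from the diff.

```python
# Minimal sketch of the checkpoint auto-download used by inference-cli / gradio_app.
# The wrapper function is illustrative; the paths mirror the load_model change below.
from pathlib import Path

from cached_path import cached_path  # resolves hf:// URLs and caches the file locally


def resolve_checkpoint(repo_name, exp_name, ckpt_step):
    ckpt_path = f"ckpts/{exp_name}/model_{ckpt_step}.pt"  # prefer a local .pt / .safetensors
    if not Path(ckpt_path).exists():
        # first call downloads from the SWivid repo on the Hugging Face Hub,
        # subsequent calls hit the local cache
        ckpt_path = str(cached_path(f"hf://SWivid/{repo_name}/{exp_name}/model_{ckpt_step}.safetensors"))
    return ckpt_path
```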
app.py
CHANGED

@@ -1,4 +1,3 @@
-import os
 import re
 import torch
 import torchaudio

@@ -17,7 +16,6 @@ from model.utils import (
     save_spectrogram,
 )
 from transformers import pipeline
-import librosa
 import click
 import soundfile as sf
 

@@ -429,6 +427,7 @@ with gr.Blocks() as app_credits:
 
 * [mrfakename](https://github.com/fakerybakery) for the original [online demo](https://huggingface.co/spaces/mrfakename/E2-F5-TTS)
 * [RootingInLoad](https://github.com/RootingInLoad) for the podcast generation
+* [jpgallegoar](https://github.com/jpgallegoar) for multiple speech-type generation
 """)
 with gr.Blocks() as app_tts:
     gr.Markdown("# Batched TTS")
inference-cli.py
CHANGED

@@ -1,26 +1,24 @@
+import argparse
+import codecs
 import re
+import tempfile
+from pathlib import Path
+
+import numpy as np
+import soundfile as sf
+import tomli
 import torch
 import torchaudio
-import
-import
+import tqdm
+from cached_path import cached_path
 from einops import rearrange
-from vocos import Vocos
 from pydub import AudioSegment, silence
-from model import CFM, UNetT, DiT, MMDiT
-from cached_path import cached_path
-from model.utils import (
-    load_checkpoint,
-    get_tokenizer,
-    convert_char_to_pinyin,
-    save_spectrogram,
-)
 from transformers import pipeline
-
-
-import
-import
-
-import codecs
+from vocos import Vocos
+
+from model import CFM, DiT, MMDiT, UNetT
+from model.utils import (convert_char_to_pinyin, get_tokenizer,
+                         load_checkpoint, save_spectrogram)
 
 parser = argparse.ArgumentParser(
     prog="python3 inference-cli.py",

@@ -73,6 +71,11 @@ parser.add_argument(
     "--remove_silence",
     help="Remove silence.",
 )
+parser.add_argument(
+    "--load_vocoder_from_local",
+    action="store_true",
+    help="load vocoder from local. Default: ../checkpoints/charactr/vocos-mel-24khz",
+)
 args = parser.parse_args()
 
 config = tomli.load(open(args.config, "rb"))

@@ -88,6 +91,7 @@ model = args.model if args.model else config["model"]
 remove_silence = args.remove_silence if args.remove_silence else config["remove_silence"]
 wave_path = Path(output_dir)/"out.wav"
 spectrogram_path = Path(output_dir)/"out.png"
+vocos_local_path = "../checkpoints/charactr/vocos-mel-24khz"
 
 SPLIT_WORDS = [
     "but", "however", "nevertheless", "yet", "still",

@@ -105,7 +109,16 @@ device = (
     if torch.cuda.is_available()
     else "mps" if torch.backends.mps.is_available() else "cpu"
 )
-
+
+if args.load_vocoder_from_local:
+    print(f"Load vocos from local path {vocos_local_path}")
+    vocos = Vocos.from_hparams(f"{vocos_local_path}/config.yaml")
+    state_dict = torch.load(f"{vocos_local_path}/pytorch_model.bin", map_location=device)
+    vocos.load_state_dict(state_dict)
+    vocos.eval()
+else:
+    print("Donwload Vocos from huggingface charactr/vocos-mel-24khz")
+    vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")
 
 print(f"Using {device} device")
 

@@ -124,8 +137,9 @@ speed = 1.0
 fix_duration = None
 
 def load_model(repo_name, exp_name, model_cls, model_cfg, ckpt_step):
-    ckpt_path =
-
+    ckpt_path = f"ckpts/{exp_name}/model_{ckpt_step}.pt"  # .pt | .safetensors
+    if not Path(ckpt_path).exists():
+        ckpt_path = str(cached_path(f"hf://SWivid/{repo_name}/{exp_name}/model_{ckpt_step}.safetensors"))
     vocab_char_map, vocab_size = get_tokenizer("Emilia_ZH_EN", "pinyin")
     model = CFM(
         transformer=model_cls(

@@ -385,4 +399,4 @@ def infer(ref_audio_orig, ref_text, gen_text, model, remove_silence, custom_spli
     return infer_batch((audio, sr), ref_text, gen_text_batches, model, remove_silence)
 
 
-infer(ref_audio, ref_text, gen_text, model, remove_silence, ",".join(SPLIT_WORDS))
+infer(ref_audio, ref_text, gen_text, model, remove_silence, ",".join(SPLIT_WORDS))
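The new `--load_vocoder_from_local` flag expects `config.yaml` and `pytorch_model.bin` under `../checkpoints/charactr/vocos-mel-24khz`; without the flag the script keeps the previous behaviour and calls `Vocos.from_pretrained("charactr/vocos-mel-24khz")` at startup. One way to populate that directory ahead of time for offline use is a one-off `snapshot_download`; the sketch below assumes `huggingface_hub` is available in the environment (it is not pinned in `requirements.txt`).

```python
# Hypothetical one-off script to pre-fetch the Vocos vocoder for --load_vocoder_from_local.
# Assumes huggingface_hub is installed; the target path mirrors vocos_local_path above.
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="charactr/vocos-mel-24khz",
    local_dir="../checkpoints/charactr/vocos-mel-24khz",
    allow_patterns=["config.yaml", "pytorch_model.bin"],  # the two files inference-cli.py loads
)
```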
model/utils.py
CHANGED

@@ -22,12 +22,6 @@ from einops import rearrange, reduce
 
 import jieba
 from pypinyin import lazy_pinyin, Style
-import zhconv
-from zhon.hanzi import punctuation
-from jiwer import compute_measures
-
-from funasr import AutoModel
-from faster_whisper import WhisperModel
 
 from model.ecapa_tdnn import ECAPA_TDNN_SMALL
 from model.modules import MelSpec

@@ -432,6 +426,7 @@ def get_librispeech_test(metalst, gen_wav_dir, gpus, librispeech_test_clean_path
 
 def load_asr_model(lang, ckpt_dir = ""):
     if lang == "zh":
+        from funasr import AutoModel
         model = AutoModel(
             model = os.path.join(ckpt_dir, "paraformer-zh"),
             # vad_model = os.path.join(ckpt_dir, "fsmn-vad"),

@@ -440,6 +435,7 @@ def load_asr_model(lang, ckpt_dir = ""):
             disable_update=True,
         )  # following seed-tts setting
     elif lang == "en":
+        from faster_whisper import WhisperModel
         model_size = "large-v3" if ckpt_dir == "" else ckpt_dir
         model = WhisperModel(model_size, device="cuda", compute_type="float16")
     return model

@@ -451,6 +447,7 @@ def run_asr_wer(args):
     rank, lang, test_set, ckpt_dir = args
 
     if lang == "zh":
+        import zhconv
         torch.cuda.set_device(rank)
     elif lang == "en":
         os.environ["CUDA_VISIBLE_DEVICES"] = str(rank)

@@ -458,10 +455,12 @@ def run_asr_wer(args):
         raise NotImplementedError("lang support only 'zh' (funasr paraformer-zh), 'en' (faster-whisper-large-v3), for now.")
 
     asr_model = load_asr_model(lang, ckpt_dir = ckpt_dir)
-
+
+    from zhon.hanzi import punctuation
     punctuation_all = punctuation + string.punctuation
     wers = []
 
+    from jiwer import compute_measures
     for gen_wav, prompt_wav, truth in tqdm(test_set):
         if lang == "zh":
             res = asr_model.generate(input=gen_wav, batch_size_s=300, disable_pbar=True)
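The pattern here is to defer evaluation-only imports (`funasr`, `faster_whisper`, `zhconv`, `zhon`, `jiwer`) from module level into the functions that use them, so importing `model.utils` for inference no longer requires the packages that were moved to `requirements_eval.txt`. Reduced to its core, the idea looks like the trimmed sketch below (a simplification, not the full functions from the diff).

```python
# Sketch of the deferred-import pattern applied in this change: heavy, eval-only
# dependencies are imported inside the branch that needs them, so an environment
# installed from requirements.txt alone can still import this module for inference.
def load_asr_model(lang, ckpt_dir=""):
    if lang == "zh":
        from funasr import AutoModel              # only needed on the zh ASR path
        return AutoModel(model=f"{ckpt_dir}/paraformer-zh", disable_update=True)
    elif lang == "en":
        from faster_whisper import WhisperModel   # only needed on the en ASR path
        return WhisperModel("large-v3", device="cuda", compute_type="float16")
```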
requirements.txt
CHANGED

@@ -5,25 +5,19 @@ datasets
 einops>=0.8.0
 einx>=0.3.0
 ema_pytorch>=0.5.2
-faster_whisper
-funasr
 gradio
 jieba
-jiwer
 librosa
 matplotlib
-numpy
+numpy<=1.26.4
 pydub
 pypinyin
 safetensors
 soundfile
-
-# torchaudio>=2.3.0
+tomli
 torchdiffeq
 tqdm>=4.65.0
 transformers
 vocos
 wandb
 x_transformers>=1.31.14
-zhconv
-zhon
requirements_eval.txt
ADDED

@@ -0,0 +1,5 @@
+faster_whisper
+funasr
+jiwer
+zhconv
+zhon
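`tomli` joins the base requirements above because `inference-cli.py` reads its defaults from a TOML config file (`config = tomli.load(open(args.config, "rb"))` in the unchanged context lines, with keys such as `model` and `remove_silence`). A minimal sketch of that pattern; the file name here is illustrative, not taken from this diff.

```python
# Minimal sketch of the TOML config loading that makes tomli a base requirement.
# "inference-cli.toml" is an illustrative file name, not taken from this diff.
import tomli

with open("inference-cli.toml", "rb") as f:  # tomli expects a binary file handle
    config = tomli.load(f)

print(config.get("model"), config.get("remove_silence"))
```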