Sync from GitHub repo
This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions there.
README_REPO.md CHANGED

```diff
@@ -112,7 +112,7 @@ docker container run --rm -it --gpus=all --mount 'type=volume,source=f5-tts,targ
 Deployment solution with Triton and TensorRT-LLM.
 
 #### Benchmark Results
-Decoding on a single L20 GPU, using 26 different prompt_audio & target_text pairs.
+Decoding on a single L20 GPU, using 26 different prompt_audio & target_text pairs, 16 NFE.
 
 | Model | Concurrency | Avg Latency | RTF | Mode |
 |---------------------|----------------|-------------|--------|-----------------|
```
pyproject.toml CHANGED

```diff
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "f5-tts"
-version = "1.1.1"
+version = "1.1.2"
 description = "F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching"
 readme = "README.md"
 license = {text = "MIT License"}
```
src/f5_tts/runtime/triton_trtllm/README.md CHANGED

````diff
@@ -57,7 +57,7 @@ benchmark.py --output-dir $log_dir \
 ```
 
 ### Benchmark Results
-Decoding on a single L20 GPU, using 26 different prompt_audio & target_text pairs.
+Decoding on a single L20 GPU, using 26 different prompt_audio & target_text pairs, 16 NFE.
 
 | Model | Concurrency | Avg Latency | RTF | Mode |
 |---------------------|----------------|-------------|--------|-----------------|
````
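For context on the benchmark line and table columns: NFE is the number of function evaluations, i.e. flow-matching ODE steps per utterance, so fixing 16 NFE pins the compute budget behind the latency figures; RTF (real-time factor) is processing time divided by the duration of the audio produced, with RTF < 1 meaning faster-than-real-time synthesis. A minimal sketch of the RTF metric (the helper name is ours, not from the repo's benchmark script):

```python
def realtime_factor(processing_seconds: float, audio_seconds: float) -> float:
    """Real-time factor: wall-clock processing time over generated audio duration.

    RTF < 1 means the system synthesizes speech faster than real time;
    e.g. producing 10 s of audio in 0.5 s gives an RTF of 0.05.
    """
    return processing_seconds / audio_seconds

print(realtime_factor(0.5, 10.0))  # -> 0.05
```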
src/f5_tts/runtime/triton_trtllm/benchmark.py CHANGED

```diff
@@ -168,7 +168,9 @@ def data_collator(batch, vocab_char_map, device="cuda", use_perf=False):
         ref_mel_list.append(ref_mel)
         ref_mel_len_list.append(ref_mel_len)
 
-        estimated_reference_target_mel_len.append(int(ref_mel.shape[0] * (1 + len(target_text) / len(prompt_text))))
+        estimated_reference_target_mel_len.append(
+            int(ref_mel.shape[0] * (1 + len(target_text.encode("utf-8")) / len(prompt_text.encode("utf-8"))))
+        )
 
     max_seq_len = max(estimated_reference_target_mel_len)
     ref_mel_batch = padded_mel_batch(ref_mel_list, max_seq_len)
```
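Both this collator and the Triton model below use the same heuristic: estimate how many mel frames the concatenated reference-plus-generated speech will need by scaling the reference mel length by the target-to-prompt text-length ratio. The patch switches that ratio from Python character counts to UTF-8 byte lengths, which weights multi-byte scripts such as CJK more heavily and so tracks spoken duration better when prompt and target scripts differ. A standalone sketch of the updated heuristic (the function name is ours):

```python
def estimate_total_mel_len(ref_mel_len: int, prompt_text: str, target_text: str) -> int:
    """Estimate mel frames for reference + generated speech.

    Scales the reference mel length by the UTF-8 byte-length ratio of
    target to prompt text, so multi-byte characters (e.g. CJK) count
    for more than they would under plain len() on the str.
    """
    ratio = len(target_text.encode("utf-8")) / len(prompt_text.encode("utf-8"))
    return int(ref_mel_len * (1 + ratio))

# A 250-frame reference whose target text is twice the prompt's byte
# length yields 250 * (1 + 2) = 750 frames.
print(estimate_total_mel_len(250, "hello world!", "hello world!" * 2))  # -> 750
```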
src/f5_tts/runtime/triton_trtllm/model_repo_f5_tts/f5_tts/1/model.py CHANGED

```diff
@@ -219,7 +219,9 @@ class TritonPythonModel:
 
             reference_mel_len.append(mel_features.shape[1])
             estimated_reference_target_mel_len.append(
-                int(mel_features.shape[1] * (1 + len(target_text) / len(reference_text)))
+                int(
+                    mel_features.shape[1] * (1 + len(target_text.encode("utf-8")) / len(reference_text.encode("utf-8")))
+                )
             )
 
         max_seq_len = min(max(estimated_reference_target_mel_len), self.max_mel_len)
```
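One difference from the offline benchmark is visible in the last context line: the server clamps the padded batch length to `self.max_mel_len`, so a single request with a very long target text cannot inflate the padded batch and exhaust GPU memory. A minimal sketch of that guard, with an illustrative cap value (the real cap comes from the model's configuration):

```python
MAX_MEL_LEN = 3000  # illustrative cap; the repo reads self.max_mel_len from config

def batch_seq_len(estimated_lens: list[int], cap: int = MAX_MEL_LEN) -> int:
    """Pad the batch to the longest per-item estimate, clamped to the cap."""
    return min(max(estimated_lens), cap)

print(batch_seq_len([750, 1200, 910]))   # -> 1200
print(batch_seq_len([750, 9001, 910]))   # -> 3000 (clamped)
```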