Duplicate from neuphonic/neutts-air
Browse filesCo-authored-by: Johanna Ulin <[email protected]>
- .gitattributes +44 -0
- README.md +156 -0
- config.json +28 -0
- generation_config.json +14 -0
- model.safetensors +3 -0
- neutss-air-BF16.gguf +3 -0
- neutts-air.png +0 -0
- special_tokens_map.json +31 -0
- tokenizer.json +3 -0
- tokenizer_config.json +3 -0
- vocab.json +0 -0
.gitattributes
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
tokenizer_config.json filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
new_tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
neutts-BF16.gguf filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
neutts-Q8-0.gguf filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
neutts-Q4_0.gguf filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
neutss-air-BF16.gguf filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
neutts-air-Q4-0.gguf filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
neutts-air-Q8-0.gguf filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
pipeline_tag: text-to-speech
|
| 4 |
+
tags:
|
| 5 |
+
- audio
|
| 6 |
+
- speech
|
| 7 |
+
- speech-language-models
|
| 8 |
+
datasets:
|
| 9 |
+
- amphion/Emilia-Dataset
|
| 10 |
+
- neuphonic/emilia-yodas-english-neucodec
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
# NeuTTS Air ☁️
|
| 14 |
+
|
| 15 |
+
[![NeuTTS Air](neutts-air.png)](https://www.youtube.com/watch?v=YAB3hCtu5wE)
|
| 16 |
+
|
| 17 |
+
[🚀 Spaces Demo](https://huggingface.co/spaces/neuphonic/neutts-air), [🔧 GitHub](https://github.com/neuphonic/neutts-air)
|
| 18 |
+
|
| 19 |
+
[Q8 GGUF version](https://huggingface.co/neuphonic/neutts-air-q8-gguf), [Q4 GGUF version](https://huggingface.co/neuphonic/neutts-air-q4-gguf)
|
| 20 |
+
|
| 21 |
+
*Created by [Neuphonic](http://neuphonic.com/) - building faster, smaller, on-device voice AI*
|
| 22 |
+
|
| 23 |
+
State-of-the-art Voice AI has been locked behind web APIs for too long. NeuTTS Air is the world’s first super-realistic, on-device, TTS speech language model with instant voice cloning. Built off a 0.5B LLM backbone, NeuTTS Air brings natural-sounding speech, real-time performance, built-in security and speaker cloning to your local device - unlocking a new category of embedded voice agents, assistants, toys, and compliance-safe apps.
|
| 24 |
+
|
| 25 |
+
## Key Features
|
| 26 |
+
|
| 27 |
+
- 🗣Best-in-class realism for its size - produces natural, ultra-realistic voices that sound human
|
| 28 |
+
- 📱Optimised for on-device deployment - provided in GGML format, ready to run on phones, laptops, or even Raspberry Pis
|
| 29 |
+
- 👫Instant voice cloning - create your own speaker with as little as 3 seconds of audio
|
| 30 |
+
- 🚄Simple LM + codec architecture built off a 0.5B backbone - the sweet spot between speed, size, and quality for real-world applications
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
> [!CAUTION]
|
| 34 |
+
> Websites like neutts.com are popping up and they're not affiliated with Neuphonic, our GitHub or this repo.
|
| 35 |
+
>
|
| 36 |
+
> We are on neuphonic.com only. Please be careful out there! 🙏
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
## Model Details
|
| 40 |
+
|
| 41 |
+
NeuTTS Air is built off Qwen 0.5B - a lightweight yet capable language model optimised for text understanding and generation - as well as a powerful combination of technologies designed for efficiency and quality:
|
| 42 |
+
|
| 43 |
+
- **Audio Codec**: [NeuCodec](https://huggingface.co/neuphonic/neucodec) - our proprietary neural audio codec that achieves exceptional audio quality at low bitrates using a single codebook
|
| 44 |
+
- **Format**: Available in GGML format for efficient on-device inference
|
| 45 |
+
- **Responsibility**: Watermarked outputs
|
| 46 |
+
- **Inference Speed**: Real-time generation on mid-range devices
|
| 47 |
+
- **Power Consumption**: Optimised for mobile and embedded devices
|
| 48 |
+
|
| 49 |
+
## Get Started
|
| 50 |
+
|
| 51 |
+
1. **Clone the [Git Repo](https://github.com/neuphonic/neutts-air)**
|
| 52 |
+
|
| 53 |
+
```bash
|
| 54 |
+
git clone https://github.com/neuphonic/neutts-air.git
|
| 55 |
+
cd neutts-air
|
| 56 |
+
```
|
| 57 |
+
|
| 58 |
+
2. **Install `espeak` (required dependency)**
|
| 59 |
+
|
| 60 |
+
Please refer to the following link for instructions on how to install `espeak`:
|
| 61 |
+
|
| 62 |
+
https://github.com/espeak-ng/espeak-ng/blob/master/docs/guide.md
|
| 63 |
+
|
| 64 |
+
```bash
|
| 65 |
+
# Mac OS
|
| 66 |
+
brew install espeak
|
| 67 |
+
|
| 68 |
+
# Ubuntu/Debian
|
| 69 |
+
sudo apt install espeak
|
| 70 |
+
|
| 71 |
+
# Arch Linux
|
| 72 |
+
paru -S aur/espeak
|
| 73 |
+
```
|
| 74 |
+
|
| 75 |
+
3. **Install Python dependencies**
|
| 76 |
+
|
| 77 |
+
The requirements file includes the dependencies needed to run the model with PyTorch. When using an ONNX decoder or a GGML model, some dependencies (such as PyTorch) are no longer required.
|
| 78 |
+
|
| 79 |
+
The inference is compatible and tested on `python>=3.11`.
|
| 80 |
+
|
| 81 |
+
```
|
| 82 |
+
pip install -r requirements.txt
|
| 83 |
+
|
| 84 |
+
```
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
## **Basic Example**
|
| 88 |
+
|
| 89 |
+
Run the basic example script to synthesize speech:
|
| 90 |
+
|
| 91 |
+
```bash
|
| 92 |
+
python -m examples.basic_example \
|
| 93 |
+
--input_text "My name is Dave, and um, I'm from London" \
|
| 94 |
+
--ref_audio samples/dave.wav \
|
| 95 |
+
--ref_text samples/dave.txt
|
| 96 |
+
|
| 97 |
+
```
|
| 98 |
+
|
| 99 |
+
To specify a particular model repo for the backbone or codec, add the `--backbone` argument. Available backbones are listed in the [NeuTTS-Air Hugging Face collection](https://huggingface.co/collections/neuphonic/neutts-air-68cc14b7033b4c56197ef350).
|
| 100 |
+
|
| 101 |
+
Several examples are available, including a Jupyter notebook in the `examples` folder.
|
| 102 |
+
|
| 103 |
+
### **Simple One-Code Block Usage**
|
| 104 |
+
|
| 105 |
+
```python
|
| 106 |
+
from neuttsair.neutts import NeuTTSAir
|
| 107 |
+
import soundfile as sf
|
| 108 |
+
|
| 109 |
+
tts = NeuTTSAir( backbone_repo="neuphonic/neutts-air-q4-gguf", backbone_device="cpu", codec_repo="neuphonic/neucodec", codec_device="cpu")
|
| 110 |
+
input_text = "My name is Dave, and um, I'm from London."
|
| 111 |
+
|
| 112 |
+
ref_text = "samples/dave.txt"
|
| 113 |
+
ref_audio_path = "samples/dave.wav"
|
| 114 |
+
|
| 115 |
+
ref_text = open(ref_text, "r").read().strip()
|
| 116 |
+
ref_codes = tts.encode_reference(ref_audio_path)
|
| 117 |
+
|
| 118 |
+
wav = tts.infer(input_text, ref_codes, ref_text)
|
| 119 |
+
sf.write("test.wav", wav, 24000)
|
| 120 |
+
|
| 121 |
+
```
|
| 122 |
+
|
| 123 |
+
# Tips
|
| 124 |
+
|
| 125 |
+
NeuTTS Air requires two inputs:
|
| 126 |
+
|
| 127 |
+
1. A reference audio sample (`.wav` file)
|
| 128 |
+
2. A text string
|
| 129 |
+
|
| 130 |
+
The model then synthesises the text as speech in the style of the reference audio. This is what enables NeuTTS Air’s instant voice cloning capability.
|
| 131 |
+
|
| 132 |
+
### Example Reference Files
|
| 133 |
+
|
| 134 |
+
You can find some ready-to-use samples in the `samples` folder:
|
| 135 |
+
|
| 136 |
+
- `samples/dave.wav`
|
| 137 |
+
- `samples/jo.wav`
|
| 138 |
+
|
| 139 |
+
### Guidelines for Best Results
|
| 140 |
+
|
| 141 |
+
For optimal performance, reference audio samples should be:
|
| 142 |
+
|
| 143 |
+
1. **Mono channel**
|
| 144 |
+
2. **16–44 kHz sample rate**
|
| 145 |
+
3. **3–15 seconds in length**
|
| 146 |
+
4. **Saved as a `.wav` file**
|
| 147 |
+
5. **Clean** — minimal to no background noise
|
| 148 |
+
6. **Natural, continuous speech** — like a monologue or conversation, with few pauses, so the model can capture tone effectively
|
| 149 |
+
|
| 150 |
+
# **Responsibility**
|
| 151 |
+
|
| 152 |
+
Every audio file generated by NeuTTS Air includes the **[Perth (Perceptual Threshold) Watermarker](https://github.com/resemble-ai/perth)**.
|
| 153 |
+
|
| 154 |
+
# **Disclaimer**
|
| 155 |
+
|
| 156 |
+
Don't use this model to do bad things… please.
|
config.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"Qwen2ForCausalLM"
|
| 4 |
+
],
|
| 5 |
+
"attention_dropout": 0.0,
|
| 6 |
+
"bos_token_id": 151643,
|
| 7 |
+
"eos_token_id": 151645,
|
| 8 |
+
"hidden_act": "silu",
|
| 9 |
+
"hidden_size": 896,
|
| 10 |
+
"initializer_range": 0.02,
|
| 11 |
+
"intermediate_size": 4864,
|
| 12 |
+
"max_position_embeddings": 32768,
|
| 13 |
+
"max_window_layers": 21,
|
| 14 |
+
"model_type": "qwen2",
|
| 15 |
+
"num_attention_heads": 14,
|
| 16 |
+
"num_hidden_layers": 24,
|
| 17 |
+
"num_key_value_heads": 2,
|
| 18 |
+
"rms_norm_eps": 1e-06,
|
| 19 |
+
"rope_scaling": null,
|
| 20 |
+
"rope_theta": 1000000.0,
|
| 21 |
+
"sliding_window": 32768,
|
| 22 |
+
"tie_word_embeddings": true,
|
| 23 |
+
"torch_dtype": "bfloat16",
|
| 24 |
+
"transformers_version": "4.50.3",
|
| 25 |
+
"use_cache": true,
|
| 26 |
+
"use_sliding_window": false,
|
| 27 |
+
"vocab_size": 217652
|
| 28 |
+
}
|
generation_config.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 151643,
|
| 3 |
+
"do_sample": true,
|
| 4 |
+
"eos_token_id": [
|
| 5 |
+
151645,
|
| 6 |
+
151643
|
| 7 |
+
],
|
| 8 |
+
"pad_token_id": 151643,
|
| 9 |
+
"repetition_penalty": 1.1,
|
| 10 |
+
"temperature": 0.7,
|
| 11 |
+
"top_k": 20,
|
| 12 |
+
"top_p": 0.8,
|
| 13 |
+
"transformers_version": "4.50.3"
|
| 14 |
+
}
|
model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:85c7db53fbe8d62be9bc29a0743661adcb0067552488f185b5f2eb2f1ee4179f
|
| 3 |
+
size 1495893752
|
neutss-air-BF16.gguf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3f6d562b881e64feb785a2b0a422eeadea326289fd5614990f9809ae37acd0d7
|
| 3 |
+
size 1503776000
|
neutts-air.png
ADDED
|
special_tokens_map.json
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"additional_special_tokens": [
|
| 3 |
+
"<|im_start|>",
|
| 4 |
+
"<|im_end|>",
|
| 5 |
+
"<|object_ref_start|>",
|
| 6 |
+
"<|object_ref_end|>",
|
| 7 |
+
"<|box_start|>",
|
| 8 |
+
"<|box_end|>",
|
| 9 |
+
"<|quad_start|>",
|
| 10 |
+
"<|quad_end|>",
|
| 11 |
+
"<|vision_start|>",
|
| 12 |
+
"<|vision_end|>",
|
| 13 |
+
"<|vision_pad|>",
|
| 14 |
+
"<|image_pad|>",
|
| 15 |
+
"<|video_pad|>"
|
| 16 |
+
],
|
| 17 |
+
"eos_token": {
|
| 18 |
+
"content": "<|im_end|>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
},
|
| 24 |
+
"pad_token": {
|
| 25 |
+
"content": "<|im_end|>",
|
| 26 |
+
"lstrip": false,
|
| 27 |
+
"normalized": false,
|
| 28 |
+
"rstrip": false,
|
| 29 |
+
"single_word": false
|
| 30 |
+
}
|
| 31 |
+
}
|
tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:74c466530bd698626a5b6a424d204711c58dfff0a6b3dd8b4dbac1e1e8c9aa87
|
| 3 |
+
size 24140239
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:364126212a294d794d83036954b0154b925c329411da93e68cdd1addeb4a5bea
|
| 3 |
+
size 12065831
|
vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|