Uploaded model
- Developed by: Cosmobillian
- License: apache-2.0
- Finetuned from model: unsloth/orpheus-3b-0.1-ft

This Llama-based model was trained 2x faster with Unsloth and Hugging Face's TRL library.
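For context, the sketch below shows what a typical Unsloth fine-tuning setup for this base model looks like. It is illustrative only: the `max_seq_length`, LoRA rank, and the use of 4-bit loading are assumptions, not the actual configuration used to train this checkpoint.

```python
# Illustrative sketch only — hyperparameters here are assumptions, not the
# actual training configuration of this model.
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/orpheus-3b-0.1-ft",
    max_seq_length=2048,
    load_in_4bit=True,
)
model = FastLanguageModel.get_peft_model(model, r=16, lora_alpha=16)
# Training then proceeds with TRL, e.g. trl.SFTTrainer(model=model, ...).
```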
model_name = "Cosmobillian/orpheust-tts-base-fine-tune"
model_name = "canopylabs/orpheus-3b-0.1-ft"
Restart the kernel if needed
```python
from snac import SNAC
import torch
import numpy as np
import soundfile as sf
import librosa
import torchaudio.transforms as T
import ipywidgets as widgets
import IPython.display as ipd
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from huggingface_hub import snapshot_download
from ipywebrtc import AudioRecorder
from IPython.display import Audio, display
```
device = "cuda" if torch.cuda.is_available() else "mps" #or cpu if you aren't on an M type mac print(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
Download only the model config and safetensors:

```python
# snapshot_download pre-populates the local Hugging Face cache, so the
# from_pretrained call below won't re-download these files.
model_path = snapshot_download(
    repo_id=model_name,
    allow_patterns=[
        "config.json",
        "*.safetensors",
        "model.safetensors.index.json",
    ],
    ignore_patterns=[
        "optimizer.pt",
        "pytorch_model.bin",
        "training_args.bin",
        "scheduler.pt",
        "tokenizer.json",
        "tokenizer_config.json",
        "special_tokens_map.json",
        "vocab.json",
        "merges.txt",
        "tokenizer.*",
    ],
)

model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
model.to(device)
```
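If bfloat16 is not supported on your backend (older GPUs, or some MPS setups), one option is to pick the dtype per device. This fallback is an assumption for convenience, not part of the original notebook:

```python
# Hypothetical fallback, not in the original notebook: bfloat16 on CUDA,
# float32 elsewhere (MPS bfloat16 support varies by PyTorch version).
dtype = torch.bfloat16 if device == "cuda" else torch.float32
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=dtype)
model.to(device)
```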
CHANGE THIS TO YOUR OWN FILE AND TEXT

```python
my_wav_file_is = "/content/drive/MyDrive/Colab Notebooks/short15s_sezen_aksu.wav"
and_the_transcript_is = "Ayşeciğin filmi var zeynep değirmencioğlunun hafızasını kaybediyor yolda birileri buluyorlar "

the_model_should_say = [
    "Hayat, her gün karşımıza yeni fırsatlar ve zorluklar çıkarır. Önemli olan, bu anları nasıl değerlendirdiğimizdir. Bazen küçük bir adım bile büyük değişimlerin başlangıcı olabilir. Her sabah yeni bir başlangıçtır; dünü geride bırakıp bugünü en iyi şekilde değerlendirmek elimizde. İnsan, hedeflerine ulaşmak için kararlılıkla ilerlemeli ve karşılaştığı engellerden yılmadan yoluna devam etmelidir."
]
```

```python
#@title Tokenising your stuff for the prompt
'''
Here we tokenise the reference prompt you provided, along with the texts you
want the model to say.

The template is:
start_of_human, start_of_text, text, end_of_text, start_of_ai, start_of_speech,
speech, end_of_speech, end_of_ai, start_of_human, text, end_of_human,
and then generation continues from there.
'''
```
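For reference, the special token IDs used by the tensors below line up with the template roles like this. The role names are inferred from the template above and the order in which the IDs are concatenated; this mapping is not stated explicitly in the original notebook:

```python
# Role names inferred from the template; IDs taken from the tensors built below.
SPECIAL_TOKENS = {
    "start_of_human": 128259,
    "end_of_text": 128009,
    "end_of_human": 128260,
    "start_of_ai": 128261,
    "start_of_speech": 128257,
    "end_of_speech": 128258,
    "end_of_ai": 128262,
    "pad": 128263,
    "audio_code_base": 128266,  # each of the 7 interleaved SNAC slots sits in its own 4096-wide band above this
}
```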
```python
filename = my_wav_file_is
audio_array, sample_rate = librosa.load(filename, sr=24000)  # resample to SNAC's 24 kHz
```
```python
def tokenise_audio(waveform):
    waveform = torch.from_numpy(waveform).unsqueeze(0)
    waveform = waveform.to(dtype=torch.float32)
    waveform = waveform.unsqueeze(0)  # shape (1, 1, T), as SNAC expects

    with torch.inference_mode():
        codes = snac_model.encode(waveform)

    # Interleave the three SNAC codebooks into a flat 7-token-per-frame stream,
    # offsetting each slot into its own 4096-wide band above 128266.
    all_codes = []
    for i in range(codes[0].shape[1]):
        all_codes.append(codes[0][0][i].item() + 128266)
        all_codes.append(codes[1][0][2 * i].item() + 128266 + 4096)
        all_codes.append(codes[2][0][4 * i].item() + 128266 + (2 * 4096))
        all_codes.append(codes[2][0][4 * i + 1].item() + 128266 + (3 * 4096))
        all_codes.append(codes[1][0][2 * i + 1].item() + 128266 + (4 * 4096))
        all_codes.append(codes[2][0][4 * i + 2].item() + 128266 + (5 * 4096))
        all_codes.append(codes[2][0][4 * i + 3].item() + 128266 + (6 * 4096))
    return all_codes
```
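The interleave above relies on SNAC's hierarchical structure: per frame, codebook 0 yields one code, codebook 1 two, and codebook 2 four, hence seven tokens per frame. A quick shape check (an illustrative addition, not part of the original notebook) makes this visible:

```python
# Illustrative shape check: SNAC returns three codebooks at 1x, 2x and 4x
# frame rates, so each frame contributes 1 + 2 + 4 = 7 tokens.
with torch.inference_mode():
    demo_codes = snac_model.encode(
        torch.from_numpy(audio_array).float().unsqueeze(0).unsqueeze(0)
    )
print([c.shape for c in demo_codes])  # e.g. [(1, T), (1, 2T), (1, 4T)]
```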
```python
myts = tokenise_audio(audio_array)

start_tokens = torch.tensor([[128259]], dtype=torch.int64)
end_tokens = torch.tensor([[128009, 128260, 128261, 128257]], dtype=torch.int64)
final_tokens = torch.tensor([[128258, 128262]], dtype=torch.int64)

voice_prompt = and_the_transcript_is
prompt_tokked = tokenizer(voice_prompt, return_tensors="pt")
input_ids = prompt_tokked["input_ids"]

# SOH, transcript text, EOT/EOH/SOAI/SOS, reference speech codes, EOS/EOAI
zeroprompt_input_ids = torch.cat(
    [start_tokens, input_ids, end_tokens, torch.tensor([myts]), final_tokens], dim=1
)
```
```python
prompts = the_model_should_say

all_modified_input_ids = []
for prompt in prompts:
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    # Zero-shot voice prompt followed by the new text to be spoken
    second_input_ids = torch.cat([zeroprompt_input_ids, start_tokens, input_ids, end_tokens], dim=1)
    all_modified_input_ids.append(second_input_ids)

# Left-pad every sequence to the longest one and build matching attention masks
all_padded_tensors = []
all_attention_masks = []
max_length = max(modified_input_ids.shape[1] for modified_input_ids in all_modified_input_ids)

for modified_input_ids in all_modified_input_ids:
    padding = max_length - modified_input_ids.shape[1]
    padded_tensor = torch.cat(
        [torch.full((1, padding), 128263, dtype=torch.int64), modified_input_ids], dim=1
    )
    attention_mask = torch.cat(
        [torch.zeros((1, padding), dtype=torch.int64),
         torch.ones((1, modified_input_ids.shape[1]), dtype=torch.int64)], dim=1
    )
    all_padded_tensors.append(padded_tensor)
    all_attention_masks.append(attention_mask)

all_padded_tensors = torch.cat(all_padded_tensors, dim=0)
all_attention_masks = torch.cat(all_attention_masks, dim=0)

input_ids = all_padded_tensors.to(device)
attention_mask = all_attention_masks.to(device)
```
```python
#@title Run Inference
with torch.no_grad():
    generated_ids = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,  # needed so the left-padding is ignored
        max_new_tokens=1500,
        do_sample=True,
        temperature=0.5,
        # top_k=40,
        top_p=0.9,
        repetition_penalty=1.1,
        num_return_sequences=1,
        eos_token_id=128258,
    )

# Append end_of_ai so the output matches the training template
generated_ids = torch.cat([generated_ids, torch.tensor([[128262]]).to(device)], dim=1)
```
```python
#@title Convert output to speech
token_to_find = 128257    # start_of_speech
token_to_remove = 128258  # end_of_speech

# Check whether start_of_speech exists in the tensor; if so, crop everything
# up to and including its last occurrence.
token_indices = (generated_ids == token_to_find).nonzero(as_tuple=True)

if len(token_indices[1]) > 0:
    last_occurrence_idx = token_indices[1][-1].item()
    cropped_tensor = generated_ids[:, last_occurrence_idx + 1:]
else:
    cropped_tensor = generated_ids

processed_rows = []
for row in cropped_tensor:
    # Drop any end_of_speech tokens from each row
    masked_row = row[row != token_to_remove]
    processed_rows.append(masked_row)

code_lists = []
for row in processed_rows:
    # row is a 1D tensor with its own length
    row_length = row.size(0)
    new_length = (row_length // 7) * 7  # largest multiple of 7 that fits in this row
    trimmed_row = row[:new_length]
    trimmed_row = [t - 128266 for t in trimmed_row]
    code_lists.append(trimmed_row)
```
```python
def redistribute_codes(code_list):
    # Invert the interleave from tokenise_audio: split the flat 7-token frames
    # back into SNAC's three codebooks, removing each slot's 4096-wide offset.
    layer_1 = []
    layer_2 = []
    layer_3 = []
    for i in range((len(code_list) + 1) // 7):
        layer_1.append(code_list[7 * i])
        layer_2.append(code_list[7 * i + 1] - 4096)
        layer_3.append(code_list[7 * i + 2] - (2 * 4096))
        layer_3.append(code_list[7 * i + 3] - (3 * 4096))
        layer_2.append(code_list[7 * i + 4] - (4 * 4096))
        layer_3.append(code_list[7 * i + 5] - (5 * 4096))
        layer_3.append(code_list[7 * i + 6] - (6 * 4096))
    codes = [
        torch.tensor(layer_1).unsqueeze(0),
        torch.tensor(layer_2).unsqueeze(0),
        torch.tensor(layer_3).unsqueeze(0),
    ]
    audio_hat = snac_model.decode(codes)
    return audio_hat
```
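Since redistribute_codes is the exact inverse of tokenise_audio (both use the same per-slot 4096 offsets on top of 128266), you can sanity-check the whole codec path, before involving the language model at all, by round-tripping the reference clip. This check is an addition, not part of the original notebook:

```python
# Optional round-trip check (not in the original notebook): decoding the
# reference clip's own codes should closely reproduce the input audio.
roundtrip = redistribute_codes([t - 128266 for t in myts])
display(Audio(roundtrip.detach().squeeze().cpu().numpy(), rate=24000))
```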
```python
my_samples = []
for code_list in code_lists:
    samples = redistribute_codes(code_list)
    my_samples.append(samples)
```
```python
# Run this if soundfile isn't installed:
!pip install soundfile

import soundfile as sf
from IPython.display import Audio, display
from google.colab import files

for idx, samples in enumerate(my_samples):
    # Convert the tensor to a NumPy array
    audio = samples.detach().squeeze().cpu().numpy()
    filename = f'audio_{idx}.wav'

    # Save it as a WAV file
    sf.write(filename, audio, 24000)

    # Show an audio player
    display(Audio(audio, rate=24000))

    # Trigger the browser download
    files.download(filename)
```
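If you are running outside Colab, `google.colab` is unavailable. A minimal local variant (an assumption for convenience, not part of the original notebook) simply writes the files to the working directory:

```python
# Local-only variant (assumption, not in the original notebook): skip the
# Colab download widget and just save the WAV files to disk.
import soundfile as sf

for idx, samples in enumerate(my_samples):
    audio = samples.detach().squeeze().cpu().numpy()
    sf.write(f"audio_{idx}.wav", audio, 24000)
    print(f"wrote audio_{idx}.wav")
```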