import gradio as gr
import time
from huggingface_hub import hf_hub_download
import numpy as np
import sphn
import torch
import spaces
from moshi.models import loaders
import torch._dynamo

torch._dynamo.config.suppress_errors = True
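
# Run on the GPU when one is available, otherwise fall back to CPU.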
| device = "cuda" if torch.cuda.device_count() else "cpu" | |
| num_codebooks = 32 | |
| print("loading mimi") | |
| model_file = hf_hub_download(loaders.DEFAULT_REPO, "tokenizer-e351c8d8-checkpoint125.safetensors") | |
| mimi = loaders.get_mimi(model_file, device, num_codebooks=num_codebooks) | |
| mimi.eval() | |
| print("mimi loaded") | |
def mimi_streaming_test(input_wave, max_duration_sec=10.0):
    pcm_chunk_size = int(mimi.sample_rate / mimi.frame_rate)
    # wget https://github.com/metavoiceio/metavoice-src/raw/main/assets/bria.mp3
    sample_pcm, sample_sr = sphn.read(input_wave)  # ("bria.mp3")
    sample_rate = mimi.sample_rate
    print("loaded pcm", sample_pcm.shape, sample_sr)
    # Resample the input to Mimi's sample rate.
    sample_pcm = sphn.resample(
        sample_pcm, src_sample_rate=sample_sr, dst_sample_rate=sample_rate
    )
    sample_pcm = torch.tensor(sample_pcm, device=device)
    # Truncate to at most max_duration_sec seconds of audio.
    max_duration_len = int(sample_rate * max_duration_sec)
    if sample_pcm.shape[-1] > max_duration_len:
        sample_pcm = sample_pcm[..., :max_duration_len]
    print("resampled pcm", sample_pcm.shape, sample_rate)
    # Add a batch dimension: Mimi expects (batch, channels, samples).
    sample_pcm = sample_pcm[None].to(device=device)
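    # Encode once with all 32 codebooks; the lower-bitrate variants below are
    # obtained by slicing the code tensor along its codebook dimension.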
| print("streaming encoding...") | |
| with torch.no_grad(): | |
| all_codes_th = mimi.encode(sample_pcm) | |
| print(f"codes {all_codes_th.shape}") | |
| all_codes_list = [all_codes_th[:, :1, :], | |
| all_codes_th[:, :2, :], | |
| all_codes_th[:, :4, :], | |
| # all_codes_th[:, :8, :], | |
| # all_codes_th[:, :16, :], | |
| all_codes_th[:, :32, :]] | |
| pcm_list = [] | |
| for i, all_codes_th in enumerate(all_codes_list): | |
| with torch.no_grad(): | |
| print(f"decoding {i+1} codebooks, {all_codes_th.shape}") | |
| pcm = mimi.decode(all_codes_th) | |
| pcm_list.append((sample_rate, pcm[0, 0].cpu().numpy())) | |
| # sphn.write_wav("roundtrip_out.wav", pcm[0, 0].cpu().numpy(), sample_rate) | |
| return pcm_list | |
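
# Gradio UI: a single audio input (microphone or upload) and one output per codebook setting.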
demo = gr.Interface(
    fn=mimi_streaming_test,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath", label="Input audio"),
    outputs=[gr.Audio(type="numpy", label="Reconstructed with 1 codebook"),
             gr.Audio(type="numpy", label="Reconstructed with 2 codebooks"),
             gr.Audio(type="numpy", label="Reconstructed with 4 codebooks"),
             # gr.Audio(type="numpy", label="With 8 codebooks"),
             # gr.Audio(type="numpy", label="With 16 codebooks"),
             gr.Audio(type="numpy", label="Reconstructed with 32 codebooks")],
    examples=[["./hello.mp3"]],
    title="Mimi tokenizer playground",
    description="Explore the reconstruction quality when audio is tokenized with different numbers of Mimi codebooks.",
)
demo.launch()