Update app.py
app.py CHANGED
@@ -25,6 +25,7 @@ from prompt_examples import TEXT_CHAT_EXAMPLES, IMG_GEN_PROMPT_EXAMPLES, AUDIO_E
 from preambles import CHAT_PREAMBLE, AUDIO_RESPONSE_PREAMBLE, IMG_DESCRIPTION_PREAMBLE
 from constants import LID_LANGUAGES, NEETS_AI_LANGID_MAP, AYA_MODEL_NAME, BATCH_SIZE, USE_ELVENLABS, USE_REPLICATE

+
 HF_API_TOKEN = os.getenv("HF_API_KEY")
 ELEVEN_LABS_KEY = os.getenv("ELEVEN_LABS_KEY")
 NEETS_AI_API_KEY = os.getenv("NEETS_AI_API_KEY")
@@ -93,54 +94,50 @@ def replicate_api_inference(input_prompt):
     return image

 def generate_image(input_prompt, model_id="black-forest-labs/FLUX.1-schnell"):
-    if input_prompt!="":
+    if input_prompt is not None and input_prompt!="":
+        if USE_REPLICATE:
+            print("using replicate for image generation")
+            image = replicate_api_inference(input_prompt)
         else:
+            try:
+                print("using HF inference API for image generation")
+                image_bytes = get_hf_inference_api_response({ "inputs": input_prompt}, model_id)
+                image = np.array(Image.open(io.BytesIO(image_bytes)))
+            except Exception as e:
+                print("HF API error:", e)
+                # generate image with help of replicate in case of error
                 image = replicate_api_inference(input_prompt)
+        return image
     else:
         return None

 def generate_img_prompt(input_prompt):
+    if input_prompt is not None and input_prompt!="":
+        # clean prompt before doing language detection
+        cleaned_prompt = clean_text(input_prompt, remove_bullets=True, remove_newline=True)
+        text_lang_code = predict_language(cleaned_prompt)
+
+        gr.Info("Generating Image", duration=2)
+
+        if text_lang_code!="eng_Latn":
+            text = f"""
+            Translate the given input prompt to English.
+            Input Prompt: {input_prompt}
+            Then based on the English translation of the prompt, generate a detailed image description which can be used to generate an image using a text-to-image model.
+            Do not use more than 3-4 lines for the image description. Respond with only the image description.
+            """
+        else:
+            text = f"""Generate a detailed image description which can be used to generate an image using a text-to-image model based on the given input prompt:
+            Input Prompt: {input_prompt}
+            Do not use more than 3-4 lines for the description.
+            """
+
+        response = img_prompt_client.chat(message=text, preamble=IMG_DESCRIPTION_PREAMBLE, model=AYA_MODEL_NAME)
+        output = response.text

+        return output
     else:
+        return None


 # Chat with Aya util functions
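For context on the HF branch above: `get_hf_inference_api_response` is defined elsewhere in app.py and is not part of this diff, but a text-to-image call against the Hugging Face Inference API generally looks like the minimal sketch below. The endpoint URL, timeout, and error handling here are assumptions for illustration, not code from this repo (the app itself falls back to Replicate when the call fails).

import io
import os

import numpy as np
import requests
from PIL import Image

HF_API_TOKEN = os.getenv("HF_API_KEY")

def hf_text_to_image(prompt, model_id="black-forest-labs/FLUX.1-schnell"):
    # Hypothetical stand-in for get_hf_inference_api_response plus decoding:
    # POST the prompt to the hosted model and turn the returned bytes into an array.
    url = f"https://api-inference.huggingface.co/models/{model_id}"
    headers = {"Authorization": f"Bearer {HF_API_TOKEN}"}
    resp = requests.post(url, headers=headers, json={"inputs": prompt}, timeout=120)
    resp.raise_for_status()
    return np.array(Image.open(io.BytesIO(resp.content)))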
@@ -151,7 +148,8 @@ def trigger_example(example):

 def generate_aya_chat_response(user_message, cid, token, history=None):
     if not token:
+        print("no token")
+        #raise gr.Error("Error loading.")

     if history is None:
         history = []
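For reference, `token` is a `gr.State` that `demo.load` fills with `secrets.token_hex(16)` later in the file, so the guard above just detects sessions where that load has not run yet. A toy illustration of the same pattern (component names and the fallback message are placeholders, not the app's code):

import secrets
import gradio as gr

def guarded_echo(message, token):
    # Mirror the app's guard: without a session token, skip the request.
    if not token:
        print("no token")
        return "Session not initialised yet, please reload."
    return message

with gr.Blocks() as demo:
    token = gr.State(value=None)
    box = gr.Textbox(label="Message")
    out = gr.Textbox(label="Echo")
    box.submit(guarded_echo, inputs=[box, token], outputs=[out])
    # Give every browser session its own random token on page load.
    demo.load(lambda: secrets.token_hex(16), None, token)

demo.launch()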
@@ -186,7 +184,7 @@ def clear_chat():

 # Audio Pipeline util functions

-def transcribe_and_stream(inputs,
+def transcribe_and_stream(inputs, model_name="groq_whisper", show_info="show_info", language="english"):
     if inputs is not None and inputs!="":
         if show_info=="show_info":
             gr.Info("Processing Audio", duration=1)
@@ -242,11 +240,10 @@ def convert_text_to_speech(text, language="english"):
     # clean text before doing language detection
     cleaned_text = clean_text(text, remove_bullets=True, remove_newline=True)
     text_lang_code = predict_language(cleaned_text)
-    language = LID_LANGUAGES[text_lang_code]

     if not USE_ELVENLABS:
-        audio_path = neetsai_tts(text,
+        if text_lang_code!= "jpn_Jpan":
+            audio_path = neetsai_tts(text, text_lang_code)
         else:
             print("DEVICE:", DEVICE)
             # if language is japanese then use XTTS for TTS since neets_ai doesn't support japanese voice
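`predict_language` is defined outside this hunk; the `eng_Latn` / `jpn_Jpan` codes it returns match fastText's NLLB-style language-identification labels, so one plausible implementation looks like the sketch below. This is an assumption about the helper, not the Space's actual code.

import fasttext
from huggingface_hub import hf_hub_download

# Download the fastText LID model that emits NLLB-style codes such as "eng_Latn".
_LID_MODEL_PATH = hf_hub_download("facebook/fasttext-language-identification", "model.bin")
_LID_MODEL = fasttext.load_model(_LID_MODEL_PATH)

def predict_language_sketch(text: str) -> str:
    # fastText returns labels like "__label__jpn_Jpan"; strip the prefix.
    labels, _scores = _LID_MODEL.predict(text.replace("\n", " "), k=1)
    return labels[0].replace("__label__", "")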
@@ -274,10 +271,16 @@ def elevenlabs_generate_audio(text):
     save(audio, audio_path)
     return audio_path

-def neetsai_tts(input_text,
+def neetsai_tts(input_text, text_lang_code):
+
+    if text_lang_code in LID_LANGUAGES.keys():
+        language = LID_LANGUAGES[text_lang_code]
+    else:
+        # use english voice as default for languages outside the 23 languages of Aya Expanse
+        language = "english"

+    neets_lang_id = NEETS_AI_LANGID_MAP[language]
+    neets_vits_voice_id = f"vits-{neets_lang_id}"

     response = requests.request(
         method="POST",
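The rest of the `requests.request` call sits outside this hunk. As a rough sketch of how a `vits-<lang>` voice id is typically used against neets.ai's public TTS endpoint: the URL, payload shape, and output handling below are assumptions based on the provider's docs, not code taken from this diff.

import os
import requests

NEETS_AI_API_KEY = os.getenv("NEETS_AI_API_KEY")

def neets_tts_sketch(input_text: str, neets_vits_voice_id: str, out_path: str = "neets_audio.mp3") -> str:
    # Assumed endpoint and payload for neets.ai TTS with a VITS voice.
    response = requests.request(
        method="POST",
        url="https://api.neets.ai/v1/tts",
        headers={"Content-Type": "application/json", "X-API-Key": NEETS_AI_API_KEY},
        json={"text": input_text, "voice_id": neets_vits_voice_id, "params": {"model": "vits"}},
    )
    response.raise_for_status()
    with open(out_path, "wb") as f:
        f.write(response.content)  # raw audio bytes
    return out_path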
@@ -344,7 +347,7 @@ with demo:
     **Developed by**: [Cohere for AI](https://cohere.com/research) and [Cohere](https://cohere.com/)
     """
     )
+
     with gr.TabItem("Chat with Aya") as chat_with_aya:
         cid = gr.State("")
         token = gr.State(value=None)
@@ -385,12 +388,13 @@ with demo:
                 example_labels=TEXT_CHAT_EXAMPLES_LABELS,
             )

+    # End to End Testing Pipeline for speak with Aya
     with gr.TabItem("Speak with Aya") as speak_with_aya:

         with gr.Row():
             with gr.Column():
                 e2e_audio_file = gr.Audio(sources="microphone", type="filepath", min_length=None)
+                e2_audio_submit_button = gr.Button(value="Get Aya's Response", variant="primary")

                 clear_button_microphone = gr.ClearButton()
                 gr.Examples(
@@ -407,14 +411,14 @@ with demo:
                 e2e_audio_file_aya_response = gr.Textbox(lines=3, label="Aya's Response", show_copy_button=True, container=True, interactive=False)
                 e2e_aya_audio_response = gr.Audio(type="filepath", label="Aya's Audio Response")

-        show_info = gr.Textbox(value="show_info", visible=False)
-        stt_model = gr.Textbox(value="groq_whisper", visible=False)
+        # show_info = gr.Textbox(value="show_info", visible=False)
+        # stt_model = gr.Textbox(value="groq_whisper", visible=False)

         with gr.Accordion("See Details", open=False):
             gr.Markdown("To enable voice interaction with Aya Expanse, this space uses [Whisper large-v3-turbo](https://huggingface.co/openai/whisper-large-v3-turbo) and [Groq](https://groq.com/) for STT and [neets.ai](http://neets.ai/) for TTS.")


+    # Generate Images
     with gr.TabItem("Visualize with Aya") as visualize_with_aya:
         with gr.Row():
             with gr.Column():
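The accordion above notes that speech-to-text goes through Whisper large-v3-turbo hosted on Groq. The internals of `transcribe_and_stream` are outside this diff; a minimal transcription call with Groq's Python SDK would look roughly like the sketch below (client usage here is an assumption based on Groq's OpenAI-compatible API, not the Space's code).

import os
from groq import Groq

groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))

def groq_whisper_transcribe_sketch(audio_filepath: str) -> str:
    # Send the recorded file to Groq's hosted Whisper model and return plain text.
    with open(audio_filepath, "rb") as audio_file:
        transcription = groq_client.audio.transcriptions.create(
            file=(audio_filepath, audio_file.read()),
            model="whisper-large-v3-turbo",
        )
    return transcription.text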
@@ -465,31 +469,33 @@ with demo:
         generate_image, #run_flux,
         inputs=[generated_img_desc],
         outputs=[generated_img],
+        show_progress="full",
     )

     # Audio Pipeline
     clear_button_microphone.click(lambda: None, None, e2e_audio_file)
-    clear_button_microphone.click(lambda: None, None, e2e_audio_file_trans)
     clear_button_microphone.click(lambda: None, None, e2e_aya_audio_response)
+    clear_button_microphone.click(lambda: None, None, e2e_audio_file_aya_response)
+    clear_button_microphone.click(lambda: None, None, e2e_audio_file_trans)

-    e2e_audio_file.change(
+    #e2e_audio_file.change(
+    e2_audio_submit_button.click(
         transcribe_and_stream,
-        inputs=[e2e_audio_file
+        inputs=[e2e_audio_file],
         outputs=[e2e_audio_file_trans],
+        show_progress="full",
     ).then(
         aya_speech_text_response,
         inputs=[e2e_audio_file_trans],
         outputs=[e2e_audio_file_aya_response],
+        show_progress="full",
     ).then(
         convert_text_to_speech,
         inputs=[e2e_audio_file_aya_response],
         outputs=[e2e_aya_audio_response],
+        show_progress="full",
     )

     demo.load(lambda: secrets.token_hex(16), None, token)

-demo.queue(api_open=False, max_size=
+demo.queue(api_open=False, max_size=20, default_concurrency_limit=4).launch(show_api=False, allowed_paths=['/home/user/app'])
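Taken together, the new wiring runs the speech pipeline as a three-step Gradio event chain off the submit button instead of firing on every microphone change. A self-contained toy version of that `.click().then().then()` pattern is shown below; the stage functions are placeholders standing in for the app's real ones.

import gradio as gr

# Placeholder stages standing in for transcribe_and_stream, aya_speech_text_response,
# and convert_text_to_speech; each returns the type its output component expects.
def transcribe(audio_path):
    return "transcribed text" if audio_path else ""

def respond(transcript):
    return f"Aya's reply to: {transcript}"

def synthesize(reply_text):
    return None  # the real app returns a filepath to generated audio

with gr.Blocks() as demo:
    audio_in = gr.Audio(sources="microphone", type="filepath")
    submit = gr.Button("Get Aya's Response", variant="primary")
    transcript = gr.Textbox(label="Transcription")
    reply = gr.Textbox(label="Aya's Response")
    audio_out = gr.Audio(type="filepath", label="Aya's Audio Response")

    # Each .then() runs only after the previous step finishes, so the chain is
    # transcribe -> respond -> synthesize, all triggered by one button click.
    submit.click(transcribe, inputs=[audio_in], outputs=[transcript], show_progress="full") \
        .then(respond, inputs=[transcript], outputs=[reply], show_progress="full") \
        .then(synthesize, inputs=[reply], outputs=[audio_out], show_progress="full")

demo.queue(max_size=20, default_concurrency_limit=4).launch()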