ykvns committed
Commit ab0b22d · verified · 1 Parent(s): 80b70e3

Update app.py

Files changed (1)
  1. app.py +78 -67
app.py CHANGED
@@ -1,97 +1,110 @@
 import gradio as gr
-from llama_cpp import Llama
-from transformers import AutoTokenizer
-from huggingface_hub import hf_hub_download
-import os
+import torch
+from threading import Thread
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

-# Model paths
+# Model and Tokenizer Configuration
 MODEL_REPO_ID = "LGAI-EXAONE/EXAONE-4.0-1.2B"
-GGUF_REPO_ID = "lmstudio-community/LGAI-EXAONE-4.0-1.2B-GGUF"
-GGUF_FILENAME = "A.X-4.0-Light-imatrix-IQ1_S.gguf"
-TOKENIZER_DIR = "exaone-tokenizer"
-TEMPLATE_PATH = os.path.join(TOKENIZER_DIR, "chat_template.jinja")

-# Download GGUF model
-print("πŸ”„ Downloading GGUF model...")
-model_path = hf_hub_download(repo_id=GGUF_REPO_ID, filename=GGUF_FILENAME)
-print(f"βœ… Model downloaded to: {model_path}")
-
-# Load tokenizer
-print("πŸ”„ Loading tokenizer...")
-tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, trust_remote_code=True)
-print("βœ… Tokenizer loaded.")
+print("βœ… Starting application...")

+# Load the model with bfloat16 to save memory
 try:
-    with open(TEMPLATE_PATH, "r", encoding="utf-8") as f:
-        tokenizer.chat_template = f.read()
-    print("βœ… Chat template loaded.")
+    print(f"πŸ”„ Loading tokenizer from '{MODEL_REPO_ID}'...")
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO_ID)
+    print("βœ… Tokenizer loaded successfully.")
+
+    print(f"πŸ”„ Loading model '{MODEL_REPO_ID}' with torch_dtype=torch.bfloat16...")
+
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_REPO_ID,
+        torch_dtype=torch.bfloat16,
+        device_map="auto"
+    )
+    print("βœ… Model loaded successfully.")
+
 except Exception as e:
-    print(f"Could not load chat_template.jinja: {e}")
-    tokenizer.chat_template = None
-
-# Load model
-llm = Llama(
-    model_path=model_path,
-    n_ctx=2048,
-    n_threads=os.cpu_count(),
-    n_gpu_layers=-1,
-    use_mlock=True,
-    verbose=False
-)
-
-# Streaming chat function
-def format_prompt(messages):
-    formatted = ""
-    for m in messages:
-        role = m["role"].upper()
-        formatted += f"{role}: {m['content']}\n"
-    return formatted + "ASSISTANT:"
+    print(f"❌ Error loading model or tokenizer: {e}")
+    # Exit if model fails to load, as the app is unusable.
+    raise

+# Streaming Chat Function
 def user_input_handler(user_message, history):
+    """Handles user input by appending it to the history."""
     return "", history + [[user_message, None]]

 def bot_stream(history):
+    """
+    Generates the bot's response using a streaming approach.
+    This function runs the model in a separate thread to avoid blocking the UI.
+    """
+    print(f"πŸ“ History received: {history}")
+    # The last message is the user's prompt.
     user_message = history[-1][0]
-    history[-1][1] = ""
+    history[-1][1] = ""  # Initialize the bot's response field.

-    # Convert chat history to OpenAI format
-    messages = [{"role": "system", "content": "You are a helpful assistant."}]
-    for human, assistant in history[:-1]:
+    # Format the conversation history into the model's expected chat format.
+    messages = []
+    for human, assistant in history[:-1]:  # All but the last interaction
         messages.append({"role": "user", "content": human})
-        if assistant:
+        if assistant:  # Assistant message might be None
             messages.append({"role": "assistant", "content": assistant})
     messages.append({"role": "user", "content": user_message})

     try:
+        # Apply the chat template to format the prompt.
         prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    except Exception:
-        prompt = format_prompt(messages)
+    except Exception as e:
+        print(f"⚠️ Warning: Could not apply chat template. Using basic formatting. Error: {e}")
+        # Fallback for models without a registered chat template
+        prompt = "\n".join([f"{msg['role']}: {msg['content']}" for msg in messages]) + "\nassistant:"
+
+    print("➑️ Generated Prompt for Model:\n" + prompt)
+
+    # Tokenize the formatted prompt.
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

-    # Generate streaming output
-    generator = llm.create_completion(
-        prompt=prompt,
-        max_tokens=512,
+    # Use TextIteratorStreamer for non-blocking, token-by-token generation.
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+
+    # Set up the generation parameters in a dictionary.
+    generation_kwargs = dict(
+        inputs,
+        streamer=streamer,
+        max_new_tokens=512,
         temperature=0.7,
         top_p=0.9,
-        stream=True,
-        stop=["</s>", "<|endoftext|>", "USER:", "ASSISTANT:"]
+        do_sample=True,
+        eos_token_id=tokenizer.eos_token_id,
     )

-    for chunk in generator:
-        token = chunk["choices"][0]["text"]
+    # Run the generation in a separate thread to avoid blocking the Gradio UI.
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+
+    # Yield each new token to the Gradio chat interface as it's generated.
+    for token in streamer:
         history[-1][1] += token
         yield history

-# Gradio UI
-with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("## πŸ€– EXAONE-4.0-1.2B Streaming")
-    chatbot = gr.Chatbot(label="Chat History", height=600)
+# Gradio User Interface
+with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), css="footer {display: none !important}") as demo:
+    gr.Markdown("## πŸ€– EXAONE-4.0-1.2B")
+    gr.Markdown("This demo runs the standard `LGAI-EXAONE/EXAONE-4.0-1.2B` model using the `transformers` library.")
+
+    chatbot = gr.Chatbot(label="Chat History", height=600, bubble_full_width=False)
     with gr.Row():
-        msg = gr.Textbox(placeholder="Type a message...", label="Your Message", scale=8)
-        send_btn = gr.Button("Send", scale=1)
-        clear_btn = gr.Button("Clear Chat", scale=1, variant="secondary")
+        msg = gr.Textbox(
+            placeholder="Type your message here...",
+            label="Your Message",
+            scale=8,
+            autofocus=True,
+        )
+        send_btn = gr.Button("Send", scale=1, variant="primary")

-    # Chat events
+    clear_btn = gr.ClearButton([msg, chatbot], value="πŸ—‘οΈ Clear Chat")
+
+    # Event Handlers
     msg.submit(user_input_handler, [msg, chatbot], [msg, chatbot], queue=False).then(
         bot_stream, chatbot, chatbot
     )
@@ -99,7 +112,5 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         bot_stream, chatbot, chatbot
     )

-    clear_btn.click(lambda: [], None, chatbot)
-
     demo.queue()
-    demo.launch()
+    demo.launch(debug=True)
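
For quick reference, below is a minimal standalone sketch of the threaded streaming pattern the updated app.py relies on: model.generate runs in a worker thread while a TextIteratorStreamer yields decoded tokens to the caller. The repo id, bfloat16 dtype, and sampling settings are taken from the diff above; the example prompt and max_new_tokens value are illustrative only, and hardware with enough memory for the model is assumed.

import torch
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

REPO_ID = "LGAI-EXAONE/EXAONE-4.0-1.2B"

tokenizer = AutoTokenizer.from_pretrained(REPO_ID)
model = AutoModelForCausalLM.from_pretrained(
    REPO_ID, torch_dtype=torch.bfloat16, device_map="auto"
)

# Build a chat-formatted prompt, then tokenize it onto the model's device.
messages = [{"role": "user", "content": "Hello!"}]  # illustrative prompt
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# The streamer skips the echoed prompt and special tokens, yielding only new text.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# model.generate blocks until generation finishes, so it runs in a worker thread
# while the main thread consumes tokens from the streamer as they arrive.
thread = Thread(
    target=model.generate,
    kwargs=dict(
        inputs,
        streamer=streamer,
        max_new_tokens=128,  # illustrative length
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    ),
)
thread.start()

reply = ""
for token in streamer:
    reply += token
    print(token, end="", flush=True)
thread.join()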