Spaces:

ysharma
/

lemonade-thinking-chatbot

Runtime error

File size: 11,966 Bytes

78752d9

import gradio as gr
from gradio import ChatMessage
from openai import OpenAI
import time

# Endpoint of the local Lemonade Server; requests go through the OpenAI client.
base_url = "http://localhost:8000/api/v1"

# The OpenAI client requires an API key, but Lemonade does not use its value.
client = OpenAI(api_key="lemonade", base_url=base_url)

# Title shared by every "thinking" bubble shown in the chat.
_THOUGHT_TITLE = "🧠 Thought Process"


def _has_title(metadata) -> bool:
    """Return True when *metadata* carries a non-empty ``title``.

    Titled messages (thinking and error bubbles) are display-only and must not
    be sent back to the model.
    """
    if not metadata:
        return False
    getter = getattr(metadata, "get", None)
    return bool(getter("title")) if getter else False


def _history_to_api_messages(history: list, system_prompt: str) -> list:
    """Convert the chat history into OpenAI-style ``{"role", "content"}`` dicts."""
    messages = []

    # Add system prompt if provided
    if system_prompt and system_prompt.strip():
        messages.append({"role": "system", "content": system_prompt})

    for msg in history:
        if isinstance(msg, ChatMessage):
            if _has_title(msg.metadata):
                continue
            messages.append({"role": msg.role, "content": msg.content})
        elif isinstance(msg, dict):
            # Same rule as for ChatMessage: only a *titled* message is skipped.
            # A metadata dict may be present on ordinary turns without a title
            # (presumably how Gradio hands history back - verify against the
            # installed version); skipping on mere presence would drop them.
            if _has_title(msg.get("metadata")):
                continue
            messages.append({
                "role": msg.get("role", "user"),
                "content": msg.get("content", ""),
            })
    return messages


def _thought_message(content: str, elapsed=None):
    """Build the thinking bubble: "pending" until *elapsed* seconds is given."""
    if elapsed is None:
        metadata = {"title": _THOUGHT_TITLE, "status": "pending"}
    else:
        metadata = {"title": _THOUGHT_TITLE, "status": "done", "duration": elapsed}
    return ChatMessage(role="assistant", content=content, metadata=metadata)


def _format_stream_error(error_msg: str, error_trace: str, model_name: str) -> str:
    """Turn an exception message/traceback into a user-facing markdown report."""
    if "422" in error_msg:
        return f"""
⚠️ **Request Validation Error**

The server rejected the request. Possible issues:
- Model name might be incorrect (currently: `{model_name}`)
- Check that the model is loaded on the server
- Try simplifying the system prompt

**Error:** {error_msg}
"""
    if "list index out of range" in error_msg or "IndexError" in error_trace:
        return f"""
⚠️ **Streaming Response Error**

There was an issue processing the streaming response.

**Debug Info:**
- Model: `{model_name}`
- Base URL: `{base_url}`
- Error: {error_msg}

Try refreshing and sending another message.
"""
    return f"""
⚠️ **Connection Error**

Error: {error_msg}

Make sure:
1. Lemonade Server is running at `{base_url}`
2. Model `{model_name}` is loaded
3. The server is accessible

**Debug trace:**
```
{error_trace[-500:]}
```
"""


def stream_chat_response(message: str, history: list, model_name: str, system_prompt: str):
    """
    Stream a reply from Lemonade Server, showing the thinking process separately.

    Args:
        message: The new user message.
        history: Chat history (``ChatMessage`` objects and/or message dicts).
            It is mutated in place and re-yielded after every update.
        model_name: Model identifier passed to the chat-completions call.
        system_prompt: Optional system prompt; ignored when blank.

    Yields:
        The updated ``history`` list: once after the user message is appended,
        then after every streamed reasoning/answer chunk, and finally after an
        error bubble is appended if the request fails.
    """
    # Add user message to history
    history.append(ChatMessage(role="user", content=message))
    yield history

    messages = _history_to_api_messages(history, system_prompt)

    # Streaming state. Bubbles are tracked by index rather than via history[-1]
    # so a late reasoning chunk can never overwrite the answer bubble.
    thinking_content = ""
    response_content = ""
    thinking_index = None
    response_index = None
    thinking_started = None
    thinking_elapsed = None  # set once the thinking bubble is marked "done"

    try:
        # Stream response from Lemonade Server
        stream = client.chat.completions.create(
            model=model_name,
            messages=messages,
            stream=True,
            max_tokens=2048,
            temperature=0.7,
        )

        for chunk in stream:
            # Some chunks carry no choices or no delta; skip them.
            if not chunk.choices:
                continue
            delta = getattr(chunk.choices[0], "delta", None)
            if delta is None:
                continue

            reasoning_content = getattr(delta, "reasoning_content", None)
            content = getattr(delta, "content", None)

            # Thinking text goes to its own collapsible bubble.
            if reasoning_content:
                if thinking_index is None:
                    thinking_started = time.time()
                    history.append(_thought_message(""))
                    thinking_index = len(history) - 1
                thinking_content += reasoning_content
                history[thinking_index] = _thought_message(
                    thinking_content, thinking_elapsed
                )
                yield history

            # Answer text. A plain `if` (not `elif`) so that a chunk carrying
            # both reasoning and content does not lose its content.
            if content:
                # The first answer token ends the thinking phase.
                if thinking_index is not None and thinking_elapsed is None:
                    thinking_elapsed = time.time() - thinking_started
                    history[thinking_index] = _thought_message(
                        thinking_content, thinking_elapsed
                    )

                if response_index is None:
                    history.append(ChatMessage(role="assistant", content=""))
                    response_index = len(history) - 1

                response_content += content
                history[response_index] = ChatMessage(
                    role="assistant",
                    content=response_content,
                )
                yield history

        # The stream ended while still thinking: close the bubble.
        if thinking_index is not None and thinking_elapsed is None:
            thinking_elapsed = time.time() - thinking_started
            history[thinking_index] = _thought_message(
                thinking_content, thinking_elapsed
            )
            yield history

    except Exception as e:
        import traceback
        error_msg = str(e)
        error_trace = traceback.format_exc()

        # Do not leave the thinking bubble in the "pending" state after a failure.
        if thinking_index is not None and thinking_elapsed is None:
            history[thinking_index] = _thought_message(
                thinking_content, time.time() - thinking_started
            )

        history.append(ChatMessage(
            role="assistant",
            content=_format_stream_error(error_msg, error_trace, model_name),
            metadata={
                "title": "⚠️ Error Details"
            }
        ))
        yield history


def clear_chat():
    """Return a new, empty history list to reset the conversation."""
    empty_history = list()
    return empty_history


# Build the Gradio interface
with gr.Blocks(theme=gr.themes.Ocean()) as demo:
    # The input textbox is created first (unrendered) so gr.Examples in the
    # sidebar can reference it; it is rendered later in the main area.
    msg = gr.Textbox(
        placeholder="Type your message here and press Enter...",
        show_label=False,
        container=False,
        render=False  # Don't render yet, will be rendered in main area
    )
    
    # Sidebar for settings and information
    with gr.Sidebar(position="left", open=True):
        gr.Markdown("""
        # 🍋 Lemonade Reasoning Chatbot
        Chat with local LLMs running on AMD Lemonade Server. This interface beautifully displays the model's thinking process!
        """)
        
        gr.Markdown("### ⚙️ Settings")
        
        # allow_custom_value lets the user type a model name not in the list.
        model_dropdown = gr.Dropdown(
            choices=[
                "Qwen3-0.6B-GGUF",
                "Llama-3.1-8B-Instruct-Hybrid",
                "Qwen2.5-7B-Instruct",
                "Phi-3.5-mini-instruct",
                "Meta-Llama-3-8B-Instruct"
            ],
            value="Qwen3-0.6B-GGUF",
            label="Model",
            info="Select the LLM model to use",
            allow_custom_value=True
        )
        
        system_prompt = gr.Textbox(
            label="System Prompt (Optional)",
            value="You are a helpful assistant.",
            lines=3,
            info="Customize the model's behavior",
            placeholder="Leave empty to use model defaults"
        )
        
        # How Thinking Works Accordion
        with gr.Accordion("💡 How Thinking Works", open=False):
            gr.Markdown("""
            - Reasoning models output `reasoning_content` (thinking) and `content` (final answer) separately
            - Thinking appears in a collapsible "🧠 Thought Process" section
            - Duration of thinking is displayed automatically
            - Works with models like: DeepSeek-R1, QwQ, and other reasoning models
            """)
        
        # Current Model Accordion
        with gr.Accordion("📋 Current Model", open=False):
            gr.Markdown("""
            Make sure your model supports reasoning output for thinking to be displayed.
            """)
        
        # Example Prompts Accordion
        with gr.Accordion("📝 Example Prompts", open=False):
            gr.Markdown("""
            - "Solve: If a train travels 120 km in 2 hours, what's its speed?"
            - "Compare pros and cons of electric vs gas cars"
            - "Explain step-by-step how to make coffee"
            - "What's the difference between AI and ML?"
            """)
        
        # Clickable examples that fill the input textbox
        gr.Examples(
            examples=[
                "What is 15 + 24?",
                "Write a short poem about AI",
                "What is the capital of Japan?",
                "Explain what machine learning is in simple terms"
            ],
            inputs=msg,
            label="Quick Examples"
        )
    
    # Main chat area - full screen
    chatbot = gr.Chatbot(
        type="messages",
        label="Chat",
        height="calc(100vh - 200px)",
        avatar_images=(
            "https://em-content.zobj.net/source/twitter/376/bust-in-silhouette_1f464.png",
            "https://em-content.zobj.net/source/twitter/376/robot_1f916.png"
        ),
        show_label=False,
        # HTML shown while the chat is empty; the image is requested through
        # Gradio's file route, so placeholder.png must be servable at launch.
        placeholder=(
            "<div>\n"
            '        <img src="/gradio_api/file=placeholder.png"> \n'
            "        </div>"
        ),
    )
    
    # Render the input textbox in main area
    msg.render()
    
    # Event handlers - only submit event
    def submit_message(message, history, model, sys_prompt):
        """Stream the assistant reply for a submitted message.

        This is a generator: every yielded value is the chat history that
        should be displayed. A blank message leaves the history unchanged.
        """
        if not message or not message.strip():
            # A `return <value>` inside a generator is discarded, so the
            # unchanged history is yielded explicitly before stopping.
            yield history
            return
        yield from stream_chat_response(message, history, model, sys_prompt)
    
    # After the response has finished streaming, clear the input textbox.
    msg.submit(
        submit_message,
        inputs=[msg, chatbot, model_dropdown, system_prompt],
        outputs=chatbot
    ).then(
        lambda: "",
        None,
        msg
    )

# Start the Gradio server only when this file is executed as a script.
# NOTE(review): allowed_paths=["."] is presumably there so placeholder.png
# (referenced by the chatbot placeholder) can be served - consider narrowing it.
if __name__ == "__main__":
    demo.launch(
        ssr_mode=True,
        allowed_paths=["."],
    )