Spaces:

galileo-ai
/

agent-leaderboard

Running on CPU Upgrade

App Files Community

pratikbhavsar committed on Mar 5

Commit

f2625cd

1 Parent(s): d6c509d

reverted to working

Browse files

Files changed (6) hide show

app.py +20 -14
requirements.txt +1 -1
tabs/data_exploration.py +371 -371
tabs/leaderboard.py +48 -551
tabs/model_comparison.py +23 -117
visualization.py +256 -0

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import warnings
 warnings.filterwarnings("ignore")
@@ -19,41 +20,46 @@ from tabs.data_exploration import create_exploration_tab, filter_and_display
 def create_app():
     df = load_data()
     MODELS = [x.strip() for x in df["Model"].unique().tolist()]
     with gr.Blocks(
         theme=gr.themes.Soft(font=[gr.themes.GoogleFont("sans-serif")])
     ) as app:
-        with gr.Tabs() as tabs:
-            with gr.Tab("Leaderboard", id=0) as tab1:
-                lb_output, lb_plot1, lb_plot2 = create_leaderboard_tab(
-                    df, CATEGORIES, METHODOLOGY, HEADER_CONTENT, CARDS
-                )
-            with gr.Tab("Model Comparison", id=1) as tab2:
-                mc_info, mc_plot = create_model_comparison_tab(df, HEADER_CONTENT)
-            with gr.Tab("Data Exploration", id=2) as tab3:
-                exp_outputs = create_exploration_tab(df)
-        # Initial data loading
-        tab1.select(
             fn=lambda: filter_leaderboard(
                 df, "All", list(CATEGORIES.keys())[0], "Performance"
             ),
             outputs=[lb_output, lb_plot1, lb_plot2],
         )
-        tab2.select(
             fn=lambda: compare_models(
                 df, [df.sort_values("Model Avg", ascending=False).iloc[0]["Model"]]
             ),
             outputs=[mc_info, mc_plot],
         )
-        tab3.select(
             fn=lambda: filter_and_display(
-                MODELS[0], DATASETS[0], min(SCORES), max(SCORES), 0, 0, 0
             ),
             outputs=exp_outputs[:-1],
         )

+# Add this at the top of your script
 import warnings
 warnings.filterwarnings("ignore")
 def create_app():
     df = load_data()
     MODELS = [x.strip() for x in df["Model"].unique().tolist()]
     with gr.Blocks(
         theme=gr.themes.Soft(font=[gr.themes.GoogleFont("sans-serif")])
     ) as app:
+        with gr.Tabs():
+            # Create tabs
+            lb_output, lb_plot1, lb_plot2 = create_leaderboard_tab(
+                df, CATEGORIES, METHODOLOGY, HEADER_CONTENT, CARDS
+            )
+            mc_info, mc_plot = create_model_comparison_tab(df, HEADER_CONTENT)
+            exp_outputs = create_exploration_tab(df)
+        # Initial loads
+        app.load(
             fn=lambda: filter_leaderboard(
                 df, "All", list(CATEGORIES.keys())[0], "Performance"
             ),
             outputs=[lb_output, lb_plot1, lb_plot2],
         )
+        app.load(
             fn=lambda: compare_models(
                 df, [df.sort_values("Model Avg", ascending=False).iloc[0]["Model"]]
             ),
             outputs=[mc_info, mc_plot],
         )
+        app.load(
             fn=lambda: filter_and_display(
+                MODELS[0],
+                DATASETS[0],
+                min(SCORES),
+                max(SCORES),
+                0,
+                0,
+                0,
             ),
             outputs=exp_outputs[:-1],
         )

requirements.txt CHANGED Viewed

@@ -1,4 +1,4 @@
-gradio==5.20.0
 pandas
 matplotlib
 plotly

+gradio==5.18.0
 pandas
 matplotlib
 plotly

tabs/data_exploration.py CHANGED Viewed

@@ -395,292 +395,305 @@ def create_exploration_tab(df):
     """Create an enhanced data exploration tab with better UI and functionality."""
     # Main UI setup
-    # with gr.Tab("Data Exploration"):
-    # CSS styling (unchanged)
-    gr.HTML(
-        """
-    <style>
-        /* Custom styling for the exploration tab */
-        :root[data-theme="light"] {
-            --surface-color: #f8f9fa;
-            --surface-color-alt: #ffffff;
-            --text-color: #202124;
-            --text-muted: #666666;
-            --primary-text: #1a73e8;
-            --primary-text-light: rgba(26, 115, 232, 0.3);
-            --border-color: #e9ecef;
-            --border-color-light: #f1f3f5;
-            --shadow-color: rgba(0,0,0,0.05);
-            --message-bg-user: #E5F6FD;
-            --message-bg-assistant: #F7F7F8;
-            --message-bg-system: #FFF3E0;
-            --response-bg: #F0F7FF;
-            --score-high: #1a73e8;
-            --score-med: #f4b400;
-            --score-low: #ea4335;
-        }
-        :root[data-theme="dark"] {
-            --surface-color: #1e1e1e;
-            --surface-color-alt: #2d2d2d;
-            --text-color: #ffffff;
-            --text-muted: #a0a0a0;
-            --primary-text: #60a5fa;
-            --primary-text-light: rgba(96, 165, 250, 0.3);
-            --border-color: #404040;
-            --border-color-light: #333333;
-            --shadow-color: rgba(0,0,0,0.2);
-            --message-bg-user: #2d3748;
-            --message-bg-assistant: #1a1a1a;
-            --message-bg-system: #2c2516;
-            --response-bg: #1e2a3a;
-            --score-high: #60a5fa;
-            --score-med: #fbbf24;
-            --score-low: #ef4444;
-        }
-        #exploration-header {
-            margin-bottom: 1.5rem;
-            padding-bottom: 1rem;
-            border-bottom: 1px solid var(--border-color);
-        }
-        .filter-container {
-            background-color: var(--surface-color);
-            border-radius: 10px;
-            padding: 1rem;
-            margin-bottom: 1.5rem;
-            border: 1px solid var(--border-color);
-            box-shadow: 0 2px 6px var(--shadow-color);
-        }
-        .navigation-buttons button {
-            min-width: 120px;
-            font-weight: 500;
-        }
-        .content-panel {
-            margin-top: 1.5rem;
-        }
-        @media (max-width: 768px) {
-            .filter-row {
-                flex-direction: column;
             }
-        }
-    </style>
-    """
-    )
-    # Header
-    with gr.Row(elem_id="exploration-header"):
-        gr.HTML(HEADER_CONTENT)
-    # Filters section
-    with gr.Column(elem_classes="filter-container"):
-        gr.Markdown("### 🔍 Filter Options")
-        with gr.Row(equal_height=True, elem_classes="filter-row"):
-            explore_model = gr.Dropdown(
-                choices=MODELS,
-                value=MODELS[0],
-                label="Model",
-                container=True,
-                scale=1,
-                info="Select AI model",
-            )
-            explore_dataset = gr.Dropdown(
-                choices=DATASETS,
-                value=DATASETS[0],
-                label="Dataset",
-                container=True,
-                scale=1,
-                info="Select evaluation dataset",
-            )
-        with gr.Row(equal_height=True, elem_classes="filter-row"):
-            min_score = gr.Slider(
-                minimum=float(min(SCORES)),
-                maximum=float(max(SCORES)),
-                value=float(min(SCORES)),
-                step=0.1,
-                label="Minimum TSQ Score",
-                container=True,
-                scale=1,
-                info="Filter responses with scores above this threshold",
-            )
-            max_score = gr.Slider(
-                minimum=float(min(SCORES)),
-                maximum=float(max(SCORES)),
-                value=float(max(SCORES)),
-                step=0.1,
-                label="Maximum TSQ Score",
-                container=True,
-                scale=1,
-                info="Filter responses with scores below this threshold",
-            )
-        # Get the data for initial ranges
-        df_chat = get_chat_and_score_df(explore_model.value, explore_dataset.value)
-        # Ensure columns exist and get ranges
-        n_turns_max = int(df_chat["n_turns"].max())
-        len_query_max = int(df_chat["len_query"].max())
-        n_tools_max = int(df_chat["n_tools"].max())
-        with gr.Row(equal_height=True, elem_classes="filter-row"):
-            n_turns_filter = gr.Slider(
-                minimum=0,
-                maximum=n_turns_max,
-                value=0,
-                step=1,
-                label="Minimum Turn Count",
-                container=True,
-                scale=1,
-                info="Filter by minimum number of conversation turns",
-            )
-            len_query_filter = gr.Slider(
-                minimum=0,
-                maximum=len_query_max,
-                value=0,
-                step=10,
-                label="Minimum Query Length",
-                container=True,
-                scale=1,
-                info="Filter by minimum length of query in characters",
-            )
-            n_tools_filter = gr.Slider(
-                minimum=0,
-                maximum=n_tools_max,
-                value=0,
-                step=1,
-                label="Minimum Tool Count",
-                container=True,
-                scale=1,
-                info="Filter by minimum number of tools used",
-            )
         with gr.Row():
-            reset_btn = gr.Button("Reset Filters", size="sm", variant="secondary")
-    # Navigation row
-    with gr.Row(variant="panel"):
-        with gr.Column(scale=1):
-            prev_btn = gr.Button(
-                "← Previous",
-                size="lg",
-                variant="secondary",
-                elem_classes="navigation-buttons",
-            )
-        with gr.Column(scale=1, min_width=100):
-            # Get initial count from default data
-            df_initial = get_chat_and_score_df(MODELS[0], DATASETS[0])
-            initial_count = len(df_initial)
-            index_display = gr.HTML(
-                value=f"""<div style="
-                    display: flex;
-                    align-items: center;
-                    justify-content: center;
-                    font-weight: 500;
-                    color: var(--primary-text);
-                    background-color: var(--surface-color-alt);
-                    padding: 0.5rem 1rem;
-                    border-radius: 20px;
-                    font-size: 0.9rem;
-                    width: fit-content;
-                    margin: 0 auto;">
-                    <span style="margin-right: 0.5rem;">📄</span>1/{initial_count}
-                </div>""",
-                elem_id="index-display",
-            )
-        with gr.Column(scale=1):
-            next_btn = gr.Button(
-                "Next →",
-                size="lg",
-                variant="secondary",
-                elem_classes="navigation-buttons",
             )
-    # Content areas
-    with gr.Row(equal_height=True):
-        with gr.Column(scale=1):
-            chat_display = gr.HTML()
-        with gr.Column(scale=1):
-            metrics_display = gr.HTML()
-    with gr.Row():
-        tool_info_display = gr.HTML()
-    # State for tracking current index (simple integer state)
-    current_index = gr.State(value=0)
-    def reset_index():
-        """Reset the current index to 0"""
-        return 0
-    # Add these explicit event handlers for model and dataset changes
-    explore_model.change(
-        reset_index,
-        inputs=[],
-        outputs=[current_index],
-    )
-    explore_dataset.change(
-        reset_index,
-        inputs=[],
-        outputs=[current_index],
-    )
-    min_score.change(
-        reset_index,
-        inputs=[],
-        outputs=[current_index],
-    )
-    max_score.change(
-        reset_index,
-        inputs=[],
-        outputs=[current_index],
-    )
-    n_turns_filter.change(
-        reset_index,
-        inputs=[],
-        outputs=[current_index],
-    )
-    len_query_filter.change(
-        reset_index,
-        inputs=[],
-        outputs=[current_index],
-    )
-    n_tools_filter.change(
-        reset_index,
-        inputs=[],
-        outputs=[current_index],
-    )
-    # Reset filters
-    def reset_filters():
-        return (
-            MODELS[0],
-            DATASETS[0],
-            float(min(SCORES)),
-            float(max(SCORES)),
-            0,  # n_turns
-            0,  # len_query
-            0,  # n_tools
         )
-    reset_btn.click(
-        reset_filters,
-        outputs=[
             explore_model,
             explore_dataset,
             min_score,
@@ -688,23 +701,31 @@ def create_exploration_tab(df):
             n_turns_filter,
             len_query_filter,
             n_tools_filter,
-        ],
-    )
-    # Connect filter changes
-    # Replace the existing filter connections with this:
-    for control in [
-        explore_model,
-        explore_dataset,
-        min_score,
-        max_score,
-        n_turns_filter,
-        len_query_filter,
-        n_tools_filter,
-    ]:
-        control.change(
-            on_filter_change,
             inputs=[
                 explore_model,
                 explore_dataset,
                 min_score,
@@ -718,93 +739,72 @@ def create_exploration_tab(df):
                 metrics_display,
                 tool_info_display,
                 index_display,
             ],
         )
-    # Connect navigation buttons with necessary filter parameters
-    prev_btn.click(
-        navigate_prev,
-        inputs=[
-            current_index,
-            explore_model,
-            explore_dataset,
-            min_score,
-            max_score,
-            n_turns_filter,
-            len_query_filter,
-            n_tools_filter,
-        ],
-        outputs=[
-            chat_display,
-            metrics_display,
-            tool_info_display,
-            index_display,
-            current_index,
-        ],
-    )
-    next_btn.click(
-        navigate_next,
-        inputs=[
-            current_index,
-            explore_model,
-            explore_dataset,
-            min_score,
-            max_score,
-            n_turns_filter,
-            len_query_filter,
-            n_tools_filter,
-        ],
-        outputs=[
-            chat_display,
-            metrics_display,
-            tool_info_display,
-            index_display,
-            current_index,
-        ],
-    )
-    def update_slider_ranges(model, dataset):
-        df_chat = get_chat_and_score_df(model, dataset)
-        # Make sure columns are numeric first
-        df_chat["n_turns"] = pd.to_numeric(df_chat["n_turns"], errors="coerce").fillna(
-            0
-        )
-        df_chat["len_query"] = pd.to_numeric(
-            df_chat["len_query"], errors="coerce"
-        ).fillna(0)
-        df_chat["n_tools"] = pd.to_numeric(df_chat["n_tools"], errors="coerce").fillna(
-            0
         )
-        # Calculate maximums with safety buffers
-        n_turns_max = max(1, int(df_chat["n_turns"].max()))
-        len_query_max = max(10, int(df_chat["len_query"].max()))
-        n_tools_max = max(1, int(df_chat["n_tools"].max()))
-        # Return updated sliders using gr.update()
-        return (
-            gr.update(maximum=n_turns_max, value=0),
-            gr.update(maximum=len_query_max, value=0),
-            gr.update(maximum=n_tools_max, value=0),
         )
-    # Connect model and dataset changes to slider range updates
-    explore_model.change(
-        update_slider_ranges,
-        inputs=[explore_model, explore_dataset],
-        outputs=[n_turns_filter, len_query_filter, n_tools_filter],
-    )
-    explore_dataset.change(
-        update_slider_ranges,
-        inputs=[explore_model, explore_dataset],
-        outputs=[n_turns_filter, len_query_filter, n_tools_filter],
-    )
-    return [
-        chat_display,
-        metrics_display,
-        tool_info_display,
-        index_display,
-    ]

     """Create an enhanced data exploration tab with better UI and functionality."""
     # Main UI setup
+    with gr.Tab("Data Exploration"):
+        # CSS styling (unchanged)
+        gr.HTML(
+            """
+        <style>
+            /* Custom styling for the exploration tab */
+            :root[data-theme="light"] {
+                --surface-color: #f8f9fa;
+                --surface-color-alt: #ffffff;
+                --text-color: #202124;
+                --text-muted: #666666;
+                --primary-text: #1a73e8;
+                --primary-text-light: rgba(26, 115, 232, 0.3);
+                --border-color: #e9ecef;
+                --border-color-light: #f1f3f5;
+                --shadow-color: rgba(0,0,0,0.05);
+                --message-bg-user: #E5F6FD;
+                --message-bg-assistant: #F7F7F8;
+                --message-bg-system: #FFF3E0;
+                --response-bg: #F0F7FF;
+                --score-high: #1a73e8;
+                --score-med: #f4b400;
+                --score-low: #ea4335;
             }
+            :root[data-theme="dark"] {
+                --surface-color: #1e1e1e;
+                --surface-color-alt: #2d2d2d;
+                --text-color: #ffffff;
+                --text-muted: #a0a0a0;
+                --primary-text: #60a5fa;
+                --primary-text-light: rgba(96, 165, 250, 0.3);
+                --border-color: #404040;
+                --border-color-light: #333333;
+                --shadow-color: rgba(0,0,0,0.2);
+                --message-bg-user: #2d3748;
+                --message-bg-assistant: #1a1a1a;
+                --message-bg-system: #2c2516;
+                --response-bg: #1e2a3a;
+                --score-high: #60a5fa;
+                --score-med: #fbbf24;
+                --score-low: #ef4444;
+            }
+            #exploration-header {
+                margin-bottom: 1.5rem;
+                padding-bottom: 1rem;
+                border-bottom: 1px solid var(--border-color);
+            }
+            .filter-container {
+                background-color: var(--surface-color);
+                border-radius: 10px;
+                padding: 1rem;
+                margin-bottom: 1.5rem;
+                border: 1px solid var(--border-color);
+                box-shadow: 0 2px 6px var(--shadow-color);
+            }
+            .navigation-buttons button {
+                min-width: 120px;
+                font-weight: 500;
+            }
+            .content-panel {
+                margin-top: 1.5rem;
+            }
+            @media (max-width: 768px) {
+                .filter-row {
+                    flex-direction: column;
+                }
+            }
+        </style>
+        """
+        )
+        # Header
+        with gr.Row(elem_id="exploration-header"):
+            gr.HTML(HEADER_CONTENT)
+        # Filters section
+        with gr.Column(elem_classes="filter-container"):
+            gr.Markdown("### 🔍 Filter Options")
+            with gr.Row(equal_height=True, elem_classes="filter-row"):
+                explore_model = gr.Dropdown(
+                    choices=MODELS,
+                    value=MODELS[0],
+                    label="Model",
+                    container=True,
+                    scale=1,
+                    info="Select AI model",
+                )
+                explore_dataset = gr.Dropdown(
+                    choices=DATASETS,
+                    value=DATASETS[0],
+                    label="Dataset",
+                    container=True,
+                    scale=1,
+                    info="Select evaluation dataset",
+                )
+            with gr.Row(equal_height=True, elem_classes="filter-row"):
+                min_score = gr.Slider(
+                    minimum=float(min(SCORES)),
+                    maximum=float(max(SCORES)),
+                    value=float(min(SCORES)),
+                    step=0.1,
+                    label="Minimum TSQ Score",
+                    container=True,
+                    scale=1,
+                    info="Filter responses with scores above this threshold",
+                )
+                max_score = gr.Slider(
+                    minimum=float(min(SCORES)),
+                    maximum=float(max(SCORES)),
+                    value=float(max(SCORES)),
+                    step=0.1,
+                    label="Maximum TSQ Score",
+                    container=True,
+                    scale=1,
+                    info="Filter responses with scores below this threshold",
+                )
+            # Get the data for initial ranges
+            df_chat = get_chat_and_score_df(explore_model.value, explore_dataset.value)
+            # Ensure columns exist and get ranges
+            n_turns_max = int(df_chat["n_turns"].max())
+            len_query_max = int(df_chat["len_query"].max())
+            n_tools_max = int(df_chat["n_tools"].max())
+            with gr.Row(equal_height=True, elem_classes="filter-row"):
+                n_turns_filter = gr.Slider(
+                    minimum=0,
+                    maximum=n_turns_max,
+                    value=0,
+                    step=1,
+                    label="Minimum Turn Count",
+                    container=True,
+                    scale=1,
+                    info="Filter by minimum number of conversation turns",
+                )
+                len_query_filter = gr.Slider(
+                    minimum=0,
+                    maximum=len_query_max,
+                    value=0,
+                    step=10,
+                    label="Minimum Query Length",
+                    container=True,
+                    scale=1,
+                    info="Filter by minimum length of query in characters",
+                )
+                n_tools_filter = gr.Slider(
+                    minimum=0,
+                    maximum=n_tools_max,
+                    value=0,
+                    step=1,
+                    label="Minimum Tool Count",
+                    container=True,
+                    scale=1,
+                    info="Filter by minimum number of tools used",
+                )
+            with gr.Row():
+                reset_btn = gr.Button("Reset Filters", size="sm", variant="secondary")
+        # Navigation row
+        with gr.Row(variant="panel"):
+            with gr.Column(scale=1):
+                prev_btn = gr.Button(
+                    "← Previous",
+                    size="lg",
+                    variant="secondary",
+                    elem_classes="navigation-buttons",
+                )
+            with gr.Column(scale=1, min_width=100):
+                # Get initial count from default data
+                df_initial = get_chat_and_score_df(MODELS[0], DATASETS[0])
+                initial_count = len(df_initial)
+                index_display = gr.HTML(
+                    value=f"""<div style="
+                        display: flex;
+                        align-items: center;
+                        justify-content: center;
+                        font-weight: 500;
+                        color: var(--primary-text);
+                        background-color: var(--surface-color-alt);
+                        padding: 0.5rem 1rem;
+                        border-radius: 20px;
+                        font-size: 0.9rem;
+                        width: fit-content;
+                        margin: 0 auto;">
+                        <span style="margin-right: 0.5rem;">📄</span>1/{initial_count}
+                    </div>""",
+                    elem_id="index-display",
+                )
+            with gr.Column(scale=1):
+                next_btn = gr.Button(
+                    "Next →",
+                    size="lg",
+                    variant="secondary",
+                    elem_classes="navigation-buttons",
+                )
+        # Content areas
+        with gr.Row(equal_height=True):
+            with gr.Column(scale=1):
+                chat_display = gr.HTML()
+            with gr.Column(scale=1):
+                metrics_display = gr.HTML()
         with gr.Row():
+            tool_info_display = gr.HTML()
+        # State for tracking current index (simple integer state)
+        current_index = gr.State(value=0)
+        def reset_index():
+            """Reset the current index to 0"""
+            return 0
+        # Add these explicit event handlers for model and dataset changes
+        explore_model.change(
+            reset_index,
+            inputs=[],
+            outputs=[current_index],
+        )
+        explore_dataset.change(
+            reset_index,
+            inputs=[],
+            outputs=[current_index],
+        )
+        min_score.change(
+            reset_index,
+            inputs=[],
+            outputs=[current_index],
+        )
+        max_score.change(
+            reset_index,
+            inputs=[],
+            outputs=[current_index],
+        )
+        n_turns_filter.change(
+            reset_index,
+            inputs=[],
+            outputs=[current_index],
+        )
+        len_query_filter.change(
+            reset_index,
+            inputs=[],
+            outputs=[current_index],
+        )
+        n_tools_filter.change(
+            reset_index,
+            inputs=[],
+            outputs=[current_index],
+        )
+        # Reset filters
+        def reset_filters():
+            return (
+                MODELS[0],
+                DATASETS[0],
+                float(min(SCORES)),
+                float(max(SCORES)),
+                0,  # n_turns
+                0,  # len_query
+                0,  # n_tools
             )
+        reset_btn.click(
+            reset_filters,
+            outputs=[
+                explore_model,
+                explore_dataset,
+                min_score,
+                max_score,
+                n_turns_filter,
+                len_query_filter,
+                n_tools_filter,
+            ],
         )
+        # Connect filter changes
+        # Replace the existing filter connections with this:
+        for control in [
             explore_model,
             explore_dataset,
             min_score,
             n_turns_filter,
             len_query_filter,
             n_tools_filter,
+        ]:
+            control.change(
+                on_filter_change,
+                inputs=[
+                    explore_model,
+                    explore_dataset,
+                    min_score,
+                    max_score,
+                    n_turns_filter,
+                    len_query_filter,
+                    n_tools_filter,
+                ],
+                outputs=[
+                    chat_display,
+                    metrics_display,
+                    tool_info_display,
+                    index_display,
+                ],
+            )
+        # Connect navigation buttons with necessary filter parameters
+        prev_btn.click(
+            navigate_prev,
             inputs=[
+                current_index,
                 explore_model,
                 explore_dataset,
                 min_score,
                 metrics_display,
                 tool_info_display,
                 index_display,
+                current_index,
             ],
         )
+        next_btn.click(
+            navigate_next,
+            inputs=[
+                current_index,
+                explore_model,
+                explore_dataset,
+                min_score,
+                max_score,
+                n_turns_filter,
+                len_query_filter,
+                n_tools_filter,
+            ],
+            outputs=[
+                chat_display,
+                metrics_display,
+                tool_info_display,
+                index_display,
+                current_index,
+            ],
         )
+        def update_slider_ranges(model, dataset):
+            df_chat = get_chat_and_score_df(model, dataset)
+            # Make sure columns are numeric first
+            df_chat["n_turns"] = pd.to_numeric(
+                df_chat["n_turns"], errors="coerce"
+            ).fillna(0)
+            df_chat["len_query"] = pd.to_numeric(
+                df_chat["len_query"], errors="coerce"
+            ).fillna(0)
+            df_chat["n_tools"] = pd.to_numeric(
+                df_chat["n_tools"], errors="coerce"
+            ).fillna(0)
+            # Calculate maximums with safety buffers
+            n_turns_max = max(1, int(df_chat["n_turns"].max()))
+            len_query_max = max(10, int(df_chat["len_query"].max()))
+            n_tools_max = max(1, int(df_chat["n_tools"].max()))
+            # Return updated sliders using gr.update()
+            return (
+                gr.update(maximum=n_turns_max, value=0),
+                gr.update(maximum=len_query_max, value=0),
+                gr.update(maximum=n_tools_max, value=0),
+            )
+        # Connect model and dataset changes to slider range updates
+        explore_model.change(
+            update_slider_ranges,
+            inputs=[explore_model, explore_dataset],
+            outputs=[n_turns_filter, len_query_filter, n_tools_filter],
+        )
+        explore_dataset.change(
+            update_slider_ranges,
+            inputs=[explore_model, explore_dataset],
+            outputs=[n_turns_filter, len_query_filter, n_tools_filter],
         )
+        return [
+            chat_display,
+            metrics_display,
+            tool_info_display,
+            index_display,
+        ]

tabs/leaderboard.py CHANGED Viewed

@@ -1,329 +1,16 @@
 import gradio as gr
 from data_loader import CATEGORIES, DESCRIPTION_HTML, CARDS
 from utils import (
     get_rank_badge,
     get_score_bar,
     get_type_badge,
 )
-from utils import get_chart_colors
-import matplotlib
-import matplotlib.pyplot as plt
-import numpy as np
-import plotly.graph_objects as go
-from plotly.subplots import make_subplots
-import plotly.express as px
-from matplotlib.colors import LinearSegmentedColormap
-def get_performance_chart(df, category_name="Overall"):
-    plt.close("all")
-    score_column = "Category Score"
-    # Sort in ascending order (lowest scores at top, highest at bottom) to match the screenshot
-    df_sorted = df.sort_values(score_column, ascending=True)
-    # Create a Plotly figure
-    fig = go.Figure()
-    # Define colors for model types - these match the image exactly
-    color_map = {
-        "Private": "#4a9bf7",  # Blue for closed source
-        "Open source": "#b56ad7",  # Purple for open source
-    }
-    # Add horizontal bars
-    for i, row in df_sorted.iterrows():
-        model_type = row["Model Type"]
-        fig.add_trace(
-            go.Bar(
-                x=[row[score_column]],
-                y=[row["Model"] + "  "],
-                orientation="h",
-                marker=dict(
-                    color=color_map[model_type],
-                    line=dict(width=0),
-                ),
-                text=f"{row[score_column]:.3f}",
-                textposition="outside",
-                textfont=dict(
-                    size=16, color="white", family="Arial, sans-serif"
-                ),  # Improved visibility
-                hoverinfo="text",
-                hovertext=f"{row['Model']}: {row[score_column]:.3f}",
-                showlegend=False,
-                width=0.65,  # Make bars thinner for cleaner appearance
-            )
-        )
-    # Create a custom legend
-    for model_type, color in color_map.items():
-        display_name = "Closed source" if model_type == "Private" else model_type
-        fig.add_trace(
-            go.Bar(
-                x=[None],
-                y=[None],
-                orientation="h",
-                marker=dict(color=color),
-                showlegend=True,
-                name=display_name,
-            )
-        )
-    # Theme colors - will be set by CSS
-    plot_bg = "rgb(25, 28, 38)"  # Default dark theme
-    paper_bg = "rgb(25, 28, 38)"
-    text_color = "white"
-    grid_color = "rgba(150, 150, 150, 0.2)"
-    legend_bg = "rgba(25, 28, 38, 0.7)"
-    # Calculate a generous height based on the number of items
-    # Use a minimum height and a larger per-item height factor
-    min_height = 600
-    height_per_item = 50  # Increased spacing between bars
-    chart_height = max(min_height, len(df_sorted) * height_per_item)
-    fig.update_layout(
-        title=dict(
-            text=f"Ranking - {category_name}",
-            font=dict(size=28, color=text_color),
-            x=0.5,
-            y=0.98,
-            xanchor="center",
-        ),
-        xaxis=dict(
-            title=dict(
-                text="Average Score (Tool Selection Quality)",
-                font=dict(size=16, color=text_color),
-            ),
-            range=[0, 1.05],
-            gridcolor=grid_color,
-            gridwidth=1,
-            tickfont=dict(size=16, color=text_color),
-            zeroline=False,
-            tickformat=".1f",
-            showgrid=True,
-            dtick=0.2,  # Set tick spacing to match image
-        ),
-        yaxis=dict(
-            tickfont=dict(size=16, color=text_color),
-            automargin=True,
-        ),
-        margin=dict(l=30, r=50, t=100, b=80),
-        height=chart_height,
-        autosize=True,  # Enable autosize for responsiveness
-        bargap=0.15,
-        bargroupgap=0.1,
-        barmode="group",
-        legend=dict(
-            title=dict(text="Model Type", font=dict(size=18, color=text_color)),
-            font=dict(size=16, color=text_color),
-            x=0.4,
-            y=-0.15,
-            xanchor="center",
-            yanchor="top",
-            orientation="h",
-            bgcolor=legend_bg,
-        ),
-        plot_bgcolor=plot_bg,
-        paper_bgcolor=paper_bg,
-        font=dict(color=text_color),
-    )
-    # Add grid lines that match the image
-    for x in [0.2, 0.4, 0.6, 0.8]:
-        fig.add_shape(
-            type="line",
-            x0=x,
-            y0=0,
-            x1=x,
-            y1=1,
-            yref="paper",
-            line=dict(color=grid_color, width=1),
-        )
-    return fig
-def get_performance_cost_chart(df, category_name="Overall"):
-    plt.close("all")
-    score_column = "Category Score"
-    # Create a Plotly figure
-    fig = go.Figure()
-    # Define colors for model types
-    color_map = {
-        "Private": "#4a9bf7",  # Blue for closed source
-        "Open source": "#b56ad7",  # Purple for open source
-    }
-    # Dark theme colors
-    plot_bg = "rgb(25, 28, 38)"
-    paper_bg = "rgb(25, 28, 38)"
-    text_color = "white"
-    grid_color = "rgba(150, 150, 150, 0.2)"
-    legend_bg = "rgba(25, 28, 38, 0.7)"
-    # Add scatter points for each model
-    for _, row in df.iterrows():
-        model_type = row["Model Type"]
-        # Add model point
-        fig.add_trace(
-            go.Scatter(
-                x=[row["IO Cost"]],
-                y=[row[score_column] * 100],  # Convert to percentage scale
-                mode="markers",
-                marker=dict(
-                    color=color_map[model_type],
-                    size=15,
-                    line=dict(width=1, color="white"),
-                    opacity=0.9,
-                ),
-                name=row["Model"],
-                text=f"{row['Model']}<br>${row['IO Cost']:.2f}<br>{row[score_column]:.3f}",
-                hoverinfo="text",
-                showlegend=False,
-            )
-        )
-        # Add model label
-        fig.add_trace(
-            go.Scatter(
-                x=[row["IO Cost"]],
-                y=[row[score_column] * 100 + 0.8],
-                mode="text",
-                text=row["Model"],  # + f" (${row['IO Cost']:.2f})",
-                textposition="top center",
-                textfont=dict(color=text_color, size=10),
-                hoverinfo="none",
-                showlegend=False,
-            )
-        )
-    # Create a custom legend
-    for model_type, color in color_map.items():
-        display_name = "Closed source" if model_type == "Private" else model_type
-        fig.add_trace(
-            go.Scatter(
-                x=[None],
-                y=[None],
-                mode="markers",
-                marker=dict(color=color, size=10, line=dict(width=1, color="white")),
-                name=display_name,
-            )
-        )
-    # Add performance bands
-    performance_bands = [
-        {
-            "range": [85, 100],
-            "color": "rgba(52, 211, 153, 0.2)",
-            "label": "Reliable Zone",
-        },
-        {"range": [75, 85], "color": "rgba(251, 191, 36, 0.2)", "label": "Good Zone"},
-        {"range": [60, 75], "color": "rgba(239, 68, 68, 0.2)", "label": "Risky Zone"},
-    ]
-    for band in performance_bands:
-        fig.add_trace(
-            go.Scatter(
-                x=[0.05, 100],
-                y=[band["range"][0], band["range"][0]],
-                mode="lines",
-                line=dict(color="rgba(255, 255, 255, 0.3)", width=1, dash="dash"),
-                showlegend=False,
-            )
-        )
-        fig.add_shape(
-            type="rect",
-            x0=0.08,
-            x1=1000,
-            y0=band["range"][0],
-            y1=band["range"][1],
-            fillcolor=band["color"],
-            line=dict(width=0),
-            layer="below",
-        )
-    # Update layout
-    fig.update_layout(
-        title=dict(
-            text=f"Performance vs. Cost - {category_name}",
-            font=dict(size=28, color=text_color),
-            x=0.5,
-            y=0.98,
-            xanchor="center",
-        ),
-        xaxis=dict(
-            title=dict(
-                text="I/O Cost per Million Tokens ($)",
-                font=dict(size=14, color=text_color),
-            ),
-            type="log",
-            range=[-1.2, 2.1],  # log10 scale from 0.08 to 100
-            gridcolor=grid_color,
-            gridwidth=1,
-            tickfont=dict(size=12, color=text_color),
-            zeroline=False,
-            showgrid=True,
-        ),
-        yaxis=dict(
-            title=dict(
-                text="Average Score (Tool Selection Quality)",
-                font=dict(size=14, color=text_color),
-            ),
-            range=[60, 100],
-            gridcolor=grid_color,
-            gridwidth=1,
-            tickfont=dict(size=12, color=text_color),
-            zeroline=False,
-            showgrid=True,
-        ),
-        margin=dict(l=20, r=20, t=80, b=80),  # Increased bottom margin for legend
-        autosize=True,
-        height=900,  # Increased height
-        # width=1600,
-        legend=dict(
-            title=dict(text="Model Type", font=dict(size=14, color=text_color)),
-            font=dict(size=12, color=text_color),
-            x=0.5,
-            y=-0.15,
-            xanchor="center",
-            yanchor="top",
-            orientation="h",
-            bgcolor=legend_bg,
-        ),
-        plot_bgcolor=plot_bg,
-        paper_bgcolor=paper_bg,
-        font=dict(color=text_color),
-        hovermode="closest",
-    )
-    # Add annotations for performance bands
-    for i, band in enumerate(performance_bands):
-        fig.add_annotation(
-            x=1.5,
-            y=(band["range"][0] + band["range"][1]) / 2 + 3,
-            text=band["label"],
-            showarrow=False,
-            font=dict(size=15, color=text_color),
-            xanchor="left",
-            yanchor="middle",
-            xshift=5,
-        )
-    # Keep only dark theme - remove theme detection and switching
-    fig.update_layout(
-        autosize=True,
-    )
-    return fig
 def filter_leaderboard(df, model_type, category, sort_by):
     filtered_df = df.copy()
     if model_type != "All":
@@ -338,14 +25,9 @@ def filter_leaderboard(df, model_type, category, sort_by):
         filtered_df = filtered_df.sort_values(by="IO Cost", ascending=True)
     filtered_df["Rank"] = range(1, len(filtered_df) + 1)
-    # Get charts
     perf_chart = get_performance_chart(filtered_df, category)
     cost_chart = get_performance_cost_chart(filtered_df, category)
-    # Don't override the chart settings here - this was causing conflicts
-    # The responsiveness is now handled in the chart creation functions
     # Generate styled table HTML
     table_html = f"""
     <style>
@@ -470,240 +152,55 @@ def filter_leaderboard(df, model_type, category, sort_by):
             </tr>
         """
-    table_html += """
-            </tbody>
-        </table>
-    </div>
-    """
     return table_html, perf_chart, cost_chart
 def create_leaderboard_tab(df, CATEGORIES, METHODOLOGY, HEADER_CONTENT, CARDS):
-    chart_container_css = """
-    <style>
-        /* Chart container styling */
-        .chart-container {
-            display: flex;
-            justify-content: center;
-            align-items: center;
-            width: 100%;
-            margin: 20px 0;
-            position: relative;
-            /* Don't fix the height in CSS */
-        }
-        /* Plotly responsive container - use relative width */
-        .js-plotly-plot, .plot-container, .plotly {
-            width: 100% !important;
-            max-width: 1200px !important;
-            margin: 0 auto !important;
-        }
-        /* SVG container - make it fully responsive */
-        .js-plotly-plot .svg-container {
-            width: 100% !important;
-        }
-        /* Dark mode styles */
-        .dark-theme .chart-title {
-            color: white;
-            text-align: center;
-            font-size: 24px;
-            margin-top: 40px;
-            margin-bottom: 15px;
-        }
-        /* Ensure chart text is visible */
-        .js-plotly-plot text {
-            font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif !important;
-            font-size: 14px !important;
-        }
-        /* Responsive adjustments */
-        @media (max-width: 768px) {
-            .js-plotly-plot text {
-                font-size: 12px !important;
-            }
-        }
-        /* Apply font styling to non-title text elements */
-        .js-plotly-plot text:not(.gtitle) {
-            font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif !important;
-            font-size: 14px !important;
-        }
-        /* Specific styling for chart titles */
-        .js-plotly-plot .gtitle {
-            font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif !important;
-            font-size: 28px !important;
-        }
-    </style>
-    """
-    # Start content directly
-    gr.HTML(HEADER_CONTENT + CARDS)
-    gr.HTML(DESCRIPTION_HTML)
-    # Add our custom CSS
-    gr.HTML(chart_container_css)
-    # Filters row
-    with gr.Row(equal_height=True):
-        with gr.Column(scale=1):
-            model_type = gr.Dropdown(
-                choices=["All"] + df["Model Type"].unique().tolist(),
-                value="All",
-                label="Model Type",
-            )
-        with gr.Column(scale=1):
-            category = gr.Dropdown(
-                choices=list(CATEGORIES.keys()),
-                value=list(CATEGORIES.keys())[0],
-                label="Category",
-            )
-        with gr.Column(scale=1):
-            sort_by = gr.Radio(
-                choices=["Performance", "Cost"],
-                value="Performance",
-                label="Sort by",
             )
-    # Content
-    output = gr.HTML()
-    # Performance chart - don't specify height in HTML
-    with gr.Row():
-        with gr.Column():
-            gr.HTML('<div class="chart-container">')
-            plot1 = gr.Plot(elem_id="plot1")
-            gr.HTML("</div>")
-    # Cost performance chart - don't specify height in HTML
-    with gr.Row():
-        with gr.Column():
-            gr.HTML('<div class="chart-container">')
-            plot2 = gr.Plot(elem_id="plot2")
-            gr.HTML("</div>")
-    gr.HTML(
-        """<div class="note-box">
-            <p style="margin: 0; font-size: 1em;">
-                Note: API pricing for sorting by cost uses a 3-to-1 input/output ratio calculation.
-            </p>
-        </div>"""
-    )
-    gr.HTML(METHODOLOGY)
-    # Enhanced resize script - improved to be more responsive
-    resize_js = """
-    <script>
-    // Improved function to handle responsive Plotly charts
-    function resizePlots() {
-        // Find all plot containers
-        const plotContainers = document.querySelectorAll('.js-plotly-plot');
-        if (!plotContainers.length) {
-            // If containers aren't ready yet, retry shortly
-            setTimeout(resizePlots, 100);
-            return;
-        }
-        // Get the available width for the container
-        const containerWidth = document.querySelector('.chart-container').offsetWidth;
-        plotContainers.forEach(container => {
-            // Calculate appropriate dimensions based on container width
-            let containerHeight;
-            // Different height calculation based on chart type
-            if (container.id.includes('plot1')) {
-                // Performance chart - use sizing from reference code
-                const barCount = container.querySelectorAll('.bars .point').length || 20; // Default if can't detect
-                // Convert from matplotlib sizing approach: height = max(8, len(df_sorted) * 0.8) in inches * pixels per inch
-                const heightInInches = Math.max(8, barCount * 0.8);
-                containerHeight = heightInInches * 80; // Convert inches to pixels (approx)
-            } else {
-                // Cost chart - use fixed size from reference code (12x8 inches)
-                containerHeight = 640; // 8 inches * 80 pixels per inch
-                // Keep width proportional to container up to max width
-                const maxWidth = 960; // 12 inches * 80 pixels per inch
-                container.style.maxWidth = maxWidth + 'px';
-            }
-            // Apply dimensions
-            container.style.width = '100%';
-            container.style.height = containerHeight + 'px';
-            // Find and resize the SVG elements
-            const svgElements = container.querySelectorAll('svg');
-            svgElements.forEach(svg => {
-                svg.style.width = '100%';
-                svg.style.height = containerHeight + 'px';
-            });
-            // Find the main SVG container and resize it
-            const svgContainer = container.querySelector('.svg-container');
-            if (svgContainer) {
-                svgContainer.style.width = '100%';
-                svgContainer.style.height = containerHeight + 'px';
-            }
-        });
-        // Trigger window resize to make Plotly redraw
-        window.dispatchEvent(new Event('resize'));
-    }
-    // Functions to run when content changes or window resizes
-    function setupResizeHandlers() {
-        // Initial resize
-        resizePlots();
-        // Handle window resize
-        window.addEventListener('resize', function() {
-            resizePlots();
-        });
-        // Set up a mutation observer to detect when plots are added/changed
-        const observer = new MutationObserver(function(mutations) {
-            mutations.forEach(function(mutation) {
-                if (mutation.addedNodes.length ||
-                    mutation.type === 'attributes' &&
-                    mutation.target.classList.contains('js-plotly-plot')) {
-                    resizePlots();
-                }
-            });
-        });
-        // Observe the entire document for changes
-        observer.observe(document.body, {
-            childList: true,
-            subtree: true,
-            attributes: true,
-            attributeFilter: ['style', 'class']
-        });
-    }
-    // Run when DOM is fully loaded
-    if (document.readyState === 'loading') {
-        document.addEventListener('DOMContentLoaded', setupResizeHandlers);
-    } else {
-        setupResizeHandlers();
-    }
-    // Also resize periodically for a bit after initial load to ensure everything renders properly
-    for (let i = 1; i <= 10; i++) {
-        setTimeout(resizePlots, i * 500);
-    }
-    </script>
-    """
-    gr.HTML(resize_js)
-    for input_comp in [model_type, category, sort_by]:
-        input_comp.change(
-            fn=lambda m, c, s: filter_leaderboard(df, m, c, s),
-            inputs=[model_type, category, sort_by],
-            outputs=[output, plot1, plot2],
-        )
-    return output, plot1, plot2

 import gradio as gr
 from data_loader import CATEGORIES, DESCRIPTION_HTML, CARDS
+from visualization import (
+    get_performance_chart,
+    get_performance_cost_chart,
+)
 from utils import (
     get_rank_badge,
     get_score_bar,
     get_type_badge,
 )
 def filter_leaderboard(df, model_type, category, sort_by):
     filtered_df = df.copy()
     if model_type != "All":
         filtered_df = filtered_df.sort_values(by="IO Cost", ascending=True)
     filtered_df["Rank"] = range(1, len(filtered_df) + 1)
     perf_chart = get_performance_chart(filtered_df, category)
     cost_chart = get_performance_cost_chart(filtered_df, category)
     # Generate styled table HTML
     table_html = f"""
     <style>
             </tr>
         """
     return table_html, perf_chart, cost_chart
 def create_leaderboard_tab(df, CATEGORIES, METHODOLOGY, HEADER_CONTENT, CARDS):
+    """Lay out the Leaderboard tab and wire its filters to the table and plots.
+
+    Args:
+        df: Leaderboard frame; its "Model Type" column feeds the type filter.
+        CATEGORIES: Mapping whose keys are offered in the category dropdown.
+        METHODOLOGY: HTML rendered at the bottom of the tab.
+        HEADER_CONTENT: HTML rendered at the top of the tab.
+        CARDS: HTML appended to the header block.
+
+    Returns:
+        The ``(output, plot1, plot2)`` components that receive the result of
+        ``filter_leaderboard``.
+    """
+
+    def refresh(selected_type, selected_category, selected_sort):
+        # Re-run the filter with the current value of all three controls.
+        return filter_leaderboard(
+            df, selected_type, selected_category, selected_sort
+        )
+
+    with gr.Tab("Leaderboard"):
+        gr.HTML(HEADER_CONTENT + CARDS)
+        gr.HTML(DESCRIPTION_HTML)
+        # Three equally weighted filter controls side by side
+        with gr.Row(equal_height=True):
+            with gr.Column(scale=1):
+                type_choices = ["All"] + df["Model Type"].unique().tolist()
+                model_type = gr.Dropdown(
+                    choices=type_choices, value="All", label="Model Type"
+                )
+            with gr.Column(scale=1):
+                category_names = list(CATEGORIES.keys())
+                category = gr.Dropdown(
+                    choices=category_names,
+                    value=category_names[0],
+                    label="Category",
+                )
+            with gr.Column(scale=1):
+                sort_by = gr.Radio(
+                    choices=["Performance", "Cost"],
+                    value="Performance",
+                    label="Sort by",
+                )
+        # Result area: styled table followed by the two charts
+        output = gr.HTML()
+        plot1 = gr.Plot()
+        plot2 = gr.Plot()
+        gr.HTML(
+            """<div class="note-box">
+                <p style="margin: 0; font-size: 1em;">
+                    Note: API pricing for sorting by cost uses a 3-to-1 input/output ratio calculation.
+                </p>
+            </div>"""
+        )
+        gr.HTML(METHODOLOGY)
+        # Any filter change refreshes all three outputs.
+        filter_controls = [model_type, category, sort_by]
+        for control in filter_controls:
+            control.change(
+                fn=refresh,
+                inputs=filter_controls,
+                outputs=[output, plot1, plot2],
             )
+        return output, plot1, plot2

tabs/model_comparison.py CHANGED Viewed

@@ -1,96 +1,5 @@
 import gradio as gr
-from utils import get_chart_colors
-import matplotlib
-import matplotlib.pyplot as plt
-import numpy as np
-import plotly.graph_objects as go
-from plotly.subplots import make_subplots
-import plotly.express as px
-from matplotlib.colors import LinearSegmentedColormap
-def create_radar_plot(df, model_names):
-    datasets = [col for col in df.columns[7:] if col != "IO Cost"]
-    fig = go.Figure()
-    # Dark theme colors - match other charts
-    plot_bg = "rgb(25, 28, 38)"
-    paper_bg = "rgb(25, 28, 38)"
-    text_color = "white"
-    grid_color = "rgba(150, 150, 150, 0.2)"
-    legend_bg = "rgba(25, 28, 38, 0.7)"
-    # Update colors for dark theme - more vibrant with better contrast
-    colors = [
-        "rgba(74, 155, 247, 0.3)",
-        "rgba(181, 106, 215, 0.3)",
-    ]  # Match color_map from other charts
-    line_colors = ["#4a9bf7", "#b56ad7"]  # Match color_map from other charts
-    for idx, model_name in enumerate(model_names):
-        model_data = df[df["Model"] == model_name].iloc[0]
-        values = [model_data[m] for m in datasets]
-        values.append(values[0])
-        datasets_plot = datasets + [datasets[0]]
-        fig.add_trace(
-            go.Scatterpolar(
-                r=values,
-                theta=datasets_plot,
-                fill="toself",
-                fillcolor=colors[idx % len(colors)],
-                line=dict(color=line_colors[idx % len(line_colors)], width=2),
-                name=model_name,
-                text=[f"{val:.3f}" for val in values],
-                textposition="middle right",
-                mode="lines+markers+text",
-                textfont=dict(color=text_color),  # Set text color to match theme
-            )
-        )
-    # Create a more balanced layout optimized for Gradio display
-    fig.update_layout(
-        polar=dict(
-            radialaxis=dict(
-                visible=True,
-                range=[0, 1],
-                showline=False,
-                tickfont=dict(size=12, color=text_color),
-                gridcolor=grid_color,
-            ),
-            angularaxis=dict(
-                tickfont=dict(size=13, color=text_color),
-                rotation=90,
-                direction="clockwise",
-                gridcolor=grid_color,
-            ),
-            bgcolor=plot_bg,  # Set polar background color
-        ),
-        showlegend=True,
-        legend=dict(
-            orientation="h",
-            yanchor="bottom",
-            y=-0.15,
-            xanchor="center",
-            x=0.5,
-            font=dict(size=14, color=text_color),
-            bgcolor=legend_bg,
-        ),
-        title=dict(
-            text="Model Comparison",
-            x=0.5,
-            y=0.98,
-            font=dict(size=24, color=text_color),
-        ),
-        paper_bgcolor=paper_bg,
-        plot_bgcolor=plot_bg,
-        height=700,
-        width=1200,  # Make it perfectly square
-        margin=dict(l=0, r=0, t=80, b=80),  # Remove horizontal margins completely
-        font=dict(color=text_color),
-    )
-    return fig
 def compare_models(df, model_names=None):
@@ -139,29 +48,26 @@ def compare_models(df, model_names=None):
 def create_model_comparison_tab(df, HEADER_CONTENT):
-    # with gr.Tab("Model Comparison"):
-    gr.HTML(HEADER_CONTENT)
-    with gr.Column():
-        # Filters row
-        with gr.Row(equal_height=True):
-            model_selector = gr.Dropdown(
-                choices=df["Model"].unique().tolist(),
-                value=df.sort_values("Model Avg", ascending=False).iloc[0]["Model"],
-                multiselect=True,
-                label="Select Models to Compare",
-            )
-        model_info = gr.HTML()
-        with gr.Row():
-            with gr.Column(scale=1, min_width=800):
-                gr.HTML('<div class="full-width-plot-container" style="width:100%;">')
-                radar_plot = gr.Plot(elem_id="plot", container=False)
-                gr.HTML("</div>")
-    model_selector.change(
-        fn=lambda m: compare_models(df, m),
-        inputs=[model_selector],
-        outputs=[model_info, radar_plot],
-    )
-    return model_info, radar_plot

 import gradio as gr
+from visualization import create_radar_plot
 def compare_models(df, model_names=None):
 def create_model_comparison_tab(df, HEADER_CONTENT):
+    """Lay out the Model Comparison tab and wire the selector to its outputs.
+
+    Args:
+        df: Frame with "Model" and "Model Avg" columns; the model with the
+            highest "Model Avg" is preselected.
+        HEADER_CONTENT: HTML rendered at the top of the tab.
+
+    Returns:
+        The ``(model_info, radar_plot)`` components that receive the result
+        of ``compare_models``.
+    """
+
+    def refresh(selected_models):
+        # Re-run the comparison for the current selection.
+        return compare_models(df, selected_models)
+
+    with gr.Tab("Model Comparison"):
+        gr.HTML(HEADER_CONTENT)
+        with gr.Column():
+            # Selector row
+            with gr.Row(equal_height=True):
+                model_choices = df["Model"].unique().tolist()
+                ranked = df.sort_values("Model Avg", ascending=False)
+                model_selector = gr.Dropdown(
+                    choices=model_choices,
+                    value=ranked.iloc[0]["Model"],
+                    multiselect=True,
+                    label="Select Models to Compare",
+                )
+            # Result area
+            model_info = gr.HTML()
+            radar_plot = gr.Plot()
+        model_selector.change(
+            fn=refresh,
+            inputs=[model_selector],
+            outputs=[model_info, radar_plot],
+        )
+        return model_info, radar_plot

visualization.py ADDED Viewed

	@@ -0,0 +1,256 @@

+from utils import get_chart_colors
+import matplotlib
+import matplotlib.pyplot as plt
+import numpy as np
+import plotly.graph_objects as go
+def setup_matplotlib():
+    """Select Matplotlib's "Agg" backend and close every open pyplot figure."""
+    matplotlib.use("Agg")
+    plt.close("all")
+def get_performance_chart(df, category_name="Overall"):
+    """Draw a horizontal bar chart of each model's category score.
+
+    Rows are sorted by "Category Score" in ascending order and each bar is
+    coloured by the row's "Model Type", using the palette returned by
+    ``get_chart_colors()``.
+
+    Args:
+        df: Frame providing "Category Score", "Model" and "Model Type".
+        category_name: Label appended to the chart title.
+
+    Returns:
+        The Matplotlib figure. It has already been handed to ``plt.close``
+        by the time it is returned.
+    """
+    plt.close("all")
+    palette = get_chart_colors()
+    score_column = "Category Score"
+    ranked = df.sort_values(score_column, ascending=True)
+    row_count = len(ranked)
+    # Grow the figure with the number of rows, but never below 8 inches.
+    fig, ax = plt.subplots(figsize=(16, max(8, row_count * 0.8)))
+    plt.rcParams.update({"font.size": 12})
+    for surface in (fig.patch, ax):
+        surface.set_facecolor(palette["background"])
+    try:
+        y_positions = np.arange(row_count)
+        ax.barh(
+            y_positions,
+            ranked[score_column],
+            height=0.4,
+            capstyle="round",
+            color=[palette[kind] for kind in ranked["Model Type"]],
+        )
+        # Shared styling for the title, axis label, tick labels and values
+        bold_text = {"fontweight": "bold", "color": palette["text"]}
+        ax.set_title(
+            f"Model Performance - {category_name}",
+            pad=20,
+            fontsize=20,
+            **bold_text,
+        )
+        ax.set_xlabel(
+            "Average Score (Tool Selection Quality)",
+            fontsize=14,
+            labelpad=10,
+            **bold_text,
+        )
+        ax.set_xlim(0.0, 1.0)
+        ax.set_yticks(y_positions)
+        ax.set_yticklabels(ranked["Model"], fontsize=12, **bold_text)
+        plt.subplots_adjust(left=0.35)
+        # Print each score just past the end of its bar.
+        for row_index, score in enumerate(ranked[score_column]):
+            ax.text(
+                score + 0.01,
+                row_index,
+                f"{score:.3f}",
+                va="center",
+                fontsize=12,
+                **bold_text,
+            )
+        ax.grid(True, axis="x", linestyle="--", alpha=0.2, color=palette["grid"])
+        ax.spines[["top", "right"]].set_visible(False)
+        ax.spines[["bottom", "left"]].set_color(palette["grid"])
+        ax.tick_params(colors=palette["text"])
+        # One legend swatch per model type
+        swatches = [
+            plt.Rectangle((0, 0), 1, 1, facecolor=palette[kind], label=kind)
+            for kind in ("Private", "Open source")
+        ]
+        ax.legend(
+            handles=swatches,
+            title="Model Type",
+            loc="lower right",
+            fontsize=12,
+            title_fontsize=14,
+            facecolor=palette["background"],
+            labelcolor=palette["text"],
+        )
+        plt.tight_layout()
+    finally:
+        # Release the figure from pyplot whether or not drawing succeeded.
+        plt.close(fig)
+    return fig
+def create_radar_plot(df, model_names):
+    """Build a Plotly polar chart comparing the given models.
+
+    Every column of ``df`` from position 7 onward, except "IO Cost", becomes
+    one spoke. Each model contributes one filled trace with its values
+    printed next to the markers.
+
+    Args:
+        df: Frame with a "Model" column plus the spoke columns.
+        model_names: Models to plot; the first row matching each name is
+            used, and a name with no matching row raises ``IndexError``.
+
+    Returns:
+        The populated ``go.Figure``.
+    """
+    spokes = [name for name in df.columns[7:] if name != "IO Cost"]
+    figure = go.Figure()
+    # (fill, outline) pairs, reused cyclically when more models are plotted
+    palette = [
+        ("rgba(99, 102, 241, 0.3)", "#4F46E5"),
+        ("rgba(34, 197, 94, 0.3)", "#16A34A"),
+    ]
+    for position, model_name in enumerate(model_names):
+        fill_color, outline_color = palette[position % len(palette)]
+        row = df[df["Model"] == model_name].iloc[0]
+        scores = [row[spoke] for spoke in spokes]
+        # Repeat the first point at the end so the outline closes.
+        scores.append(scores[0])
+        closed_spokes = spokes + [spokes[0]]
+        trace = go.Scatterpolar(
+            r=scores,
+            theta=closed_spokes,
+            fill="toself",
+            fillcolor=fill_color,
+            line={"color": outline_color, "width": 2},
+            name=model_name,
+            text=[f"{score:.3f}" for score in scores],
+            textposition="middle right",
+            mode="lines+markers+text",
+        )
+        figure.add_trace(trace)
+    radial_axis = {
+        "visible": True,
+        "range": [0, 1],
+        "showline": False,
+        "tickfont": {"size": 12},
+    }
+    angular_axis = {
+        "tickfont": {"size": 13, "family": "Arial"},
+        "rotation": 90,
+        "direction": "clockwise",
+    }
+    legend_style = {
+        "orientation": "h",
+        "yanchor": "bottom",
+        "y": -0.2,
+        "xanchor": "center",
+        "x": 0.5,
+        "font": {"size": 14},
+    }
+    title_style = {
+        "text": "Model Comparison",
+        "x": 0.5,
+        "y": 0.95,
+        "font": {"size": 24, "family": "Arial", "color": "#1F2937"},
+    }
+    figure.update_layout(
+        polar={"radialaxis": radial_axis, "angularaxis": angular_axis},
+        showlegend=True,
+        legend=legend_style,
+        title=title_style,
+        paper_bgcolor="white",
+        plot_bgcolor="white",
+        height=700,
+        width=900,
+        margin={"t": 100, "b": 100, "l": 80, "r": 80},
+    )
+    return figure
+def get_performance_cost_chart(df, category_name="Overall"):
+    colors = get_chart_colors()
+    fig, ax = plt.subplots(figsize=(12, 8), dpi=300)
+    fig.patch.set_facecolor(colors["background"])
+    ax.set_facecolor(colors["background"])
+    ax.grid(True, linestyle="--", alpha=0.15, which="both", color=colors["grid"])
+    score_column = "Category Score"
+    for _, row in df.iterrows():
+        color = colors[row["Model Type"]]
+        size = 100 if row[score_column] > 0.85 else 80
+        edge_color = (
+            colors["Private"]
+            if row["Model Type"] == "Private"
+            else colors["Open source"]
+        )
+        ax.scatter(
+            row["IO Cost"],
+            row[score_column] * 100,
+            c=color,
+            s=size,
+            alpha=0.9,
+            edgecolor=edge_color,
+            linewidth=1,
+            zorder=5,
+        )
+        bbox_props = dict(
+            boxstyle="round,pad=0.3", fc=colors["background"], ec="none", alpha=0.8
+        )
+        ax.annotate(
+            f"{row['Model']}\n(${row['IO Cost']:.2f})",
+            (row["IO Cost"], row[score_column] * 100),
+            xytext=(5, 5),
+            textcoords="offset points",
+            fontsize=8,
+            fontweight="bold",
+            color=colors["text"],
+            bbox=bbox_props,
+            zorder=6,
+        )
+    ax.set_xscale("log")
+    ax.set_xlim(0.08, 1000)
+    ax.set_ylim(60, 100)
+    ax.set_xlabel(
+        "I/O Cost per Million Tokens ($)",
+        fontsize=10,
+        fontweight="bold",
+        labelpad=10,
+        color=colors["text"],
+    )
+    ax.set_ylabel(
+        "Model Performance Score",
+        fontsize=10,
+        fontweight="bold",
+        labelpad=10,
+        color=colors["text"],
+    )
+    legend_elements = [
+        plt.scatter([], [], c=colors[label], label=label, s=80)
+        for label in ["Private", "Open source"]
+    ]
+    ax.legend(
+        handles=legend_elements,
+        loc="upper right",
+        frameon=True,
+        facecolor=colors["background"],
+        edgecolor="none",
+        fontsize=9,
+        labelcolor=colors["text"],
+    )
+    ax.set_title(
+        f"Performance vs. Cost - {category_name}",
+        fontsize=14,
+        pad=15,
+        fontweight="bold",
+        color=colors["text"],
+    )
+    for y1, y2, color in zip([85, 75, 60], [100, 85, 75], colors["performance_bands"]):
+        ax.axhspan(y1, y2, alpha=0.2, color=color, zorder=1)
+    ax.tick_params(axis="both", which="major", labelsize=9, colors=colors["text"])
+    ax.tick_params(axis="both", which="minor", labelsize=8, colors=colors["text"])
+    ax.xaxis.set_minor_locator(plt.LogLocator(base=10.0, subs=np.arange(2, 10) * 0.1))
+    for spine in ax.spines.values():
+        spine.set_color(colors["grid"])
+    plt.tight_layout()
+    return fig