Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Pratik Bhavsar
committed on
Commit
·
91da2cc
1
Parent(s):
c8ff2be
improved dataset table
Browse files - data_loader.py +88 -31
data_loader.py
CHANGED
|
@@ -659,6 +659,64 @@ METHODOLOGY = """
|
|
| 659 |
width: 100%;
|
| 660 |
padding: 2rem 0;
|
| 661 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 662 |
|
| 663 |
[Rest of the CSS remains the same]
|
| 664 |
</style>
|
|
@@ -698,64 +756,63 @@ METHODOLOGY = """
|
|
| 698 |
<tbody>
|
| 699 |
<tr>
|
| 700 |
<td rowspan="4">Single-Turn</td>
|
| 701 |
-
<td>
|
| 702 |
-
<td>Single Function Call</td>
|
| 703 |
-
<td>xlam_single_tool_single_call</td>
|
| 704 |
-
<td>
|
| 705 |
</tr>
|
| 706 |
<tr>
|
| 707 |
-
<td>
|
| 708 |
-
<td>Multiple Function Call</td>
|
| 709 |
-
<td>xlam_multiple_tool_multiple_call</td>
|
| 710 |
-
<td>
|
| 711 |
</tr>
|
| 712 |
<tr>
|
| 713 |
<td>100</td>
|
| 714 |
-
<td>Irrelevant Query</td>
|
| 715 |
-
<td>BFCL_v3_irrelevance</td>
|
| 716 |
-
<td>
|
| 717 |
</tr>
|
| 718 |
<tr>
|
| 719 |
<td>100</td>
|
| 720 |
-
<td>Long Context</td>
|
| 721 |
-
<td>tau_long_context</td>
|
| 722 |
-
<td>
|
| 723 |
</tr>
|
| 724 |
<tr>
|
| 725 |
<td rowspan="5">Multi-Turn</td>
|
| 726 |
-
<td>
|
| 727 |
-
<td>Single Function Call</td>
|
| 728 |
-
<td>BFCL_v3_multi_turn_base_single_func_call</td>
|
| 729 |
-
<td>
|
| 730 |
</tr>
|
| 731 |
<tr>
|
| 732 |
<td>50</td>
|
| 733 |
-
<td>Multiple Function Call</td>
|
| 734 |
-
<td>BFCL_v3_multi_turn_base_multi_func_call</td>
|
| 735 |
-
<td>
|
| 736 |
</tr>
|
| 737 |
<tr>
|
| 738 |
<td>100</td>
|
| 739 |
-
<td>Missing Function</td>
|
| 740 |
-
<td>BFCL_v3_multi_turn_miss_func</td>
|
| 741 |
-
<td>
|
| 742 |
</tr>
|
| 743 |
<tr>
|
| 744 |
<td>100</td>
|
| 745 |
-
<td>Missing Parameters</td>
|
| 746 |
-
<td>BFCL_v3_multi_turn_miss_param</td>
|
| 747 |
-
<td>
|
| 748 |
</tr>
|
| 749 |
<tr>
|
| 750 |
<td>100</td>
|
| 751 |
-
<td>Composite</td>
|
| 752 |
-
<td>BFCL_v3_multi_turn_composite</td>
|
| 753 |
-
<td>
|
| 754 |
</tr>
|
| 755 |
</tbody>
|
| 756 |
</table>
|
| 757 |
</div>
|
| 758 |
-
</div>
|
| 759 |
|
| 760 |
<!-- Features Grid Section -->
|
| 761 |
<div class="features-grid">
|
|
|
|
| 659 |
width: 100%;
|
| 660 |
padding: 2rem 0;
|
| 661 |
}
|
| 662 |
+
|
| 663 |
+
.dataset-table {
|
| 664 |
+
width: 100%;
|
| 665 |
+
border-collapse: separate;
|
| 666 |
+
border-spacing: 0;
|
| 667 |
+
margin: 2rem 0;
|
| 668 |
+
background: var(--bg-tertiary);
|
| 669 |
+
border-radius: 1rem;
|
| 670 |
+
overflow: hidden;
|
| 671 |
+
box-shadow: 0 4px 20px var(--shadow-color);
|
| 672 |
+
}
|
| 673 |
+
|
| 674 |
+
.dataset-table thead {
|
| 675 |
+
background: linear-gradient(90deg, var(--accent-blue), var(--accent-purple));
|
| 676 |
+
}
|
| 677 |
+
|
| 678 |
+
.dataset-table th {
|
| 679 |
+
padding: 1.25rem 1rem;
|
| 680 |
+
text-align: left;
|
| 681 |
+
color: white;
|
| 682 |
+
font-weight: 600;
|
| 683 |
+
font-size: 1rem;
|
| 684 |
+
}
|
| 685 |
+
|
| 686 |
+
.dataset-table td {
|
| 687 |
+
padding: 1rem;
|
| 688 |
+
border-bottom: 1px solid var(--border-primary);
|
| 689 |
+
color: var(--text-secondary);
|
| 690 |
+
transition: all 0.2s ease;
|
| 691 |
+
}
|
| 692 |
+
|
| 693 |
+
.dataset-table tbody tr:hover td {
|
| 694 |
+
background: var(--card-hover-bg);
|
| 695 |
+
color: var(--text-primary);
|
| 696 |
+
}
|
| 697 |
+
|
| 698 |
+
.dataset-table td[rowspan] {
|
| 699 |
+
background: var(--bg-secondary);
|
| 700 |
+
color: var(--accent-blue);
|
| 701 |
+
font-weight: 600;
|
| 702 |
+
border-right: 1px solid var(--border-primary);
|
| 703 |
+
}
|
| 704 |
+
|
| 705 |
+
.purpose-cell {
|
| 706 |
+
max-width: 300px;
|
| 707 |
+
line-height: 1.5;
|
| 708 |
+
}
|
| 709 |
+
|
| 710 |
+
.category-cell {
|
| 711 |
+
color: var(--accent-purple);
|
| 712 |
+
font-weight: 500;
|
| 713 |
+
}
|
| 714 |
+
|
| 715 |
+
.dataset-name {
|
| 716 |
+
font-family: monospace;
|
| 717 |
+
color: var(--accent-pink);
|
| 718 |
+
font-size: 0.9rem;
|
| 719 |
+
}
|
| 720 |
|
| 721 |
[Rest of the CSS remains the same]
|
| 722 |
</style>
|
|
|
|
| 756 |
<tbody>
|
| 757 |
<tr>
|
| 758 |
<td rowspan="4">Single-Turn</td>
|
| 759 |
+
<td>100 + 100</td>
|
| 760 |
+
<td class="category-cell">Single Function Call</td>
|
| 761 |
+
<td class="dataset-name">xlam_single_tool_single_call</td>
|
| 762 |
+
<td class="purpose-cell">Evaluates basic ability to read documentation and make single function calls</td>
|
| 763 |
</tr>
|
| 764 |
<tr>
|
| 765 |
+
<td>200 + 50</td>
|
| 766 |
+
<td class="category-cell">Multiple Function Call</td>
|
| 767 |
+
<td class="dataset-name">xlam_multiple_tool_multiple_call, xlam_single_tool_multiple_call</td>
|
| 768 |
+
<td class="purpose-cell">Tests parallel execution and result aggregation capabilities</td>
|
| 769 |
</tr>
|
| 770 |
<tr>
|
| 771 |
<td>100</td>
|
| 772 |
+
<td class="category-cell">Irrelevant Query</td>
|
| 773 |
+
<td class="dataset-name">BFCL_v3_irrelevance</td>
|
| 774 |
+
<td class="purpose-cell">Tests ability to recognize when available tools don't match user needs</td>
|
| 775 |
</tr>
|
| 776 |
<tr>
|
| 777 |
<td>100</td>
|
| 778 |
+
<td class="category-cell">Long Context</td>
|
| 779 |
+
<td class="dataset-name">tau_long_context</td>
|
| 780 |
+
<td class="purpose-cell">Assesses handling of extended interactions and complex instructions</td>
|
| 781 |
</tr>
|
| 782 |
<tr>
|
| 783 |
<td rowspan="5">Multi-Turn</td>
|
| 784 |
+
<td>50 + 30</td>
|
| 785 |
+
<td class="category-cell">Single Function Call</td>
|
| 786 |
+
<td class="dataset-name">BFCL_v3_multi_turn_base_single_func_call, toolscs_single_func_call</td>
|
| 787 |
+
<td class="purpose-cell">Tests basic conversational function calling abilities</td>
|
| 788 |
</tr>
|
| 789 |
<tr>
|
| 790 |
<td>50</td>
|
| 791 |
+
<td class="category-cell">Multiple Function Call</td>
|
| 792 |
+
<td class="dataset-name">BFCL_v3_multi_turn_base_multi_func_call</td>
|
| 793 |
+
<td class="purpose-cell">Evaluates handling of multiple function calls in conversation</td>
|
| 794 |
</tr>
|
| 795 |
<tr>
|
| 796 |
<td>100</td>
|
| 797 |
+
<td class="category-cell">Missing Function</td>
|
| 798 |
+
<td class="dataset-name">BFCL_v3_multi_turn_miss_func</td>
|
| 799 |
+
<td class="purpose-cell">Tests graceful handling of unavailable tools</td>
|
| 800 |
</tr>
|
| 801 |
<tr>
|
| 802 |
<td>100</td>
|
| 803 |
+
<td class="category-cell">Missing Parameters</td>
|
| 804 |
+
<td class="dataset-name">BFCL_v3_multi_turn_miss_param</td>
|
| 805 |
+
<td class="purpose-cell">Assesses parameter collection and handling incomplete information</td>
|
| 806 |
</tr>
|
| 807 |
<tr>
|
| 808 |
<td>100</td>
|
| 809 |
+
<td class="category-cell">Composite</td>
|
| 810 |
+
<td class="dataset-name">BFCL_v3_multi_turn_composite</td>
|
| 811 |
+
<td class="purpose-cell">Tests overall robustness in complex scenarios</td>
|
| 812 |
</tr>
|
| 813 |
</tbody>
|
| 814 |
</table>
|
| 815 |
</div>
|
|
|
|
| 816 |
|
| 817 |
<!-- Features Grid Section -->
|
| 818 |
<div class="features-grid">
|