Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Pratik Bhavsar
committed on
Commit
·
91da2cc
1
Parent(s):
c8ff2be
improved dataset table
Browse files
- data_loader.py +88 -31
data_loader.py
CHANGED
@@ -659,6 +659,64 @@ METHODOLOGY = """
|
|
659 |
width: 100%;
|
660 |
padding: 2rem 0;
|
661 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
662 |
|
663 |
[Rest of the CSS remains the same]
|
664 |
</style>
|
@@ -698,64 +756,63 @@ METHODOLOGY = """
|
|
698 |
<tbody>
|
699 |
<tr>
|
700 |
<td rowspan="4">Single-Turn</td>
|
701 |
-
<td>
|
702 |
-
<td>Single Function Call</td>
|
703 |
-
<td>xlam_single_tool_single_call</td>
|
704 |
-
<td>
|
705 |
</tr>
|
706 |
<tr>
|
707 |
-
<td>
|
708 |
-
<td>Multiple Function Call</td>
|
709 |
-
<td>xlam_multiple_tool_multiple_call</td>
|
710 |
-
<td>
|
711 |
</tr>
|
712 |
<tr>
|
713 |
<td>100</td>
|
714 |
-
<td>Irrelevant Query</td>
|
715 |
-
<td>BFCL_v3_irrelevance</td>
|
716 |
-
<td>
|
717 |
</tr>
|
718 |
<tr>
|
719 |
<td>100</td>
|
720 |
-
<td>Long Context</td>
|
721 |
-
<td>tau_long_context</td>
|
722 |
-
<td>
|
723 |
</tr>
|
724 |
<tr>
|
725 |
<td rowspan="5">Multi-Turn</td>
|
726 |
-
<td>
|
727 |
-
<td>Single Function Call</td>
|
728 |
-
<td>BFCL_v3_multi_turn_base_single_func_call</td>
|
729 |
-
<td>
|
730 |
</tr>
|
731 |
<tr>
|
732 |
<td>50</td>
|
733 |
-
<td>Multiple Function Call</td>
|
734 |
-
<td>BFCL_v3_multi_turn_base_multi_func_call</td>
|
735 |
-
<td>
|
736 |
</tr>
|
737 |
<tr>
|
738 |
<td>100</td>
|
739 |
-
<td>Missing Function</td>
|
740 |
-
<td>BFCL_v3_multi_turn_miss_func</td>
|
741 |
-
<td>
|
742 |
</tr>
|
743 |
<tr>
|
744 |
<td>100</td>
|
745 |
-
<td>Missing Parameters</td>
|
746 |
-
<td>BFCL_v3_multi_turn_miss_param</td>
|
747 |
-
<td>
|
748 |
</tr>
|
749 |
<tr>
|
750 |
<td>100</td>
|
751 |
-
<td>Composite</td>
|
752 |
-
<td>BFCL_v3_multi_turn_composite</td>
|
753 |
-
<td>
|
754 |
</tr>
|
755 |
</tbody>
|
756 |
</table>
|
757 |
</div>
|
758 |
-
</div>
|
759 |
|
760 |
<!-- Features Grid Section -->
|
761 |
<div class="features-grid">
|
|
|
659 |
width: 100%;
|
660 |
padding: 2rem 0;
|
661 |
}
|
662 |
+
|
663 |
+
.dataset-table {
|
664 |
+
width: 100%;
|
665 |
+
border-collapse: separate;
|
666 |
+
border-spacing: 0;
|
667 |
+
margin: 2rem 0;
|
668 |
+
background: var(--bg-tertiary);
|
669 |
+
border-radius: 1rem;
|
670 |
+
overflow: hidden;
|
671 |
+
box-shadow: 0 4px 20px var(--shadow-color);
|
672 |
+
}
|
673 |
+
|
674 |
+
.dataset-table thead {
|
675 |
+
background: linear-gradient(90deg, var(--accent-blue), var(--accent-purple));
|
676 |
+
}
|
677 |
+
|
678 |
+
.dataset-table th {
|
679 |
+
padding: 1.25rem 1rem;
|
680 |
+
text-align: left;
|
681 |
+
color: white;
|
682 |
+
font-weight: 600;
|
683 |
+
font-size: 1rem;
|
684 |
+
}
|
685 |
+
|
686 |
+
.dataset-table td {
|
687 |
+
padding: 1rem;
|
688 |
+
border-bottom: 1px solid var(--border-primary);
|
689 |
+
color: var(--text-secondary);
|
690 |
+
transition: all 0.2s ease;
|
691 |
+
}
|
692 |
+
|
693 |
+
.dataset-table tbody tr:hover td {
|
694 |
+
background: var(--card-hover-bg);
|
695 |
+
color: var(--text-primary);
|
696 |
+
}
|
697 |
+
|
698 |
+
.dataset-table td[rowspan] {
|
699 |
+
background: var(--bg-secondary);
|
700 |
+
color: var(--accent-blue);
|
701 |
+
font-weight: 600;
|
702 |
+
border-right: 1px solid var(--border-primary);
|
703 |
+
}
|
704 |
+
|
705 |
+
.purpose-cell {
|
706 |
+
max-width: 300px;
|
707 |
+
line-height: 1.5;
|
708 |
+
}
|
709 |
+
|
710 |
+
.category-cell {
|
711 |
+
color: var(--accent-purple);
|
712 |
+
font-weight: 500;
|
713 |
+
}
|
714 |
+
|
715 |
+
.dataset-name {
|
716 |
+
font-family: monospace;
|
717 |
+
color: var(--accent-pink);
|
718 |
+
font-size: 0.9rem;
|
719 |
+
}
|
720 |
|
721 |
[Rest of the CSS remains the same]
|
722 |
</style>
|
|
|
756 |
<tbody>
|
757 |
<tr>
|
758 |
<td rowspan="4">Single-Turn</td>
|
759 |
+
<td>100 + 100</td>
|
760 |
+
<td class="category-cell">Single Function Call</td>
|
761 |
+
<td class="dataset-name">xlam_single_tool_single_call</td>
|
762 |
+
<td class="purpose-cell">Evaluates basic ability to read documentation and make single function calls</td>
|
763 |
</tr>
|
764 |
<tr>
|
765 |
+
<td>200 + 50</td>
|
766 |
+
<td class="category-cell">Multiple Function Call</td>
|
767 |
+
<td class="dataset-name">xlam_multiple_tool_multiple_call, xlam_single_tool_multiple_call</td>
|
768 |
+
<td class="purpose-cell">Tests parallel execution and result aggregation capabilities</td>
|
769 |
</tr>
|
770 |
<tr>
|
771 |
<td>100</td>
|
772 |
+
<td class="category-cell">Irrelevant Query</td>
|
773 |
+
<td class="dataset-name">BFCL_v3_irrelevance</td>
|
774 |
+
<td class="purpose-cell">Tests ability to recognize when available tools don't match user needs</td>
|
775 |
</tr>
|
776 |
<tr>
|
777 |
<td>100</td>
|
778 |
+
<td class="category-cell">Long Context</td>
|
779 |
+
<td class="dataset-name">tau_long_context</td>
|
780 |
+
<td class="purpose-cell">Assesses handling of extended interactions and complex instructions</td>
|
781 |
</tr>
|
782 |
<tr>
|
783 |
<td rowspan="5">Multi-Turn</td>
|
784 |
+
<td>50 + 30</td>
|
785 |
+
<td class="category-cell">Single Function Call</td>
|
786 |
+
<td class="dataset-name">BFCL_v3_multi_turn_base_single_func_call, toolscs_single_func_call</td>
|
787 |
+
<td class="purpose-cell">Tests basic conversational function calling abilities</td>
|
788 |
</tr>
|
789 |
<tr>
|
790 |
<td>50</td>
|
791 |
+
<td class="category-cell">Multiple Function Call</td>
|
792 |
+
<td class="dataset-name">BFCL_v3_multi_turn_base_multi_func_call</td>
|
793 |
+
<td class="purpose-cell">Evaluates handling of multiple function calls in conversation</td>
|
794 |
</tr>
|
795 |
<tr>
|
796 |
<td>100</td>
|
797 |
+
<td class="category-cell">Missing Function</td>
|
798 |
+
<td class="dataset-name">BFCL_v3_multi_turn_miss_func</td>
|
799 |
+
<td class="purpose-cell">Tests graceful handling of unavailable tools</td>
|
800 |
</tr>
|
801 |
<tr>
|
802 |
<td>100</td>
|
803 |
+
<td class="category-cell">Missing Parameters</td>
|
804 |
+
<td class="dataset-name">BFCL_v3_multi_turn_miss_param</td>
|
805 |
+
<td class="purpose-cell">Assesses parameter collection and handling incomplete information</td>
|
806 |
</tr>
|
807 |
<tr>
|
808 |
<td>100</td>
|
809 |
+
<td class="category-cell">Composite</td>
|
810 |
+
<td class="dataset-name">BFCL_v3_multi_turn_composite</td>
|
811 |
+
<td class="purpose-cell">Tests overall robustness in complex scenarios</td>
|
812 |
</tr>
|
813 |
</tbody>
|
814 |
</table>
|
815 |
</div>
|
|
|
816 |
|
817 |
<!-- Features Grid Section -->
|
818 |
<div class="features-grid">
|