""" Working Multi-LLM Agent Evaluation Runner"""
import os
import gradio as gr
import requests
import pandas as pd
from langchain_core.messages import HumanMessage

# Import from veryfinal.py
from veryfinal import UnifiedAgnoEnhancedSystem
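# NOTE: UnifiedAgnoEnhancedSystem lives in veryfinal.py (not shown here); the wrapper
# below only assumes it exposes a process_query(question: str) -> str method.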

# --- Constants ---
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
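# The scoring Space is expected to expose GET /questions (task list) and
# POST /submit (grading); both endpoint URLs are derived from this base URL below.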

# --- Working Agent Definition ---
class WorkingMultiLLMAgent:
    """A working multi-LLM agent that actually answers questions"""
    def __init__(self):
        print("Working Multi-LLM Agent initialized.")
        try:
            self.system = UnifiedAgnoEnhancedSystem()
            print("βœ… Working system built successfully.")
        except Exception as e:
            print(f"❌ Error building system: {e}")
            self.system = None

    def __call__(self, question: str) -> str:
        print(f"Processing: {question[:100]}...")
        
        if self.system is None:
            return "Error: System not initialized"
        
        try:
            answer = self.system.process_query(question)
            
            # Validation
            if not answer or answer == question or len(answer.strip()) == 0:
                return "Information not available"
            
            return answer.strip()
                
        except Exception as e:
            return f"Error: {str(e)}"

def run_and_submit_all(profile: gr.OAuthProfile | None):
    """Run evaluation with working agent"""
    space_id = os.getenv("SPACE_ID")
    
    if profile:
        username = f"{profile.username}"
        print(f"User logged in: {username}")
    else:
        print("User not logged in.")
        return "Please Login to Hugging Face with the button.", None

    api_url = DEFAULT_API_URL
    questions_url = f"{api_url}/questions"
    submit_url = f"{api_url}/submit"

    # 1. Instantiate Working Agent
    try:
        agent = WorkingMultiLLMAgent()
        if agent.system is None:
            return "Error: Failed to initialize working agent", None
    except Exception as e:
        return f"Error initializing agent: {e}", None
    
    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "No space ID"

    # 2. Fetch Questions
    try:
        response = requests.get(questions_url, timeout=15)
        response.raise_for_status()
        questions_data = response.json()
        if not questions_data:
            return "No questions fetched", None
        print(f"βœ… Fetched {len(questions_data)} questions")
    except Exception as e:
        return f"Error fetching questions: {e}", None

    # 3. Process Questions
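    # Each item from /questions is assumed to carry a "task_id" and a "question"
    # field; items missing either one are skipped below.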
    results_log = []
    answers_payload = []
    
    for i, item in enumerate(questions_data):
        task_id = item.get("task_id")
        question_text = item.get("question")
        
        if not task_id or question_text is None:
            continue
            
        print(f"Processing {i+1}/{len(questions_data)}: {task_id}")
        
        try:
            answer = agent(question_text)
            
            # Prevent question repetition
            if answer == question_text or answer.startswith(question_text):
                answer = "Information not available"
            
            answers_payload.append({"task_id": task_id, "submitted_answer": answer})
            results_log.append({
                "Task ID": task_id,
                "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
                "Submitted Answer": answer[:200] + "..." if len(answer) > 200 else answer
            })
        except Exception as e:
            error_msg = f"ERROR: {e}"
            answers_payload.append({"task_id": task_id, "submitted_answer": error_msg})
            results_log.append({
                "Task ID": task_id,
                "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
                "Submitted Answer": error_msg
            })

    if not answers_payload:
        return "No answers generated", pd.DataFrame(results_log)

    # 4. Submit Results
    submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
    
    try:
        response = requests.post(submit_url, json=submission_data, timeout=60)
        response.raise_for_status()
        result_data = response.json()
        
        final_status = (
            f"βœ… Submission Successful!\n"
            f"User: {result_data.get('username')}\n"
            f"Score: {result_data.get('score', 'N/A')}% "
            f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
            f"Message: {result_data.get('message', 'Success')}"
        )
        
        return final_status, pd.DataFrame(results_log)
    except Exception as e:
        return f"❌ Submission Failed: {e}", pd.DataFrame(results_log)

# --- Gradio Interface ---
with gr.Blocks() as demo:
    gr.Markdown("# Working Multi-LLM Agent System")
    gr.Markdown(
        """
        **✅ A working multi-LLM system that answers the evaluation questions.**
        
        **Features:**
        - **Groq Llama-3 70B**: High-quality responses
        - **Smart Routing**: Math, search, wiki, and general queries
        - **Web Search**: Tavily integration for current information
        - **Wikipedia**: Encyclopedic knowledge access
        - **Robust Error Handling**: Fallbacks and validation
        
        **Instructions:**
        1. Log in with your Hugging Face account
        2. Click 'Run Evaluation & Submit All Answers'
        3. Wait for processing to complete
        4. View your results and score
        
        **Requirements:**
        - GROQ_API_KEY in your environment variables
        - TAVILY_API_KEY (optional, for web search)
        """
    )

    gr.LoginButton()
    run_button = gr.Button("πŸš€ Run Evaluation & Submit All Answers", variant="primary")
    status_output = gr.Textbox(label="Status", lines=5, interactive=False)
    results_table = gr.DataFrame(label="Results", wrap=True)
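    # No explicit inputs are wired to the click handler: Gradio passes the gr.OAuthProfile
    # argument automatically when a LoginButton is present and the handler is annotated with that type.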

    run_button.click(
        fn=run_and_submit_all,
        outputs=[status_output, results_table]
    )

if __name__ == "__main__":
    print("πŸš€ Starting Working Multi-LLM Agent System")
    demo.launch(debug=True, share=False)