Spaces:

Presidentlin
/

Aidan-Bench

Runtime error

App Files Community

Presidentlin committed on Aug 12, 2024

Commit

76ed6d2

1 Parent(s): 9d92eeb

x

Browse files

Files changed (3) hide show

__pycache__/main.cpython-310.pyc +0 -0
app.py +12 -30
main.py +55 -35

__pycache__/main.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/main.cpython-310.pyc and b/__pycache__/main.cpython-310.pyc differ

app.py CHANGED Viewed

@@ -106,43 +106,21 @@ if st.session_state.open_router_key and st.session_state.openai_api_key:
         if not selected_questions:
             st.warning("Please select at least one question.")
         else:
-        # Initialize progress bar
             progress_bar = st.progress(0)
             num_questions = len(selected_questions)
             results = []
-            # Stop button
             stop_button = st.button("Stop Benchmark")
-            # Benchmarking loop
-            for i, question in enumerate(selected_questions):
-                # Display current question
-                st.write(f"Processing question {i+1}/{num_questions}: {question}")
-                # ... (benchmarking logic using the chosen execution mode)
-                if execution_mode == "Sequential":
-                    question_results = benchmark_model_sequential(model_name, selected_questions, st.session_state.open_router_key, st.session_state.openai_api_key)
-                else:  # Multithreaded
-                    question_results = benchmark_model_multithreaded(model_name, selected_questions, st.session_state.open_router_key, st.session_state.openai_api_key, max_threads)
-                results.extend(question_results)
-                # Update progress bar
-                progress_bar.progress((i + 1) / num_questions)
-                # Check if stop button is clicked
-                if stop_button:
-                    st.warning("Benchmark stopped!")
-                    break  # Exit the loop
-            # Display results (even if interrupted)
-            st.write("Results:")
-            # ... (table generation logic - Same as before)
-            if stop_button:
-                st.warning("Partial results displayed due to interruption.")
-            else:
-                st.success("Benchmark completed!")
             # Display results in a table
             st.write("Results:")
@@ -157,6 +135,10 @@ if st.session_state.open_router_key and st.session_state.openai_api_key:
                     })
             st.table(results_table)
 else:
     st.warning("Please confirm your API keys first.")

         if not selected_questions:
             st.warning("Please select at least one question.")
         else:
+            # Initialize progress bar
             progress_bar = st.progress(0)
             num_questions = len(selected_questions)
             results = []
+            # Stop button (not implemented yet)
             stop_button = st.button("Stop Benchmark")
+            # Benchmarking logic using the chosen execution mode
+            if execution_mode == "Sequential":
+                question_results = benchmark_model_sequential(model_name, selected_questions, st.session_state.open_router_key, st.session_state.openai_api_key)
+            else:  # Multithreaded
+                question_results = benchmark_model_multithreaded(model_name, selected_questions, st.session_state.open_router_key, st.session_state.openai_api_key, max_threads)
+            results.extend(question_results)
             # Display results in a table
             st.write("Results:")
                     })
             st.table(results_table)
+            if stop_button:
+                st.warning("Partial results displayed due to interruption.")
+            else:
+                st.success("Benchmark completed!")
 else:
     st.warning("Please confirm your API keys first.")

main.py CHANGED Viewed

@@ -7,50 +7,65 @@ import threading
 import streamlit as st  # Import Streamlit
-def process_question(question, model_name, open_router_key, openai_api_key, progress_lock, completed_questions, total_questions):
     start_time = time.time()
-    st.write(f"<span style='color:red'>{question}</span>", unsafe_allow_html=True)  # Display question in red
     previous_answers = []
     question_novelty = 0
     try:
         while True:
-            gen_prompt = create_gen_prompt(question, previous_answers)
-            try:
-                new_answer = chat_with_model(prompt=gen_prompt, model=model_name, open_router_key=open_router_key,
-                                             openai_api_key=openai_api_key)
-            except Exception as e:
-                st.write(f"<span style='color:red'>Error generating answer: {str(e)}</span>",
-                         unsafe_allow_html=True)  # Display error in red
                 break
-            judge_prompt = create_judge_prompt(question, new_answer)
-            judge = "openai/gpt-4o-mini"
-            try:
-                judge_response = chat_with_model(prompt=judge_prompt, model=judge, open_router_key=open_router_key,
-                                                 openai_api_key=openai_api_key)
-            except Exception as e:
-                st.write(f"<span style='color:red'>Error getting judge response: {str(e)}</span>",
-                         unsafe_allow_html=True)  # Display error in red
                 break
-            coherence_score = int(judge_response.split("<coherence_score>")[1].split("</coherence_score>")[0])
             if coherence_score <= 3:
                 st.write("<span style='color:yellow'>Output is incoherent. Moving to next question.</span>",
-                         unsafe_allow_html=True)  # Display warning in yellow
                 break
             novelty_score = get_novelty_score(new_answer, previous_answers, openai_api_key)
             if novelty_score < 0.1:
                 st.write("<span style='color:yellow'>Output is redundant. Moving to next question.</span>",
-                         unsafe_allow_html=True)  # Display warning in yellow
                 break
             st.write(f"**New Answer:**\n{new_answer}")
             st.write(f"<span style='color:green'>Coherence Score: {coherence_score}</span>",
-                     unsafe_allow_html=True)  # Display coherence score in green
             st.write(f"**Novelty Score:** {novelty_score}")
             previous_answers.append(new_answer)
@@ -58,19 +73,18 @@ def process_question(question, model_name, open_router_key, openai_api_key, prog
     except Exception as e:
         st.write(f"<span style='color:red'>Unexpected error processing question: {str(e)}</span>",
-                 unsafe_allow_html=True)  # Display error in red
     time_taken = time.time() - start_time
     st.write(f"<span style='color:blue'>Total novelty score for this question: {question_novelty}</span>",
-             unsafe_allow_html=True)  # Display novelty score in blue
     st.write(f"<span style='color:blue'>Time taken: {time_taken} seconds</span>",
-             unsafe_allow_html=True)  # Display time taken in blue
     # Update progress
     with progress_lock:
         completed_questions += 1
         progress = completed_questions / total_questions
-        st.progress(progress)  # Update the progress bar
     return question_novelty, [
         {
@@ -103,12 +117,11 @@ def get_novelty_score(new_answer: str, previous_answers: list, openai_api_key):
     return novelty
-def benchmark_model_multithreaded(model_name, questions, open_router_key, openai_api_key, max_threads=None):
     novelty_score = 0
     print_lock = threading.Lock()  # Lock for thread-safe printing
     results = []
     completed_questions = 0  # Shared variable to track progress
-    progress_lock = threading.Lock()  # Lock for protecting completed_questions
     # Use max_threads if provided, otherwise default to the number of questions
     if max_threads is None:
@@ -118,7 +131,7 @@ def benchmark_model_multithreaded(model_name, questions, open_router_key, openai
     with ThreadPoolExecutor(max_workers=max_workers) as executor:
         future_to_question = {executor.submit(
-            process_question, question, model_name, open_router_key, openai_api_key, progress_lock, completed_questions, len(questions)): question for question in questions}
         for future in as_completed(future_to_question):
             question = future_to_question[future]
@@ -128,25 +141,32 @@ def benchmark_model_multithreaded(model_name, questions, open_router_key, openai
                 with print_lock:
                     novelty_score += question_novelty
                     results.extend(question_results)
-                    st.write(f"<span style='color:yellow'>Total novelty score across all questions (so far): {novelty_score}</span>", unsafe_allow_html=True)
             except Exception as e:
                 with print_lock:
                     st.write(f"<span style='color:red'>Error in thread: {str(e)}</span>", unsafe_allow_html=True)
-    st.write(f"<span style='color:yellow'>Final total novelty score across all questions: {novelty_score}</span>", unsafe_allow_html=True)
     return results
-def benchmark_model_sequential(model_name, questions, open_router_key, openai_api_key):
     novelty_score = 0
     results = []
     for i, question in enumerate(questions):
-        question_novelty, question_results = process_question(question, model_name, open_router_key, openai_api_key, threading.Lock(), i, len(questions))
         novelty_score += question_novelty
         results.extend(question_results)
-        st.write(f"<span style='color:yellow'>Total novelty score across processed questions: {novelty_score}</span>", unsafe_allow_html=True)  # Display progress after each question
-    st.write(f"<span style='color:yellow'>Final total novelty score across all questions: {novelty_score}</span>", unsafe_allow_html=True)
     return results

 import streamlit as st  # Import Streamlit
+def generate_answer(question, previous_answers, model_name, open_router_key, openai_api_key):
+    """Generates an answer to a question using the specified language model."""
+    gen_prompt = create_gen_prompt(question, previous_answers)
+    try:
+        new_answer = chat_with_model(prompt=gen_prompt, model=model_name, open_router_key=open_router_key,
+                                             openai_api_key=openai_api_key)
+        return new_answer
+    except Exception as e:
+        st.write(f"<span style='color:red'>Error generating answer: {str(e)}</span>",
+                         unsafe_allow_html=True)
+        return None
+def evaluate_answer(question, new_answer, open_router_key, openai_api_key):
+    """Evaluates the coherence and novelty of an answer."""
+    judge_prompt = create_judge_prompt(question, new_answer)
+    judge = "openai/gpt-4o-mini"
+    try:
+        judge_response = chat_with_model(prompt=judge_prompt, model=judge, open_router_key=open_router_key,
+                                                 openai_api_key=openai_api_key)
+        coherence_score = int(judge_response.split("<coherence_score>")[1].split("</coherence_score>")[0])
+        return coherence_score
+    except Exception as e:
+        st.write(f"<span style='color:red'>Error getting judge response: {str(e)}</span>",
+                         unsafe_allow_html=True)
+        return None
+def process_question(question, model_name, open_router_key, openai_api_key, progress_lock, completed_questions, total_questions, progress):
     start_time = time.time()
+    st.write(f"<span style='color:red'>{question}</span>", unsafe_allow_html=True)
     previous_answers = []
     question_novelty = 0
     try:
         while True:
+            new_answer = generate_answer(question, previous_answers, model_name, open_router_key, openai_api_key)
+            if new_answer is None:
                 break
+            coherence_score = evaluate_answer(question, new_answer, open_router_key, openai_api_key)
+            if coherence_score is None:
                 break
             if coherence_score <= 3:
                 st.write("<span style='color:yellow'>Output is incoherent. Moving to next question.</span>",
+                         unsafe_allow_html=True)
                 break
             novelty_score = get_novelty_score(new_answer, previous_answers, openai_api_key)
             if novelty_score < 0.1:
                 st.write("<span style='color:yellow'>Output is redundant. Moving to next question.</span>",
+                         unsafe_allow_html=True)
                 break
             st.write(f"**New Answer:**\n{new_answer}")
             st.write(f"<span style='color:green'>Coherence Score: {coherence_score}</span>",
+                     unsafe_allow_html=True)
             st.write(f"**Novelty Score:** {novelty_score}")
             previous_answers.append(new_answer)
     except Exception as e:
         st.write(f"<span style='color:red'>Unexpected error processing question: {str(e)}</span>",
+                 unsafe_allow_html=True)
     time_taken = time.time() - start_time
     st.write(f"<span style='color:blue'>Total novelty score for this question: {question_novelty}</span>",
+             unsafe_allow_html=True)
     st.write(f"<span style='color:blue'>Time taken: {time_taken} seconds</span>",
+             unsafe_allow_html=True)
     # Update progress
     with progress_lock:
         completed_questions += 1
         progress = completed_questions / total_questions
     return question_novelty, [
         {
     return novelty
+def benchmark_model_multithreaded(model_name, questions, open_router_key, openai_api_key, max_threads=None, progress=0, progress_lock=None):
     novelty_score = 0
     print_lock = threading.Lock()  # Lock for thread-safe printing
     results = []
     completed_questions = 0  # Shared variable to track progress
     # Use max_threads if provided, otherwise default to the number of questions
     if max_threads is None:
     with ThreadPoolExecutor(max_workers=max_workers) as executor:
         future_to_question = {executor.submit(
+            process_question, question, model_name, open_router_key, openai_api_key, progress_lock, completed_questions, len(questions), progress): question for question in questions}
         for future in as_completed(future_to_question):
             question = future_to_question[future]
                 with print_lock:
                     novelty_score += question_novelty
                     results.extend(question_results)
+                    st.write(
+                        f"<span style='color:yellow'>Total novelty score across all questions (so far): {novelty_score}</span>",
+                        unsafe_allow_html=True)
             except Exception as e:
                 with print_lock:
                     st.write(f"<span style='color:red'>Error in thread: {str(e)}</span>", unsafe_allow_html=True)
+    st.write(f"<span style='color:yellow'>Final total novelty score across all questions: {novelty_score}</span>",
+             unsafe_allow_html=True)
     return results
+def benchmark_model_sequential(model_name, questions, open_router_key, openai_api_key, progress=0, progress_lock=None):
     novelty_score = 0
     results = []
     for i, question in enumerate(questions):
+        question_novelty, question_results = process_question(question, model_name, open_router_key, openai_api_key,
+                                                              progress_lock, i, len(questions), progress)
         novelty_score += question_novelty
         results.extend(question_results)
+        st.write(
+            f"<span style='color:yellow'>Total novelty score across processed questions: {novelty_score}</span>",
+            unsafe_allow_html=True)  # Display progress after each question
+    st.write(f"<span style='color:yellow'>Final total novelty score across all questions: {novelty_score}</span>",
+             unsafe_allow_html=True)
     return results