Presidentlin committed
Commit 76ed6d2 · 1 Parent(s): 9d92eeb
Files changed (3)
  1. __pycache__/main.cpython-310.pyc +0 -0
  2. app.py +12 -30
  3. main.py +55 -35
__pycache__/main.cpython-310.pyc CHANGED
Binary files a/__pycache__/main.cpython-310.pyc and b/__pycache__/main.cpython-310.pyc differ
 
app.py CHANGED
@@ -106,43 +106,21 @@ if st.session_state.open_router_key and st.session_state.openai_api_key:
     if not selected_questions:
         st.warning("Please select at least one question.")
     else:
-        # Initialize progress bar
+        # Initialize progress bar
         progress_bar = st.progress(0)
         num_questions = len(selected_questions)
         results = []
 
-        # Stop button
+        # Stop button (not implemented yet)
        stop_button = st.button("Stop Benchmark")
 
-        # Benchmarking loop
-        for i, question in enumerate(selected_questions):
-            # Display current question
-            st.write(f"Processing question {i+1}/{num_questions}: {question}")
+        # Benchmarking logic using the chosen execution mode
+        if execution_mode == "Sequential":
+            question_results = benchmark_model_sequential(model_name, selected_questions, st.session_state.open_router_key, st.session_state.openai_api_key)
+        else: # Multithreaded
+            question_results = benchmark_model_multithreaded(model_name, selected_questions, st.session_state.open_router_key, st.session_state.openai_api_key, max_threads)
 
-            # ... (benchmarking logic using the chosen execution mode)
-            if execution_mode == "Sequential":
-                question_results = benchmark_model_sequential(model_name, selected_questions, st.session_state.open_router_key, st.session_state.openai_api_key)
-            else: # Multithreaded
-                question_results = benchmark_model_multithreaded(model_name, selected_questions, st.session_state.open_router_key, st.session_state.openai_api_key, max_threads)
-
-            results.extend(question_results)
-
-            # Update progress bar
-            progress_bar.progress((i + 1) / num_questions)
-
-            # Check if stop button is clicked
-            if stop_button:
-                st.warning("Benchmark stopped!")
-                break # Exit the loop
-
-        # Display results (even if interrupted)
-        st.write("Results:")
-        # ... (table generation logic - Same as before)
-
-        if stop_button:
-            st.warning("Partial results displayed due to interruption.")
-        else:
-            st.success("Benchmark completed!")
+        results.extend(question_results)
 
         # Display results in a table
         st.write("Results:")
@@ -157,6 +135,10 @@ if st.session_state.open_router_key and st.session_state.openai_api_key:
         })
         st.table(results_table)
 
+        if stop_button:
+            st.warning("Partial results displayed due to interruption.")
+        else:
+            st.success("Benchmark completed!")
 
 else:
     st.warning("Please confirm your API keys first.")
main.py CHANGED
@@ -7,50 +7,65 @@ import threading
 import streamlit as st # Import Streamlit
 
 
-def process_question(question, model_name, open_router_key, openai_api_key, progress_lock, completed_questions, total_questions):
+def generate_answer(question, previous_answers, model_name, open_router_key, openai_api_key):
+    """Generates an answer to a question using the specified language model."""
+    gen_prompt = create_gen_prompt(question, previous_answers)
+    try:
+        new_answer = chat_with_model(prompt=gen_prompt, model=model_name, open_router_key=open_router_key,
+                                     openai_api_key=openai_api_key)
+        return new_answer
+    except Exception as e:
+        st.write(f"<span style='color:red'>Error generating answer: {str(e)}</span>",
+                 unsafe_allow_html=True)
+        return None
+
+
+def evaluate_answer(question, new_answer, open_router_key, openai_api_key):
+    """Evaluates the coherence and novelty of an answer."""
+    judge_prompt = create_judge_prompt(question, new_answer)
+    judge = "openai/gpt-4o-mini"
+    try:
+        judge_response = chat_with_model(prompt=judge_prompt, model=judge, open_router_key=open_router_key,
+                                         openai_api_key=openai_api_key)
+        coherence_score = int(judge_response.split("<coherence_score>")[1].split("</coherence_score>")[0])
+        return coherence_score
+    except Exception as e:
+        st.write(f"<span style='color:red'>Error getting judge response: {str(e)}</span>",
+                 unsafe_allow_html=True)
+        return None
+
+
+def process_question(question, model_name, open_router_key, openai_api_key, progress_lock, completed_questions, total_questions, progress):
     start_time = time.time()
-    st.write(f"<span style='color:red'>{question}</span>", unsafe_allow_html=True) # Display question in red
+    st.write(f"<span style='color:red'>{question}</span>", unsafe_allow_html=True)
     previous_answers = []
     question_novelty = 0
 
     try:
         while True:
-            gen_prompt = create_gen_prompt(question, previous_answers)
-            try:
-                new_answer = chat_with_model(prompt=gen_prompt, model=model_name, open_router_key=open_router_key,
-                                             openai_api_key=openai_api_key)
-            except Exception as e:
-                st.write(f"<span style='color:red'>Error generating answer: {str(e)}</span>",
-                         unsafe_allow_html=True) # Display error in red
+            new_answer = generate_answer(question, previous_answers, model_name, open_router_key, openai_api_key)
+            if new_answer is None:
                 break
 
-            judge_prompt = create_judge_prompt(question, new_answer)
-            judge = "openai/gpt-4o-mini"
-            try:
-                judge_response = chat_with_model(prompt=judge_prompt, model=judge, open_router_key=open_router_key,
-                                                 openai_api_key=openai_api_key)
-            except Exception as e:
-                st.write(f"<span style='color:red'>Error getting judge response: {str(e)}</span>",
-                         unsafe_allow_html=True) # Display error in red
+            coherence_score = evaluate_answer(question, new_answer, open_router_key, openai_api_key)
+            if coherence_score is None:
                 break
 
-            coherence_score = int(judge_response.split("<coherence_score>")[1].split("</coherence_score>")[0])
-
             if coherence_score <= 3:
                 st.write("<span style='color:yellow'>Output is incoherent. Moving to next question.</span>",
-                         unsafe_allow_html=True) # Display warning in yellow
+                         unsafe_allow_html=True)
                 break
 
             novelty_score = get_novelty_score(new_answer, previous_answers, openai_api_key)
 
             if novelty_score < 0.1:
                 st.write("<span style='color:yellow'>Output is redundant. Moving to next question.</span>",
-                         unsafe_allow_html=True) # Display warning in yellow
+                         unsafe_allow_html=True)
                 break
 
             st.write(f"**New Answer:**\n{new_answer}")
             st.write(f"<span style='color:green'>Coherence Score: {coherence_score}</span>",
-                     unsafe_allow_html=True) # Display coherence score in green
+                     unsafe_allow_html=True)
             st.write(f"**Novelty Score:** {novelty_score}")
 
             previous_answers.append(new_answer)
@@ -58,19 +73,18 @@ def process_question(question, model_name, open_router_key, openai_api_key, prog
 
     except Exception as e:
         st.write(f"<span style='color:red'>Unexpected error processing question: {str(e)}</span>",
-                 unsafe_allow_html=True) # Display error in red
+                 unsafe_allow_html=True)
 
     time_taken = time.time() - start_time
     st.write(f"<span style='color:blue'>Total novelty score for this question: {question_novelty}</span>",
-             unsafe_allow_html=True) # Display novelty score in blue
+             unsafe_allow_html=True)
     st.write(f"<span style='color:blue'>Time taken: {time_taken} seconds</span>",
-             unsafe_allow_html=True) # Display time taken in blue
+             unsafe_allow_html=True)
 
     # Update progress
     with progress_lock:
         completed_questions += 1
         progress = completed_questions / total_questions
-        st.progress(progress) # Update the progress bar
 
     return question_novelty, [
         {
@@ -103,12 +117,11 @@ def get_novelty_score(new_answer: str, previous_answers: list, openai_api_key):
     return novelty
 
 
-def benchmark_model_multithreaded(model_name, questions, open_router_key, openai_api_key, max_threads=None):
+def benchmark_model_multithreaded(model_name, questions, open_router_key, openai_api_key, max_threads=None, progress=0, progress_lock=None):
     novelty_score = 0
     print_lock = threading.Lock() # Lock for thread-safe printing
     results = []
     completed_questions = 0 # Shared variable to track progress
-    progress_lock = threading.Lock() # Lock for protecting completed_questions
 
     # Use max_threads if provided, otherwise default to the number of questions
     if max_threads is None:
@@ -118,7 +131,7 @@ def benchmark_model_multithreaded(model_name, questions, open_router_key, openai
 
     with ThreadPoolExecutor(max_workers=max_workers) as executor:
         future_to_question = {executor.submit(
-            process_question, question, model_name, open_router_key, openai_api_key, progress_lock, completed_questions, len(questions)): question for question in questions}
+            process_question, question, model_name, open_router_key, openai_api_key, progress_lock, completed_questions, len(questions), progress): question for question in questions}
 
         for future in as_completed(future_to_question):
             question = future_to_question[future]
@@ -128,25 +141,32 @@ def benchmark_model_multithreaded(model_name, questions, open_router_key, openai
                 with print_lock:
                     novelty_score += question_novelty
                     results.extend(question_results)
-                    st.write(f"<span style='color:yellow'>Total novelty score across all questions (so far): {novelty_score}</span>", unsafe_allow_html=True)
+                    st.write(
+                        f"<span style='color:yellow'>Total novelty score across all questions (so far): {novelty_score}</span>",
+                        unsafe_allow_html=True)
             except Exception as e:
                 with print_lock:
                     st.write(f"<span style='color:red'>Error in thread: {str(e)}</span>", unsafe_allow_html=True)
 
-    st.write(f"<span style='color:yellow'>Final total novelty score across all questions: {novelty_score}</span>", unsafe_allow_html=True)
+    st.write(f"<span style='color:yellow'>Final total novelty score across all questions: {novelty_score}</span>",
+             unsafe_allow_html=True)
     return results
 
 
-def benchmark_model_sequential(model_name, questions, open_router_key, openai_api_key):
+def benchmark_model_sequential(model_name, questions, open_router_key, openai_api_key, progress=0, progress_lock=None):
     novelty_score = 0
     results = []
 
     for i, question in enumerate(questions):
-        question_novelty, question_results = process_question(question, model_name, open_router_key, openai_api_key, threading.Lock(), i, len(questions))
+        question_novelty, question_results = process_question(question, model_name, open_router_key, openai_api_key,
+                                                              progress_lock, i, len(questions), progress)
         novelty_score += question_novelty
         results.extend(question_results)
-        st.write(f"<span style='color:yellow'>Total novelty score across processed questions: {novelty_score}</span>", unsafe_allow_html=True) # Display progress after each question
+        st.write(
+            f"<span style='color:yellow'>Total novelty score across processed questions: {novelty_score}</span>",
+            unsafe_allow_html=True) # Display progress after each question
 
-    st.write(f"<span style='color:yellow'>Final total novelty score across all questions: {novelty_score}</span>", unsafe_allow_html=True)
+    st.write(f"<span style='color:yellow'>Final total novelty score across all questions: {novelty_score}</span>",
+             unsafe_allow_html=True)
 
     return results
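For reference, a minimal sketch of calling the new generate_answer and evaluate_answer helpers on their own, assuming they are imported from main.py as defined above; answer_once is an illustrative name and not part of the commit. Both helpers return None on failure, which is what process_question checks before breaking out of its loop.

from main import generate_answer, evaluate_answer

def answer_once(question, model_name, open_router_key, openai_api_key):
    # Generate a single answer and score its coherence; propagate failures as None.
    answer = generate_answer(question, [], model_name, open_router_key, openai_api_key)
    if answer is None:
        return None
    coherence = evaluate_answer(question, answer, open_router_key, openai_api_key)
    if coherence is None:
        return None
    return answer, coherence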