yaleh committed
Commit a291864 · 1 Parent(s): 80da0e3

Unit tests work now.

app/gradio_meta_prompt_utils.py CHANGED
@@ -466,6 +466,12 @@ def process_message_with_models(
         NODE_PROMPT_ANALYZER: initialize_llm(config, analyzer_model_name, {'temperature': analyzer_temperature}),
         NODE_PROMPT_SUGGESTER: initialize_llm(config, suggester_model_name, {'temperature': suggester_temperature})
     }
+
+    # Bind response_format to llm here
+    nodes_to_bind = [NODE_OUTPUT_HISTORY_ANALYZER, NODE_PROMPT_ANALYZER, NODE_PROMPT_SUGGESTER]
+    for node in nodes_to_bind:
+        llms[node] = llms[node].bind(response_format={"type": "json_object"})
+
     meta_prompt_graph = MetaPromptGraph(llms=llms, prompts=prompt_templates,
                                         aggressive_exploration=aggressive_exploration,
                                         verbose=config.verbose, logger=logger)
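
Note on the call above: `bind` here is LangChain's generic `Runnable.bind`, which returns a wrapper that forwards extra keyword arguments to the underlying chat model on every invocation; with `ChatOpenAI`, `response_format={"type": "json_object"}` turns on the provider's JSON mode. A minimal sketch, outside this commit and with a hypothetical model name and placeholder key:

```python
# Sketch only: bind() does not mutate the original model; it returns a
# RunnableBinding that adds the kwargs at invocation time.
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model_name="gpt-4o-mini",        # hypothetical model name
                 openai_api_key="sk-placeholder")  # placeholder; use a real key
json_llm = llm.bind(response_format={"type": "json_object"})

# json_llm requests JSON mode on every call; llm itself is unchanged.
# result = json_llm.invoke('Reply with {"ok": true} as JSON.')
# print(result.content)
```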
meta_prompt/consts.py CHANGED
@@ -166,85 +166,93 @@ Create a [name], Here's the descriptions [description]. Start with "GPT Descript
         ("human", "{user_message}")
     ]),
     NODE_OUTPUT_HISTORY_ANALYZER: ChatPromptTemplate.from_messages([
-        ("system", """You are a text comparing program. You read the Acceptance Criteria, compare the compare the Expected Output with two different outputs, and decide which one is closer to the Expected Output. When comparing the outputs, ignore the differences which are acceptable or ignorable according to the Acceptance Criteria.
-
-You output the following analysis according to the Acceptance Criteria:
-
-* Your analysis in a Markdown list.
-* Indicates an output ID that is closer to the Expected Output, in the following format:
-
-```
-# Analysis
-
-...
-
-# Output ID closer to Expected Output: [ID]
-```
-
-You must choose one of the two outputs. If both outputs are exactly the same, output the following:
-
-```
-# Analysis
-
-...
-
-# Draw
-```
+        ("system", """{{
+    "task_description": "You are a text comparing program. Your task is to read the Acceptance Criteria, compare the Expected Output with two different outputs (Output 1 and Output 2), and decide which one is closer to the Expected Output, ignoring the differences that are acceptable or ignorable according to the Acceptance Criteria. Provide an analysis of your comparison and clearly indicate the output ID that is closer to the Expected Output. Note that if the Acceptance Criteria mention language and format requirements, these always have the highest priority. Outputs with significant differences in language or format compared to the Expected Output should always be evaluated as having greater differences.",
+    "requirements": [
+        "Read and understand the provided Acceptance Criteria carefully.",
+        "Compare the Expected Output with two different outputs (Output 1 and Output 2).",
+        "Ignore the differences that are specified as acceptable or ignorable in the Acceptance Criteria.",
+        "Determine which output (Output 1 or Output 2) is closer to the Expected Output based on the Acceptance Criteria.",
+        "Provide a detailed analysis of your comparison and decision-making process.",
+        "Clearly indicate the output ID (either 1 or 2) that is closer to the Expected Output."
+    ],
+    "output_format": {{
+        "type": "object",
+        "properties": {{
+            "analysis": {{
+                "type": "string",
+                "description": "A detailed analysis explaining the comparison and decision-making process based on the Acceptance Criteria."
+            }},
+            "closerOutputID": {{
+                "type": "integer",
+                "description": "The output ID (1 or 2) that is closer to the Expected Output, or 0 if both outputs are equally close."
+            }}
+        }},
+        "required": [
+            "analysis",
+            "closerOutputID"
+        ]
+    }},
+    "output_example": {{
+        "analysis": "The Acceptance Criteria specified that the output should be in English and follow a specific JSON format. Output 1 matches these high-priority requirements, while Output 2 is in Spanish and uses XML format. Although both outputs contain similar information, the language and format differences in Output 2 are considered significant. Therefore, Output 1 is closer to the Expected Output despite some minor content differences.",
+        "closerOutputID": 1
+    }},
+
+    "evaluation_criteria": [
+        "The analysis should demonstrate a clear understanding of the Acceptance Criteria, with the highest priority given to language and format requirements if specified.",
+        "The comparison should accurately identify and ignore acceptable or ignorable differences, while emphasizing significant language or format discrepancies.",
+        "The decision should be based on a thorough analysis of the outputs in relation to the Expected Output, prioritizing language and format matching when required.",
+        "The output ID indicated as closer to the Expected Output should align with the analysis, reflecting the importance of language and format requirements."
+    ],
+    "error_handling": [
+        "If the Acceptance Criteria are unclear or contradictory, provide an analysis explaining the ambiguity and suggest possible interpretations.",
+        "If neither output is closer to the Expected Output, provide an analysis explaining why and use \"closerOutputID\": 0."
+    ],
+    "ethical_considerations": [
+        "Ensure that the comparison process is unbiased and solely based on the Acceptance Criteria.",
+        "Do not introduce personal opinions or preferences into the analysis."
+    ],
+    "conclusion": "Confirm that your output adheres to the specified language and format, includes a detailed analysis, and clearly indicates the closer output ID based on the Acceptance Criteria."
+}}
     """),
-        ("human", """
-# Output ID: A
-
-```
-{best_output}
-```
-
-# Output ID: B
-
-```
-{output}
-```
-
-# Acceptance Criteria
-
-{acceptance_criteria}
-
-# Expected Output
-
-```
-{expected_output}
-```
+        ("human", """<|Start_Output_ID_1|>{best_output}<|End_Output_ID_1|>
+<|Start_Output_ID_2|>{output}<|End_Output_ID_2|>
+<|Start_Acceptance_Criteria|>{acceptance_criteria}<|End_Acceptance_Criteria|>
+<|Start_Expected_Output|>{expected_output}<|End_Expected_Output|>
     """)
     ]),
     NODE_PROMPT_ANALYZER: ChatPromptTemplate.from_messages([
-        ("system", """You are a text comparing program. You compare the following output texts, analysis the System Message and provide a detailed analysis according to `Acceptance Criteria`. Then you decide whether `Actual Output` is acceptable.
-
-Provide your analysis in the following format:
-
-```
-- Acceptable Differences: [List acceptable differences succinctly]
-- Unacceptable Differences: [List unacceptable differences succinctly]
-- Accept: [Yes/No]
+        ("system", """**TASK:** Compare the Expected Output with the Actual Output according to the Acceptance Criteria. Provide a JSON output with your analysis.
+
+**Requirements:**
+- Compare Expected and Actual Outputs strictly following the Acceptance Criteria.
+- Set `Accept` to "Yes" only if all criteria are met; otherwise, set it to "No."
+- List acceptable and unacceptable differences based on the criteria.
+
+**Output Format:** JSON with:
+- `Accept: (Yes/No)`
+- `Acceptable Differences: []`
+- `Unacceptable Differences: []`
+
+**Example Output:**
+```json
+{{
+    "Accept": "No",
+    "Acceptable Differences": [
+        "Spelling variations: 'colour' vs 'color'"
+    ],
+    "Unacceptable Differences": [
+        "Missing section: 'Conclusion'",
+        "Incorrect date format: '2023/10/12' vs '12-10-2023'"
+    ]
+}}
 ```
 
-* Compare Expected Output and Actual Output with the guidance of Accept Criteria.
-* Only set 'Accept' to 'Yes', if Accept Criteria are all met. Otherwise, set 'Accept' to 'No'.
-* List only the acceptable differences according to Accept Criteria in 'acceptable Differences' section.
-* List only the unacceptable differences according to Accept Criteria in 'Unacceptable Differences' section.
-
 # Acceptance Criteria
 
-```
 {acceptance_criteria}
-```
 """),
-        ("human", """
-# System Message
-
-```
-{system_message}
-```
-
-# Expected Output
+        ("human", """# Expected Output
 
 ```
 {expected_output}
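
The rewritten templates commit the analyzer nodes to machine-parseable JSON (`closerOutputID`, `Accept`) in place of the old Markdown headings. The code that consumes these replies lives in `meta_prompt/meta_prompt.py` and is not part of this diff; below is a hedged sketch, with hypothetical helper names, of how such replies can be parsed:

```python
# Hypothetical helpers (not from this repo) showing how the new JSON
# verdicts can be consumed; the field names match the new prompt templates.
import json

def is_accepted(analyzer_content: str) -> bool:
    """True when the prompt analyzer's verdict is {"Accept": "Yes"}."""
    return json.loads(analyzer_content).get("Accept") == "Yes"

def closer_output_id(history_content: str) -> int:
    """1 or 2 for the closer output, 0 for a draw."""
    return int(json.loads(history_content).get("closerOutputID", 0))

assert is_accepted('{"Accept": "Yes"}')
assert closer_output_id('{"closerOutputID": 2, "analysis": "..."}') == 2
```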
meta_prompt/meta_prompt.py CHANGED
@@ -116,11 +116,6 @@ class MetaPromptGraph:
         self.prompt_templates.update(prompts)
 
         self.aggressive_exploration = aggressive_exploration
-
-        # Bind response_format to llm here
-        nodes_to_bind = [NODE_OUTPUT_HISTORY_ANALYZER, NODE_PROMPT_ANALYZER, NODE_PROMPT_SUGGESTER]
-        for node in nodes_to_bind:
-            self.llms[node] = self.llms[node].bind(response_format={"type": "json_object"})
 
     def _create_acceptance_criteria_workflow(self) -> StateGraph:
         """
tests/meta_prompt_graph_test.py CHANGED
@@ -7,6 +7,10 @@ from langchain_openai import ChatOpenAI
 from meta_prompt import *
 from meta_prompt.consts import NODE_ACCEPTANCE_CRITERIA_DEVELOPER
 from langgraph.graph import END
+import os
+# from dotenv import load_dotenv
+
+# load_dotenv()
 
 class TestMetaPromptGraph(unittest.TestCase):
     def setUp(self):
@@ -54,12 +58,7 @@ class TestMetaPromptGraph(unittest.TestCase):
         llms = {
             "output_history_analyzer": MagicMock(
                 invoke=lambda prompt: MagicMock(
-                    content="""# Analysis
-
-This analysis compares two outputs to the expected output based on specific
-criteria.
-
-# Output ID closer to Expected Output: B"""
+                    content="{\"closerOutputID\": 2, \"analysis\": \"The output should use the `reverse()` method.\"}"
                 )
             )
         }
@@ -99,7 +98,7 @@ class TestMetaPromptGraph(unittest.TestCase):
         """
         llms = {
             NODE_PROMPT_ANALYZER: MagicMock(
-                invoke=lambda prompt: MagicMock(content="Accept: Yes")
+                invoke=lambda prompt: MagicMock(content="{\"Accept\": \"Yes\"}")
             )
         }
         meta_prompt_graph = MetaPromptGraph(llms=llms)
@@ -133,10 +132,20 @@ class TestMetaPromptGraph(unittest.TestCase):
         executes it with a given input state. It then verifies that the output
         state contains the expected keys and values.
         """
-        model_name = "google/gemma-2-9b-it"
-        llm = ChatOpenAI(model_name=model_name)
+        model_name = os.getenv("TEST_MODEL_NAME_EXECUTOR")
+        raw_llm = ChatOpenAI(model_name=model_name)
 
-        meta_prompt_graph = MetaPromptGraph(llms=llm)
+        llms = {
+            NODE_PROMPT_INITIAL_DEVELOPER: raw_llm,
+            NODE_ACCEPTANCE_CRITERIA_DEVELOPER: raw_llm,
+            NODE_PROMPT_DEVELOPER: raw_llm,
+            NODE_PROMPT_EXECUTOR: raw_llm,
+            NODE_OUTPUT_HISTORY_ANALYZER: raw_llm.bind(response_format={"type": "json_object"}),
+            NODE_PROMPT_ANALYZER: raw_llm.bind(response_format={"type": "json_object"}),
+            NODE_PROMPT_SUGGESTER: raw_llm,
+        }
+
+        meta_prompt_graph = MetaPromptGraph(llms=llms)
         input_state = AgentState(
             user_message="How do I reverse a list in Python?",
             expected_output="Use the `[::-1]` slicing technique or the "
@@ -161,7 +170,7 @@ class TestMetaPromptGraph(unittest.TestCase):
 
         user_message = "How can I create a list of numbers in Python?"
         messages = [("system", output_state["best_system_message"]), ("human", user_message)]
-        result = llm.invoke(messages)
+        result = raw_llm.invoke(messages)
 
         assert hasattr(result, "content"), "The result should have the attribute 'content'"
         print(result.content)
@@ -176,10 +185,10 @@ class TestMetaPromptGraph(unittest.TestCase):
         state contains the expected keys and values.
         """
         optimizer_llm = ChatOpenAI(
-            model_name="deepseek/deepseek-chat", temperature=0.5
+            model_name=os.getenv("TEST_MODEL_NAME_OPTIMIZER"), temperature=0.5
         )
         executor_llm = ChatOpenAI(
-            model_name="meta-llama/llama-3-8b-instruct", temperature=0.01
+            model_name=os.getenv("TEST_MODEL_NAME_EXECUTOR"), temperature=0.01
         )
 
         llms = {
@@ -188,7 +197,7 @@ class TestMetaPromptGraph(unittest.TestCase):
             NODE_PROMPT_DEVELOPER: optimizer_llm,
             NODE_PROMPT_EXECUTOR: executor_llm,
             NODE_OUTPUT_HISTORY_ANALYZER: optimizer_llm,
-            NODE_PROMPT_ANALYZER: optimizer_llm,
+            NODE_PROMPT_ANALYZER: optimizer_llm.bind(response_format={"type": "json_object"}),
             NODE_PROMPT_SUGGESTER: optimizer_llm,
         }
 
@@ -236,7 +245,7 @@ class TestMetaPromptGraph(unittest.TestCase):
         responses = [
             Mock(type="content", content="Explain how to reverse a list in Python."), # NODE_PROMPT_INITIAL_DEVELOPER
             Mock(type="content", content="Here's one way: `my_list[::-1]`"), # NODE_PROMPT_EXECUTOR
-            Mock(type="content", content="Accept: Yes"), # NODE_PPROMPT_ANALYZER
+            Mock(type="content", content="{\"Accept\": \"Yes\"}"), # NODE_PPROMPT_ANALYZER
         ]
         llm.invoke = functools.partial(next, iter(responses))
 
@@ -270,12 +279,12 @@ class TestMetaPromptGraph(unittest.TestCase):
         responses = [
             Mock(type="content", content="Explain how to reverse a list in Python."), # NODE_PROMPT_INITIAL_DEVELOPER
             Mock(type="content", content="Here's one way: `my_list[::-1]`"), # NODE_PROMPT_EXECUTOR
-            Mock(type="content", content="Accept: No"), # NODE_PPROMPT_ANALYZER
+            Mock(type="content", content="{\"Accept\": \"No\"}"), # NODE_PPROMPT_ANALYZER
             Mock(type="content", content="Try using the `reverse()` method instead."), # NODE_PROMPT_SUGGESTER
             Mock(type="content", content="Explain how to reverse a list in Python. Output in a Markdown List."), # NODE_PROMPT_DEVELOPER
             Mock(type="content", content="Here's one way: `my_list.reverse()`"), # NODE_PROMPT_EXECUTOR
-            Mock(type="content", content="# Output ID closer to Expected Output: B"), # NODE_OUTPUT_HISTORY_ANALYZER
-            Mock(type="content", content="Accept: Yes"), # NODE_PPROMPT_ANALYZER
+            Mock(type="content", content="{\"closerOutputID\": 2, \"analysis\": \"The output should use the `reverse()` method.\"}"), # NODE_OUTPUT_HISTORY_ANALYZER
+            Mock(type="content", content="{\"Accept\": \"Yes\"}"), # NODE_PPROMPT_ANALYZER
         ]
         llm.invoke = lambda _: responses.pop(0)
 
@@ -303,7 +312,7 @@ class TestMetaPromptGraph(unittest.TestCase):
         """
 
         llms = {
-            NODE_ACCEPTANCE_CRITERIA_DEVELOPER: ChatOpenAI(model_name="deepseek/deepseek-chat")
+            NODE_ACCEPTANCE_CRITERIA_DEVELOPER: ChatOpenAI(model_name=os.getenv("TEST_MODEL_NAME_ACCEPTANCE_CRITERIA_DEVELOPER"))
        }
        meta_prompt_graph = MetaPromptGraph(llms=llms)
        workflow = meta_prompt_graph._create_acceptance_criteria_workflow()
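
The test-double pattern used throughout these hunks is worth spelling out: each fake LLM's `invoke` returns an object whose `.content` holds exactly the JSON string the new prompts demand, so the graph logic is exercised without any network access or API key. A self-contained sketch of that pattern:

```python
# Standalone sketch of the mocking pattern above: the fake invoke() always
# returns a canned JSON verdict, mirroring what the bound real LLMs emit.
import json
from unittest.mock import MagicMock

fake_analyzer = MagicMock(
    invoke=lambda prompt: MagicMock(content='{"Accept": "Yes"}')
)

reply = fake_analyzer.invoke("any prompt")
assert json.loads(reply.content)["Accept"] == "Yes"
```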