Unit tests work now.

- app/gradio_meta_prompt_utils.py +6 -0
- meta_prompt/consts.py +77 -69
- meta_prompt/meta_prompt.py +0 -5
- tests/meta_prompt_graph_test.py +28 -19
app/gradio_meta_prompt_utils.py
CHANGED
@@ -466,6 +466,12 @@ def process_message_with_models(
         NODE_PROMPT_ANALYZER: initialize_llm(config, analyzer_model_name, {'temperature': analyzer_temperature}),
         NODE_PROMPT_SUGGESTER: initialize_llm(config, suggester_model_name, {'temperature': suggester_temperature})
     }
+
+    # Bind response_format to llm here
+    nodes_to_bind = [NODE_OUTPUT_HISTORY_ANALYZER, NODE_PROMPT_ANALYZER, NODE_PROMPT_SUGGESTER]
+    for node in nodes_to_bind:
+        llms[node] = llms[node].bind(response_format={"type": "json_object"})
+
     meta_prompt_graph = MetaPromptGraph(llms=llms, prompts=prompt_templates,
                                         aggressive_exploration=aggressive_exploration,
                                         verbose=config.verbose, logger=logger)
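For context, a minimal sketch of what the newly added `bind(response_format={"type": "json_object"})` call does with a LangChain chat model; the model name below is a placeholder, not something this commit specifies:

```python
from langchain_openai import ChatOpenAI

# Placeholder model; any OpenAI chat model that supports JSON mode works.
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

# bind() returns a new runnable that forwards the extra kwargs on every call,
# so every downstream invoke() requests JSON-mode output from the API.
json_llm = llm.bind(response_format={"type": "json_object"})

result = json_llm.invoke([
    ("system", "Return a JSON object with a single key 'answer'."),
    ("human", "What is 2 + 2?"),
])
print(result.content)  # e.g. '{"answer": 4}'
```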
meta_prompt/consts.py
CHANGED
@@ -166,85 +166,93 @@ Create a [name], Here's the descriptions [description]. Start with "GPT Descript
         ("human", "{user_message}")
     ]),
     NODE_OUTPUT_HISTORY_ANALYZER: ChatPromptTemplate.from_messages([
-        ("system", """
-        ...
+        ("system", """{{
+  "task_description": "You are a text comparing program. Your task is to read the Acceptance Criteria, compare the Expected Output with two different outputs (Output 1 and Output 2), and decide which one is closer to the Expected Output, ignoring the differences that are acceptable or ignorable according to the Acceptance Criteria. Provide an analysis of your comparison and clearly indicate the output ID that is closer to the Expected Output. Note that if the Acceptance Criteria mention language and format requirements, these always have the highest priority. Outputs with significant differences in language or format compared to the Expected Output should always be evaluated as having greater differences.",
+  "requirements": [
+    "Read and understand the provided Acceptance Criteria carefully.",
+    "Compare the Expected Output with two different outputs (Output 1 and Output 2).",
+    "Ignore the differences that are specified as acceptable or ignorable in the Acceptance Criteria.",
+    "Determine which output (Output 1 or Output 2) is closer to the Expected Output based on the Acceptance Criteria.",
+    "Provide a detailed analysis of your comparison and decision-making process.",
+    "Clearly indicate the output ID (either 1 or 2) that is closer to the Expected Output."
+  ],
+  "output_format": {{
+    "type": "object",
+    "properties": {{
+      "analysis": {{
+        "type": "string",
+        "description": "A detailed analysis explaining the comparison and decision-making process based on the Acceptance Criteria."
+      }},
+      "closerOutputID": {{
+        "type": "integer",
+        "description": "The output ID (1 or 2) that is closer to the Expected Output, or 0 if both outputs are equally close."
+      }}
+    }},
+    "required": [
+      "analysis",
+      "closerOutputID"
+    ]
+  }},
+  "output_example": {{
+    "analysis": "The Acceptance Criteria specified that the output should be in English and follow a specific JSON format. Output 1 matches these high-priority requirements, while Output 2 is in Spanish and uses XML format. Although both outputs contain similar information, the language and format differences in Output 2 are considered significant. Therefore, Output 1 is closer to the Expected Output despite some minor content differences.",
+    "closerOutputID": 1
+  }},
+
+  "evaluation_criteria": [
+    "The analysis should demonstrate a clear understanding of the Acceptance Criteria, with the highest priority given to language and format requirements if specified.",
+    "The comparison should accurately identify and ignore acceptable or ignorable differences, while emphasizing significant language or format discrepancies.",
+    "The decision should be based on a thorough analysis of the outputs in relation to the Expected Output, prioritizing language and format matching when required.",
+    "The output ID indicated as closer to the Expected Output should align with the analysis, reflecting the importance of language and format requirements."
+  ],
+  "error_handling": [
+    "If the Acceptance Criteria are unclear or contradictory, provide an analysis explaining the ambiguity and suggest possible interpretations.",
+    "If neither output is closer to the Expected Output, provide an analysis explaining why and use \"closerOutputID\": 0."
+  ],
+  "ethical_considerations": [
+    "Ensure that the comparison process is unbiased and solely based on the Acceptance Criteria.",
+    "Do not introduce personal opinions or preferences into the analysis."
+  ],
+  "conclusion": "Confirm that your output adheres to the specified language and format, includes a detailed analysis, and clearly indicates the closer output ID based on the Acceptance Criteria."
+}}
 """),
-        ("human", """
-        ...
-{best_output}
-```
-
-# Output ID: B
-
-```
-{output}
-```
-
-# Acceptance Criteria
-
-{acceptance_criteria}
-
-# Expected Output
-
-```
-{expected_output}
-```
+        ("human", """<|Start_Output_ID_1|>{best_output}<|End_Output_ID_1|>
+<|Start_Output_ID_2|>{output}<|End_Output_ID_2|>
+<|Start_Acceptance_Criteria|>{acceptance_criteria}<|End_Acceptance_Criteria|>
+<|Start_Expected_Output|>{expected_output}<|End_Expected_Output|>
 """)
     ]),
     NODE_PROMPT_ANALYZER: ChatPromptTemplate.from_messages([
-        ("system", """
-        ...
+        ("system", """**TASK:** Compare the Expected Output with the Actual Output according to the Acceptance Criteria. Provide a JSON output with your analysis.
+
+**Requirements:**
+- Compare Expected and Actual Outputs strictly following the Acceptance Criteria.
+- Set `Accept` to "Yes" only if all criteria are met; otherwise, set it to "No."
+- List acceptable and unacceptable differences based on the criteria.
+
+**Output Format:** JSON with:
+- `Accept: (Yes/No)`
+- `Acceptable Differences: []`
+- `Unacceptable Differences: []`
+
+**Example Output:**
+```json
+{{
+  "Accept": "No",
+  "Acceptable Differences": [
+    "Spelling variations: 'colour' vs 'color'"
+  ],
+  "Unacceptable Differences": [
+    "Missing section: 'Conclusion'",
+    "Incorrect date format: '2023/10/12' vs '12-10-2023'"
+  ]
+}}
 ```

-* Compare Expected Output and Actual Output with the guidance of Accept Criteria.
-* Only set 'Accept' to 'Yes', if Accept Criteria are all met. Otherwise, set 'Accept' to 'No'.
-* List only the acceptable differences according to Accept Criteria in 'acceptable Differences' section.
-* List only the unacceptable differences according to Accept Criteria in 'Unacceptable Differences' section.
-
 # Acceptance Criteria

-```
 {acceptance_criteria}
-```
 """),
-        ("human", """
-# System Message
-
-```
-{system_message}
-```
-
-# Expected Output
+        ("human", """# Expected Output

 ```
 {expected_output}
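Since the rewritten prompts instruct the analyzer nodes to reply in JSON (and the app now enables JSON mode for those nodes), callers can parse the responses directly. A small illustrative sketch, not part of this commit:

```python
import json

# Hypothetical raw analyzer reply, in the shape the new NODE_PROMPT_ANALYZER prompt asks for.
raw = '{"Accept": "No", "Acceptable Differences": [], "Unacceptable Differences": ["Missing section: Conclusion"]}'

verdict = json.loads(raw)
accepted = verdict["Accept"] == "Yes"
print(accepted, verdict["Unacceptable Differences"])
```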
meta_prompt/meta_prompt.py
CHANGED
@@ -116,11 +116,6 @@ class MetaPromptGraph:
         self.prompt_templates.update(prompts)

         self.aggressive_exploration = aggressive_exploration
-
-        # Bind response_format to llm here
-        nodes_to_bind = [NODE_OUTPUT_HISTORY_ANALYZER, NODE_PROMPT_ANALYZER, NODE_PROMPT_SUGGESTER]
-        for node in nodes_to_bind:
-            self.llms[node] = self.llms[node].bind(response_format={"type": "json_object"})

     def _create_acceptance_criteria_workflow(self) -> StateGraph:
         """
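With the binding removed from the constructor, `MetaPromptGraph` now expects whoever builds the `llms` dict to pass models already configured for JSON output where needed. A rough sketch of the calling pattern, mirroring the app change above; the model name is a placeholder and the imports assume the node constants live in `meta_prompt.consts`:

```python
from langchain_openai import ChatOpenAI
from meta_prompt import MetaPromptGraph
from meta_prompt.consts import (
    NODE_OUTPUT_HISTORY_ANALYZER,
    NODE_PROMPT_ANALYZER,
    NODE_PROMPT_EXECUTOR,
    NODE_PROMPT_SUGGESTER,
)

llm = ChatOpenAI(model_name="gpt-4o-mini")  # placeholder model name
json_llm = llm.bind(response_format={"type": "json_object"})

# JSON-producing nodes get the bound runnable; the executor keeps the plain model.
llms = {
    NODE_PROMPT_EXECUTOR: llm,
    NODE_OUTPUT_HISTORY_ANALYZER: json_llm,
    NODE_PROMPT_ANALYZER: json_llm,
    NODE_PROMPT_SUGGESTER: json_llm,
}
meta_prompt_graph = MetaPromptGraph(llms=llms)
```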
tests/meta_prompt_graph_test.py
CHANGED
@@ -7,6 +7,10 @@ from langchain_openai import ChatOpenAI
 from meta_prompt import *
 from meta_prompt.consts import NODE_ACCEPTANCE_CRITERIA_DEVELOPER
 from langgraph.graph import END
+import os
+# from dotenv import load_dotenv
+
+# load_dotenv()

 class TestMetaPromptGraph(unittest.TestCase):
     def setUp(self):
@@ -54,12 +58,7 @@ class TestMetaPromptGraph(unittest.TestCase):
         llms = {
             "output_history_analyzer": MagicMock(
                 invoke=lambda prompt: MagicMock(
-                    content="""
-                    ...
-This analysis compares two outputs to the expected output based on specific
-criteria.
-                    ...
-# Output ID closer to Expected Output: B"""
+                    content="{\"closerOutputID\": 2, \"analysis\": \"The output should use the `reverse()` method.\"}"
                 )
             )
         }
@@ -99,7 +98,7 @@ class TestMetaPromptGraph(unittest.TestCase):
         """
         llms = {
             NODE_PROMPT_ANALYZER: MagicMock(
-                invoke=lambda prompt: MagicMock(content="Accept: Yes")
+                invoke=lambda prompt: MagicMock(content="{\"Accept\": \"Yes\"}")
             )
         }
         meta_prompt_graph = MetaPromptGraph(llms=llms)
@@ -133,10 +132,20 @@ class TestMetaPromptGraph(unittest.TestCase):
         executes it with a given input state. It then verifies that the output
         state contains the expected keys and values.
         """
-        model_name = "
-        ...
+        model_name = os.getenv("TEST_MODEL_NAME_EXECUTOR")
+        raw_llm = ChatOpenAI(model_name=model_name)

-        ...
+        llms = {
+            NODE_PROMPT_INITIAL_DEVELOPER: raw_llm,
+            NODE_ACCEPTANCE_CRITERIA_DEVELOPER: raw_llm,
+            NODE_PROMPT_DEVELOPER: raw_llm,
+            NODE_PROMPT_EXECUTOR: raw_llm,
+            NODE_OUTPUT_HISTORY_ANALYZER: raw_llm.bind(response_format={"type": "json_object"}),
+            NODE_PROMPT_ANALYZER: raw_llm.bind(response_format={"type": "json_object"}),
+            NODE_PROMPT_SUGGESTER: raw_llm,
+        }
+
+        meta_prompt_graph = MetaPromptGraph(llms=llms)
         input_state = AgentState(
             user_message="How do I reverse a list in Python?",
             expected_output="Use the `[::-1]` slicing technique or the "
@@ -161,7 +170,7 @@ class TestMetaPromptGraph(unittest.TestCase):

         user_message = "How can I create a list of numbers in Python?"
         messages = [("system", output_state["best_system_message"]), ("human", user_message)]
-        result =
+        result = raw_llm.invoke(messages)

         assert hasattr(result, "content"), "The result should have the attribute 'content'"
         print(result.content)
@@ -176,10 +185,10 @@ class TestMetaPromptGraph(unittest.TestCase):
         state contains the expected keys and values.
         """
         optimizer_llm = ChatOpenAI(
-            model_name="
+            model_name=os.getenv("TEST_MODEL_NAME_OPTIMIZER"), temperature=0.5
         )
         executor_llm = ChatOpenAI(
-            model_name="
+            model_name=os.getenv("TEST_MODEL_NAME_EXECUTOR"), temperature=0.01
         )

         llms = {
@@ -188,7 +197,7 @@ class TestMetaPromptGraph(unittest.TestCase):
             NODE_PROMPT_DEVELOPER: optimizer_llm,
             NODE_PROMPT_EXECUTOR: executor_llm,
             NODE_OUTPUT_HISTORY_ANALYZER: optimizer_llm,
-            NODE_PROMPT_ANALYZER: optimizer_llm,
+            NODE_PROMPT_ANALYZER: optimizer_llm.bind(response_format={"type": "json_object"}),
             NODE_PROMPT_SUGGESTER: optimizer_llm,
         }

@@ -236,7 +245,7 @@ class TestMetaPromptGraph(unittest.TestCase):
         responses = [
             Mock(type="content", content="Explain how to reverse a list in Python."), # NODE_PROMPT_INITIAL_DEVELOPER
             Mock(type="content", content="Here's one way: `my_list[::-1]`"), # NODE_PROMPT_EXECUTOR
-            Mock(type="content", content="Accept: Yes"), # NODE_PPROMPT_ANALYZER
+            Mock(type="content", content="{\"Accept\": \"Yes\"}"), # NODE_PPROMPT_ANALYZER
         ]
         llm.invoke = functools.partial(next, iter(responses))

@@ -270,12 +279,12 @@ class TestMetaPromptGraph(unittest.TestCase):
         responses = [
             Mock(type="content", content="Explain how to reverse a list in Python."), # NODE_PROMPT_INITIAL_DEVELOPER
             Mock(type="content", content="Here's one way: `my_list[::-1]`"), # NODE_PROMPT_EXECUTOR
-            Mock(type="content", content="Accept: No"), # NODE_PPROMPT_ANALYZER
+            Mock(type="content", content="{\"Accept\": \"No\"}"), # NODE_PPROMPT_ANALYZER
             Mock(type="content", content="Try using the `reverse()` method instead."), # NODE_PROMPT_SUGGESTER
             Mock(type="content", content="Explain how to reverse a list in Python. Output in a Markdown List."), # NODE_PROMPT_DEVELOPER
             Mock(type="content", content="Here's one way: `my_list.reverse()`"), # NODE_PROMPT_EXECUTOR
-            Mock(type="content", content="
-            Mock(type="content", content="Accept: Yes"), # NODE_PPROMPT_ANALYZER
+            Mock(type="content", content="{\"closerOutputID\": 2, \"analysis\": \"The output should use the `reverse()` method.\"}"), # NODE_OUTPUT_HISTORY_ANALYZER
+            Mock(type="content", content="{\"Accept\": \"Yes\"}"), # NODE_PPROMPT_ANALYZER
         ]
         llm.invoke = lambda _: responses.pop(0)

@@ -303,7 +312,7 @@ class TestMetaPromptGraph(unittest.TestCase):
         """

         llms = {
-            NODE_ACCEPTANCE_CRITERIA_DEVELOPER: ChatOpenAI(model_name="
+            NODE_ACCEPTANCE_CRITERIA_DEVELOPER: ChatOpenAI(model_name=os.getenv("TEST_MODEL_NAME_ACCEPTANCE_CRITERIA_DEVELOPER"))
         }
         meta_prompt_graph = MetaPromptGraph(llms=llms)
         workflow = meta_prompt_graph._create_acceptance_criteria_workflow()
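The tests now read their model names from environment variables instead of hard-coding them. A possible local runner script, with placeholder model values (substitute whichever OpenAI-compatible models you have access to):

```python
import os
import subprocess

# Variable names come from the diff above; the values here are placeholders.
os.environ["TEST_MODEL_NAME_EXECUTOR"] = "gpt-4o-mini"
os.environ["TEST_MODEL_NAME_OPTIMIZER"] = "gpt-4o"
os.environ["TEST_MODEL_NAME_ACCEPTANCE_CRITERIA_DEVELOPER"] = "gpt-4o-mini"
# ChatOpenAI also needs OPENAI_API_KEY set (or loaded from a .env file, matching
# the commented-out load_dotenv() call in the test module).

subprocess.run(
    ["python", "-m", "unittest", "tests/meta_prompt_graph_test.py", "-v"],
    check=True,
)
```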