yaleh committed
Commit a291864 · 1 Parent(s): 80da0e3

Unit tests work now.

app/gradio_meta_prompt_utils.py CHANGED
@@ -466,6 +466,12 @@ def process_message_with_models(
         NODE_PROMPT_ANALYZER: initialize_llm(config, analyzer_model_name, {'temperature': analyzer_temperature}),
         NODE_PROMPT_SUGGESTER: initialize_llm(config, suggester_model_name, {'temperature': suggester_temperature})
     }
+
+    # Bind response_format to llm here
+    nodes_to_bind = [NODE_OUTPUT_HISTORY_ANALYZER, NODE_PROMPT_ANALYZER, NODE_PROMPT_SUGGESTER]
+    for node in nodes_to_bind:
+        llms[node] = llms[node].bind(response_format={"type": "json_object"})
+
     meta_prompt_graph = MetaPromptGraph(llms=llms, prompts=prompt_templates,
                                         aggressive_exploration=aggressive_exploration,
                                         verbose=config.verbose, logger=logger)
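
Note on the call above: `bind` here is LangChain's generic `Runnable.bind`, which returns a wrapper that forwards extra keyword arguments to the underlying chat model on every invocation; with `ChatOpenAI`, `response_format={"type": "json_object"}` turns on the provider's JSON mode. A minimal sketch, outside this commit and with a hypothetical model name and placeholder key:

```python
# Sketch only: bind() does not mutate the original model; it returns a
# RunnableBinding that adds the kwargs at invocation time.
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model_name="gpt-4o-mini",        # hypothetical model name
                 openai_api_key="sk-placeholder")  # placeholder; use a real key
json_llm = llm.bind(response_format={"type": "json_object"})

# json_llm requests JSON mode on every call; llm itself is unchanged.
# result = json_llm.invoke('Reply with {"ok": true} as JSON.')
# print(result.content)
```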
meta_prompt/consts.py CHANGED
@@ -166,85 +166,93 @@ Create a [name], Here's the descriptions [description]. Start with "GPT Descript
         ("human", "{user_message}")
     ]),
     NODE_OUTPUT_HISTORY_ANALYZER: ChatPromptTemplate.from_messages([
-        ("system", """You are a text comparing program. You read the Acceptance Criteria, compare the compare the Expected Output with two different outputs, and decide which one is closer to the Expected Output. When comparing the outputs, ignore the differences which are acceptable or ignorable according to the Acceptance Criteria.
-
-You output the following analysis according to the Acceptance Criteria:
-
-* Your analysis in a Markdown list.
-* Indicates an output ID that is closer to the Expected Output, in the following format:
-
-```
-# Analysis
-
-...
-
-# Output ID closer to Expected Output: [ID]
-```
-
-You must choose one of the two outputs. If both outputs are exactly the same, output the following:
-
-```
-# Analysis
-
-...
-
-# Draw
-```
+        ("system", """{{
+    "task_description": "You are a text comparing program. Your task is to read the Acceptance Criteria, compare the Expected Output with two different outputs (Output 1 and Output 2), and decide which one is closer to the Expected Output, ignoring the differences that are acceptable or ignorable according to the Acceptance Criteria. Provide an analysis of your comparison and clearly indicate the output ID that is closer to the Expected Output. Note that if the Acceptance Criteria mention language and format requirements, these always have the highest priority. Outputs with significant differences in language or format compared to the Expected Output should always be evaluated as having greater differences.",
+    "requirements": [
+        "Read and understand the provided Acceptance Criteria carefully.",
+        "Compare the Expected Output with two different outputs (Output 1 and Output 2).",
+        "Ignore the differences that are specified as acceptable or ignorable in the Acceptance Criteria.",
+        "Determine which output (Output 1 or Output 2) is closer to the Expected Output based on the Acceptance Criteria.",
+        "Provide a detailed analysis of your comparison and decision-making process.",
+        "Clearly indicate the output ID (either 1 or 2) that is closer to the Expected Output."
+    ],
+    "output_format": {{
+        "type": "object",
+        "properties": {{
+            "analysis": {{
+                "type": "string",
+                "description": "A detailed analysis explaining the comparison and decision-making process based on the Acceptance Criteria."
+            }},
+            "closerOutputID": {{
+                "type": "integer",
+                "description": "The output ID (1 or 2) that is closer to the Expected Output, or 0 if both outputs are equally close."
+            }}
+        }},
+        "required": [
+            "analysis",
+            "closerOutputID"
+        ]
+    }},
+    "output_example": {{
+        "analysis": "The Acceptance Criteria specified that the output should be in English and follow a specific JSON format. Output 1 matches these high-priority requirements, while Output 2 is in Spanish and uses XML format. Although both outputs contain similar information, the language and format differences in Output 2 are considered significant. Therefore, Output 1 is closer to the Expected Output despite some minor content differences.",
+        "closerOutputID": 1
+    }},
+
+    "evaluation_criteria": [
+        "The analysis should demonstrate a clear understanding of the Acceptance Criteria, with the highest priority given to language and format requirements if specified.",
+        "The comparison should accurately identify and ignore acceptable or ignorable differences, while emphasizing significant language or format discrepancies.",
+        "The decision should be based on a thorough analysis of the outputs in relation to the Expected Output, prioritizing language and format matching when required.",
+        "The output ID indicated as closer to the Expected Output should align with the analysis, reflecting the importance of language and format requirements."
+    ],
+    "error_handling": [
+        "If the Acceptance Criteria are unclear or contradictory, provide an analysis explaining the ambiguity and suggest possible interpretations.",
+        "If neither output is closer to the Expected Output, provide an analysis explaining why and use \"closerOutputID\": 0."
+    ],
+    "ethical_considerations": [
+        "Ensure that the comparison process is unbiased and solely based on the Acceptance Criteria.",
+        "Do not introduce personal opinions or preferences into the analysis."
+    ],
+    "conclusion": "Confirm that your output adheres to the specified language and format, includes a detailed analysis, and clearly indicates the closer output ID based on the Acceptance Criteria."
+}}
     """),
-        ("human", """
-# Output ID: A
-
-```
-{best_output}
-```
-
-# Output ID: B
-
-```
-{output}
-```
-
-# Acceptance Criteria
-
-{acceptance_criteria}
-
-# Expected Output
-
-```
-{expected_output}
-```
+        ("human", """<|Start_Output_ID_1|>{best_output}<|End_Output_ID_1|>
+<|Start_Output_ID_2|>{output}<|End_Output_ID_2|>
+<|Start_Acceptance_Criteria|>{acceptance_criteria}<|End_Acceptance_Criteria|>
+<|Start_Expected_Output|>{expected_output}<|End_Expected_Output|>
     """)
     ]),
     NODE_PROMPT_ANALYZER: ChatPromptTemplate.from_messages([
-        ("system", """You are a text comparing program. You compare the following output texts, analysis the System Message and provide a detailed analysis according to `Acceptance Criteria`. Then you decide whether `Actual Output` is acceptable.
-
-Provide your analysis in the following format:
-
-```
-- Acceptable Differences: [List acceptable differences succinctly]
-- Unacceptable Differences: [List unacceptable differences succinctly]
-- Accept: [Yes/No]
+        ("system", """**TASK:** Compare the Expected Output with the Actual Output according to the Acceptance Criteria. Provide a JSON output with your analysis.
+
+**Requirements:**
+- Compare Expected and Actual Outputs strictly following the Acceptance Criteria.
+- Set `Accept` to "Yes" only if all criteria are met; otherwise, set it to "No."
+- List acceptable and unacceptable differences based on the criteria.
+
+**Output Format:** JSON with:
+- `Accept: (Yes/No)`
+- `Acceptable Differences: []`
+- `Unacceptable Differences: []`
+
+**Example Output:**
+```json
+{{
+    "Accept": "No",
+    "Acceptable Differences": [
+        "Spelling variations: 'colour' vs 'color'"
+    ],
+    "Unacceptable Differences": [
+        "Missing section: 'Conclusion'",
+        "Incorrect date format: '2023/10/12' vs '12-10-2023'"
+    ]
+}}
 ```
 
-* Compare Expected Output and Actual Output with the guidance of Accept Criteria.
-* Only set 'Accept' to 'Yes', if Accept Criteria are all met. Otherwise, set 'Accept' to 'No'.
-* List only the acceptable differences according to Accept Criteria in 'acceptable Differences' section.
-* List only the unacceptable differences according to Accept Criteria in 'Unacceptable Differences' section.
-
 # Acceptance Criteria
 
-```
 {acceptance_criteria}
-```
 """),
-        ("human", """
-# System Message
-
-```
-{system_message}
-```
-
-# Expected Output
+        ("human", """# Expected Output
 
 ```
 {expected_output}
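
The rewritten templates commit the analyzer nodes to machine-parseable JSON (`closerOutputID`, `Accept`) in place of the old Markdown headings. The code that consumes these replies lives in `meta_prompt/meta_prompt.py` and is not part of this diff; below is a hedged sketch, with hypothetical helper names, of how such replies can be parsed:

```python
# Hypothetical helpers (not from this repo) showing how the new JSON
# verdicts can be consumed; the field names match the new prompt templates.
import json

def is_accepted(analyzer_content: str) -> bool:
    """True when the prompt analyzer's verdict is {"Accept": "Yes"}."""
    return json.loads(analyzer_content).get("Accept") == "Yes"

def closer_output_id(history_content: str) -> int:
    """1 or 2 for the closer output, 0 for a draw."""
    return int(json.loads(history_content).get("closerOutputID", 0))

assert is_accepted('{"Accept": "Yes"}')
assert closer_output_id('{"closerOutputID": 2, "analysis": "..."}') == 2
```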
meta_prompt/meta_prompt.py CHANGED
@@ -116,11 +116,6 @@ class MetaPromptGraph:
         self.prompt_templates.update(prompts)
 
         self.aggressive_exploration = aggressive_exploration
-
-        # Bind response_format to llm here
-        nodes_to_bind = [NODE_OUTPUT_HISTORY_ANALYZER, NODE_PROMPT_ANALYZER, NODE_PROMPT_SUGGESTER]
-        for node in nodes_to_bind:
-            self.llms[node] = self.llms[node].bind(response_format={"type": "json_object"})
 
     def _create_acceptance_criteria_workflow(self) -> StateGraph:
         """
tests/meta_prompt_graph_test.py CHANGED
@@ -7,6 +7,10 @@ from langchain_openai import ChatOpenAI
 from meta_prompt import *
 from meta_prompt.consts import NODE_ACCEPTANCE_CRITERIA_DEVELOPER
 from langgraph.graph import END
+import os
+# from dotenv import load_dotenv
+
+# load_dotenv()
 
 class TestMetaPromptGraph(unittest.TestCase):
     def setUp(self):
@@ -54,12 +58,7 @@ class TestMetaPromptGraph(unittest.TestCase):
         llms = {
             "output_history_analyzer": MagicMock(
                 invoke=lambda prompt: MagicMock(
-                    content="""# Analysis
-
-This analysis compares two outputs to the expected output based on specific
-criteria.
-
-# Output ID closer to Expected Output: B"""
+                    content="{\"closerOutputID\": 2, \"analysis\": \"The output should use the `reverse()` method.\"}"
                 )
             )
         }
@@ -99,7 +98,7 @@ class TestMetaPromptGraph(unittest.TestCase):
         """
         llms = {
             NODE_PROMPT_ANALYZER: MagicMock(
-                invoke=lambda prompt: MagicMock(content="Accept: Yes")
+                invoke=lambda prompt: MagicMock(content="{\"Accept\": \"Yes\"}")
             )
         }
         meta_prompt_graph = MetaPromptGraph(llms=llms)
@@ -133,10 +132,20 @@ class TestMetaPromptGraph(unittest.TestCase):
         executes it with a given input state. It then verifies that the output
         state contains the expected keys and values.
         """
-        model_name = "google/gemma-2-9b-it"
-        llm = ChatOpenAI(model_name=model_name)
+        model_name = os.getenv("TEST_MODEL_NAME_EXECUTOR")
+        raw_llm = ChatOpenAI(model_name=model_name)
 
-        meta_prompt_graph = MetaPromptGraph(llms=llm)
+        llms = {
+            NODE_PROMPT_INITIAL_DEVELOPER: raw_llm,
+            NODE_ACCEPTANCE_CRITERIA_DEVELOPER: raw_llm,
+            NODE_PROMPT_DEVELOPER: raw_llm,
+            NODE_PROMPT_EXECUTOR: raw_llm,
+            NODE_OUTPUT_HISTORY_ANALYZER: raw_llm.bind(response_format={"type": "json_object"}),
+            NODE_PROMPT_ANALYZER: raw_llm.bind(response_format={"type": "json_object"}),
+            NODE_PROMPT_SUGGESTER: raw_llm,
+        }
+
+        meta_prompt_graph = MetaPromptGraph(llms=llms)
         input_state = AgentState(
             user_message="How do I reverse a list in Python?",
             expected_output="Use the `[::-1]` slicing technique or the "
@@ -161,7 +170,7 @@ class TestMetaPromptGraph(unittest.TestCase):
 
         user_message = "How can I create a list of numbers in Python?"
         messages = [("system", output_state["best_system_message"]), ("human", user_message)]
-        result = llm.invoke(messages)
+        result = raw_llm.invoke(messages)
 
         assert hasattr(result, "content"), "The result should have the attribute 'content'"
         print(result.content)
@@ -176,10 +185,10 @@ class TestMetaPromptGraph(unittest.TestCase):
         state contains the expected keys and values.
         """
         optimizer_llm = ChatOpenAI(
-            model_name="deepseek/deepseek-chat", temperature=0.5
+            model_name=os.getenv("TEST_MODEL_NAME_OPTIMIZER"), temperature=0.5
         )
         executor_llm = ChatOpenAI(
-            model_name="meta-llama/llama-3-8b-instruct", temperature=0.01
+            model_name=os.getenv("TEST_MODEL_NAME_EXECUTOR"), temperature=0.01
         )
 
         llms = {
@@ -188,7 +197,7 @@ class TestMetaPromptGraph(unittest.TestCase):
             NODE_PROMPT_DEVELOPER: optimizer_llm,
             NODE_PROMPT_EXECUTOR: executor_llm,
             NODE_OUTPUT_HISTORY_ANALYZER: optimizer_llm,
-            NODE_PROMPT_ANALYZER: optimizer_llm,
+            NODE_PROMPT_ANALYZER: optimizer_llm.bind(response_format={"type": "json_object"}),
             NODE_PROMPT_SUGGESTER: optimizer_llm,
         }
 
@@ -236,7 +245,7 @@ class TestMetaPromptGraph(unittest.TestCase):
         responses = [
             Mock(type="content", content="Explain how to reverse a list in Python."), # NODE_PROMPT_INITIAL_DEVELOPER
             Mock(type="content", content="Here's one way: `my_list[::-1]`"), # NODE_PROMPT_EXECUTOR
-            Mock(type="content", content="Accept: Yes"), # NODE_PPROMPT_ANALYZER
+            Mock(type="content", content="{\"Accept\": \"Yes\"}"), # NODE_PPROMPT_ANALYZER
         ]
         llm.invoke = functools.partial(next, iter(responses))
 
@@ -270,12 +279,12 @@ class TestMetaPromptGraph(unittest.TestCase):
         responses = [
             Mock(type="content", content="Explain how to reverse a list in Python."), # NODE_PROMPT_INITIAL_DEVELOPER
             Mock(type="content", content="Here's one way: `my_list[::-1]`"), # NODE_PROMPT_EXECUTOR
-            Mock(type="content", content="Accept: No"), # NODE_PPROMPT_ANALYZER
+            Mock(type="content", content="{\"Accept\": \"No\"}"), # NODE_PPROMPT_ANALYZER
             Mock(type="content", content="Try using the `reverse()` method instead."), # NODE_PROMPT_SUGGESTER
             Mock(type="content", content="Explain how to reverse a list in Python. Output in a Markdown List."), # NODE_PROMPT_DEVELOPER
             Mock(type="content", content="Here's one way: `my_list.reverse()`"), # NODE_PROMPT_EXECUTOR
-            Mock(type="content", content="# Output ID closer to Expected Output: B"), # NODE_OUTPUT_HISTORY_ANALYZER
-            Mock(type="content", content="Accept: Yes"), # NODE_PPROMPT_ANALYZER
+            Mock(type="content", content="{\"closerOutputID\": 2, \"analysis\": \"The output should use the `reverse()` method.\"}"), # NODE_OUTPUT_HISTORY_ANALYZER
+            Mock(type="content", content="{\"Accept\": \"Yes\"}"), # NODE_PPROMPT_ANALYZER
         ]
         llm.invoke = lambda _: responses.pop(0)
 
@@ -303,7 +312,7 @@ class TestMetaPromptGraph(unittest.TestCase):
         """
 
         llms = {
-            NODE_ACCEPTANCE_CRITERIA_DEVELOPER: ChatOpenAI(model_name="deepseek/deepseek-chat")
+            NODE_ACCEPTANCE_CRITERIA_DEVELOPER: ChatOpenAI(model_name=os.getenv("TEST_MODEL_NAME_ACCEPTANCE_CRITERIA_DEVELOPER"))
        }
        meta_prompt_graph = MetaPromptGraph(llms=llms)
        workflow = meta_prompt_graph._create_acceptance_criteria_workflow()
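
The test-double pattern used throughout these hunks is worth spelling out: each fake LLM's `invoke` returns an object whose `.content` holds exactly the JSON string the new prompts demand, so the graph logic is exercised without any network access or API key. A self-contained sketch of that pattern:

```python
# Standalone sketch of the mocking pattern above: the fake invoke() always
# returns a canned JSON verdict, mirroring what the bound real LLMs emit.
import json
from unittest.mock import MagicMock

fake_analyzer = MagicMock(
    invoke=lambda prompt: MagicMock(content='{"Accept": "Yes"}')
)

reply = fake_analyzer.invoke("any prompt")
assert json.loads(reply.content)["Accept"] == "Yes"
```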