yaleh committed · Commit 675dd1e · Parent(s): 881f7b2

Refactor code and update system message formatting guidelines.

app/examples/log.csv CHANGED
@@ -329,3 +329,4 @@ Therefore, the area of each triangle is 15 cm^2, 6 cm^2, 8 cm^2, 60 cm^2, and 31
 * Acceptable differences:
 * Different code examples
 * Minor text differences","As a Code Tutor Assistant, you are designed to handle advanced-level queries that require expertise in writing and explaining code snippets."
+"If it takes 8 bits to make a byte, how many bits are there in a kilobyte?","There are 8,192 bits in a kilobyte. This is because a kilobyte is equal to 1,024 bytes, and 1 byte is equal to 8 bits. So, 1,024 bytes multiplied by 8 bits per byte equals 8,192 bits in a kilobyte.",Exactly format and style match. Consistent semantic. Highly similar text length.,
app/gradio_meta_prompt.py CHANGED
@@ -148,7 +148,13 @@ def evaluate_system_message(system_message, user_message, simple_model, executor
         ("human", "{user_message}")
     ])
     messages = template.format_messages(system_message=system_message, user_message=user_message)
-    output = llm.invoke(messages)
+    try:
+        output = llm.invoke(messages)
+    except Exception as e:
+        if isinstance(e, gr.Error):
+            raise e
+        else:
+            raise gr.Error(f"Error: {e}")

     if hasattr(output, 'content'):
         return output.content
@@ -160,7 +166,6 @@ def process_message(user_message, expected_output, acceptance_criteria,
                     initial_system_message, recursion_limit: int,
                     max_output_age: int,
                     llms: Union[BaseLanguageModel, Dict[str, BaseLanguageModel]]):
-    # Create the input state
     input_state = AgentState(
         user_message=user_message,
         expected_output=expected_output,
@@ -169,49 +174,35 @@ def process_message(user_message, expected_output, acceptance_criteria,
         max_output_age=max_output_age
     )

-    # Get the output state from MetaPromptGraph
     log_stream = io.StringIO()
-    log_handler = None
-    logger = None
-    if config.verbose:
-        log_handler = logging.StreamHandler(log_stream)
-        logger = logging.getLogger(MetaPromptGraph.__name__)
+    logger = logging.getLogger(MetaPromptGraph.__name__) if config.verbose else None
+    log_handler = logging.StreamHandler(log_stream) if logger else None
+    if log_handler:
         log_handler.setFormatter(jsonlogger.JsonFormatter(
             '%(asctime)s %(name)s %(levelname)s %(message)s'))
         logger.addHandler(log_handler)

-    meta_prompt_graph = MetaPromptGraph(
-        llms=llms, verbose=config.verbose, logger=logger)
-    output_state = meta_prompt_graph(input_state, recursion_limit=recursion_limit)
+    meta_prompt_graph = MetaPromptGraph(llms=llms, verbose=config.verbose, logger=logger)
+    try:
+        output_state = meta_prompt_graph(input_state, recursion_limit=recursion_limit)
+    except Exception as e:
+        if isinstance(e, gr.Error):
+            raise e
+        else:
+            raise gr.Error(f"Error: {e}")

-    if config.verbose:
+    if log_handler:
         log_handler.close()
         log_output = log_stream.getvalue()
     else:
         log_output = None

-    # Validate the output state
-    system_message = ''
-    output = ''
-    analysis = ''
-
-    if 'best_system_message' in output_state and output_state['best_system_message'] is not None:
-        system_message = output_state['best_system_message']
-    else:
-        system_message = "Error: The output state does not contain a valid 'best_system_message'"
-
-    if 'best_output' in output_state and output_state['best_output'] is not None:
-        output = output_state["best_output"]
-    else:
-        output = "Error: The output state does not contain a valid 'best_output'"
-
-    if 'analysis' in output_state and output_state['analysis'] is not None:
-        analysis = output_state['analysis']
-    else:
-        analysis = "Error: The output state does not contain a valid 'analysis'"
+    system_message = output_state.get('best_system_message', "Error: The output state does not contain a valid 'best_system_message'")
+    output = output_state.get('best_output', "Error: The output state does not contain a valid 'best_output'")
+    analysis = output_state.get('analysis', "Error: The output state does not contain a valid 'analysis'")

-    return (system_message, output, analysis,
-            chat_log_2_chatbot_list(log_output))
+    return (system_message, output, analysis, chat_log_2_chatbot_list(log_output))


 def process_message_with_single_llm(user_message, expected_output, acceptance_criteria, initial_system_message,
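
Both patched call sites repeat the same pattern: let an existing gr.Error propagate untouched, and wrap any other exception in gr.Error so Gradio surfaces the message in the UI instead of a raw traceback. A minimal sketch of that pattern factored into a helper — the name `invoke_with_gr_error` is hypothetical, not part of this commit:

```python
import gradio as gr

def invoke_with_gr_error(fn, *args, **kwargs):
    """Hypothetical helper mirroring the try/except blocks added above."""
    try:
        return fn(*args, **kwargs)
    except gr.Error:
        raise  # already user-facing; let Gradio display it as-is
    except Exception as e:
        # Wrap unexpected failures so Gradio shows the message in the UI.
        raise gr.Error(f"Error: {e}")

# Usage at the two patched call sites would look like:
#   output = invoke_with_gr_error(llm.invoke, messages)
#   output_state = invoke_with_gr_error(meta_prompt_graph, input_state,
#                                       recursion_limit=recursion_limit)
```

One subtlety in the `.get()` rewrite: `dict.get(key, default)` falls back only when the key is absent, whereas the replaced `is not None` checks also treated a present-but-None value as an error, so a None stored under `best_output` now flows through unchanged.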
meta_prompt/consts.py CHANGED
@@ -28,7 +28,8 @@ You are an expert prompt engineer tasked with creating system messages for AI as
 2. Ensure the system message can handle similar user messages.
 3. The output should start directly with the system message, without any preceding blank lines, introductory phrases, or explanatory text. Do not include extra lines at the beginning or end of the output.
 4. Expected Output text should not appear in System Message as an example. But it's OK to use some similar text as an example instead.
-5. Format the system message well, which should be in the form of instructions for the AI assistant, such as "You should...". Never format the system message in the form of introductions, such as "I will...".
+5. In the System Message, do not use `Expected Output` to refer to the example you want to illustrate. Instead, directly describe the specific features you need.
+6. Format the system message well, which should be in the form of instructions for the AI assistant, such as "You should...". Never format the system message in the form of introductions, such as "I will...".

 ## Output

@@ -59,8 +60,9 @@ You are an expert prompt engineer tasked with updating system messages for AI as
 4. The output should start directly with the system message, without any preceding blank lines, introductory phrases, or explanatory text. Do not include extra lines at the beginning or end of the output.
 5. Avoiding the behavior should be explicitly requested (e.g. `Don't ...`) in the System Message, if the behavior is: asked to be avoided by the Suggestions; but not mentioned in the Current System Message.
 6. Expected Output text should not appear in System Message as an example. But it's OK to use some similar text as an example instead.
-7. Remove the Expected Output text or text highly similar to Expected Output from System Message, if it's present.
-8. Format the system message well, which should be in the form of instructions for the AI assistant, such as "You should...". Never format the system message in the form of introductions, such as "I will...".
+7. In the System Message, do not use `Expected Output` to refer to the example you want to illustrate. Instead, directly describe the specific features you need.
+8. Remove the Expected Output text or text highly similar to Expected Output from System Message, if it's present.
+9. Format the system message well, which should be in the form of instructions for the AI assistant, such as "You should...". Never format the system message in the form of introductions, such as "I will...".

 ## Output

@@ -91,12 +93,12 @@ Provide only the updated System Message, adhering to the above guidelines.
         ("human", "{user_message}")
     ]),
     NODE_OUTPUT_HISTORY_ANALYZER: ChatPromptTemplate.from_messages([
-        ("system", """You are a text comparing program. You read the Acceptance Criteria, compare the compare the exptected output with two different outputs, and decide which one is more consistent with the expected output. When comparing the outputs, ignore the differences which are acceptable or ignorable according to the Acceptance Criteria.
+        ("system", """You are a text comparing program. You read the Acceptance Criteria, compare the Expected Output with two different outputs, and decide which one is more consistent with the Expected Output. When comparing the outputs, ignore the differences which are acceptable or ignorable according to the Acceptance Criteria.

 You output the following analysis according to the Acceptance Criteria:

 * Your analysis in a Markdown list.
-* Indicates an output ID that is more consistent with the expected output, in the following format:
+* Indicate an output ID that is more consistent with the Expected Output, in the following format:

 ```
 # Analysis
@@ -106,7 +108,7 @@ You output the following analysis according to the Acceptance Criteria:
 # Preferred Output ID: [ID]
 ```

-If both outputs are equally similar to the expected output, output the following:
+If both outputs are equally similar to the Expected Output, output the following:

 ```
 # Analysis
@@ -191,6 +193,7 @@ Provide your analysis in the following format:
 * Provide your suggestions in a Markdown list, nothing else. Output only the suggestions related to Unacceptable Differences.
 * Start every suggestion with `The System Message should ...`.
 * Figure out the contexts of the System Message that conflict with the suggestions, and suggest modification or deletion.
+* Do not simply describe the output as being the same/similar/different from the Expected Output, such as `the output should not use a different format and style compared to the Expected Output` or `the output should match the expected output exactly`; instead, describe the expected characteristics specifically and suggest a detailed example.
 * Avoiding the behavior should be explicitly requested (e.g. `The System Message should explicitly state that the output should not ...`) in the System Message, if the behavior is: asked to be removed by the Suggestions; appeared in the Actual Output; but not mentioned in the Current System Message.
 * Expected Output text should not appear in System Message as an example. But it's OK to use some similar but distinct text as an example instead.
 * Ask to remove the Expected Output text or text highly similar to Expected Output from System Message, if it's present.
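
The strings edited above live inside ChatPromptTemplate definitions keyed by node name. For orientation, a minimal sketch of how one of these templates is built and rendered — illustrative only; the `langchain_core` import path is an assumption, and the repo may import ChatPromptTemplate from elsewhere:

```python
from langchain_core.prompts import ChatPromptTemplate

# Illustrative miniature of the NODE_OUTPUT_HISTORY_ANALYZER entry above:
# a fixed system prompt plus a templated human turn.
template = ChatPromptTemplate.from_messages([
    ("system", "You are a text comparing program. ..."),
    ("human", "{user_message}"),
])

# format_messages() fills the placeholders, yielding the message list
# that is passed to the node's LLM.
messages = template.format_messages(user_message="<output pair to compare>")
```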
tests/meta_prompt_graph_test.py CHANGED
@@ -4,11 +4,11 @@ import logging
 from unittest.mock import MagicMock
 from unittest.mock import patch

-# Assuming the necessary imports are made for the classes and functions used in meta_prompt_graph.py
-from meta_prompt import AgentState, MetaPromptGraph
-
 from langchain_openai import ChatOpenAI

+# Assuming the necessary imports are made for the classes and functions used in meta_prompt_graph.py
+from meta_prompt import *
+
 class TestMetaPromptGraph(unittest.TestCase):
     def setUp(self):
         # logging.basicConfig(level=logging.DEBUG)
@@ -16,7 +16,7 @@ class TestMetaPromptGraph(unittest.TestCase):

     def test_prompt_node(self):
         llms = {
-            MetaPromptGraph.NODE_PROMPT_INITIAL_DEVELOPER: MagicMock(
+            NODE_PROMPT_INITIAL_DEVELOPER: MagicMock(
                 invoke=MagicMock(return_value=MagicMock(content="Mocked response content"))
             )
         }
@@ -29,7 +29,7 @@ class TestMetaPromptGraph(unittest.TestCase):

         # Invoke the _prompt_node method with the mock node, target attribute, and state
         updated_state = graph._prompt_node(
-            MetaPromptGraph.NODE_PROMPT_INITIAL_DEVELOPER, "output", state
+            NODE_PROMPT_INITIAL_DEVELOPER, "output", state
         )

         # Assertions
@@ -70,7 +70,7 @@ class TestMetaPromptGraph(unittest.TestCase):

     def test_prompt_analyzer_accept(self):
         llms = {
-            MetaPromptGraph.NODE_PROMPT_ANALYZER: MagicMock(
+            NODE_PROMPT_ANALYZER: MagicMock(
                 invoke=lambda prompt: MagicMock(content="Accept: Yes"))
         }
         meta_prompt_graph = MetaPromptGraph(llms)
@@ -78,6 +78,11 @@ class TestMetaPromptGraph(unittest.TestCase):
         updated_state = meta_prompt_graph._prompt_analyzer(state)
         assert updated_state.accepted == True

+    def test_get_node_names(self):
+        graph = MetaPromptGraph()
+        node_names = graph.get_node_names()
+        self.assertEqual(node_names, META_PROMPT_NODES)
+
     def test_workflow_execution(self):
         # MODEL_NAME = "anthropic/claude-3.5-sonnet:beta"
         # MODEL_NAME = "meta-llama/llama-3-70b-instruct"
@@ -120,12 +125,12 @@ class TestMetaPromptGraph(unittest.TestCase):
         executor_llm = ChatOpenAI(model_name="meta-llama/llama-3-8b-instruct", temperature=0.01)

         llms = {
-            MetaPromptGraph.NODE_PROMPT_INITIAL_DEVELOPER: optimizer_llm,
-            MetaPromptGraph.NODE_PROMPT_DEVELOPER: optimizer_llm,
-            MetaPromptGraph.NODE_PROMPT_EXECUTOR: executor_llm,
-            MetaPromptGraph.NODE_OUTPUT_HISTORY_ANALYZER: optimizer_llm,
-            MetaPromptGraph.NODE_PROMPT_ANALYZER: optimizer_llm,
-            MetaPromptGraph.NODE_PROMPT_SUGGESTER: optimizer_llm
+            NODE_PROMPT_INITIAL_DEVELOPER: optimizer_llm,
+            NODE_PROMPT_DEVELOPER: optimizer_llm,
+            NODE_PROMPT_EXECUTOR: executor_llm,
+            NODE_OUTPUT_HISTORY_ANALYZER: optimizer_llm,
+            NODE_PROMPT_ANALYZER: optimizer_llm,
+            NODE_PROMPT_SUGGESTER: optimizer_llm
         }

         meta_prompt_graph = MetaPromptGraph(llms=llms)
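
One caveat with the new `from meta_prompt import *`: a star import only picks up the names the package actually exports (its `__all__`, or all public module-level names), and it makes the test's dependencies implicit. An explicit import list covering every name these tests reference would be a drop-in alternative — a sketch, assuming the package exports all of these:

```python
# Explicit equivalent of `from meta_prompt import *` for these tests,
# assuming meta_prompt re-exports the node-name constants and graph classes.
from meta_prompt import (
    AgentState,
    MetaPromptGraph,
    META_PROMPT_NODES,
    NODE_PROMPT_INITIAL_DEVELOPER,
    NODE_PROMPT_DEVELOPER,
    NODE_PROMPT_EXECUTOR,
    NODE_OUTPUT_HISTORY_ANALYZER,
    NODE_PROMPT_ANALYZER,
    NODE_PROMPT_SUGGESTER,
)
```

Either way, the wildcard version is only safe as long as `meta_prompt/__init__.py` keeps these names (or an `__all__` listing them) in sync with what the tests use.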