Refactor code and update system message formatting guidelines.
- app/examples/log.csv +1 -0
- app/gradio_meta_prompt.py +23 -32
- meta_prompt/consts.py +9 -6
- tests/meta_prompt_graph_test.py +17 -12
app/examples/log.csv CHANGED

@@ -329,3 +329,4 @@ Therefore, the area of each triangle is 15 cm^2, 6 cm^2, 8 cm^2, 60 cm^2, and 31
   * Acceptable differences:
   * Different code examples
   * Minor text differences","As a Code Tutor Assistant, you are designed to handle advanced-level queries that require expertise in writing and explaining code snippets."
+"If it takes 8 bits to make a byte, how many bits are there in a kilobyte?","There are 8,192 bits in a kilobyte. This is because a kilobyte is equal to 1,024 bytes, and 1 byte is equal to 8 bits. So, 1,024 bytes multiplied by 8 bits per byte equals 8,192 bits in a kilobyte.",Exactly format and style match. Consistent semantic. Highly similar text length.,
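The appended row's arithmetic (1,024 bytes × 8 bits per byte = 8,192 bits) is easy to verify; this one-liner is illustrative only and not part of the commit:

```python
# Sanity check of the arithmetic in the appended CSV row.
bits_per_byte = 8
bytes_per_kilobyte = 1024
assert bytes_per_kilobyte * bits_per_byte == 8192
print(f"{bytes_per_kilobyte} bytes x {bits_per_byte} bits/byte = {bytes_per_kilobyte * bits_per_byte} bits")
```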
app/gradio_meta_prompt.py CHANGED

@@ -148,7 +148,13 @@ def evaluate_system_message(system_message, user_message, simple_model, executor
         ("human", "{user_message}")
     ])
     messages = template.format_messages(system_message=system_message, user_message=user_message)
-    output = llm.invoke(messages)
+    try:
+        output = llm.invoke(messages)
+    except Exception as e:
+        if isinstance(e, gr.Error):
+            raise e
+        else:
+            raise gr.Error(f"Error: {e}")
 
     if hasattr(output, 'content'):
         return output.content
@@ -160,7 +166,6 @@ def process_message(user_message, expected_output, acceptance_criteria,
                     initial_system_message, recursion_limit: int,
                     max_output_age: int,
                     llms: Union[BaseLanguageModel, Dict[str, BaseLanguageModel]]):
-    # Create the input state
     input_state = AgentState(
         user_message=user_message,
         expected_output=expected_output,
@@ -169,49 +174,35 @@ def process_message(user_message, expected_output, acceptance_criteria,
         max_output_age=max_output_age
     )
 
-    # Get the output state from MetaPromptGraph
     log_stream = io.StringIO()
-
-
-    if config.verbose:
-        log_handler = logging.StreamHandler(log_stream)
-        logger = logging.getLogger(MetaPromptGraph.__name__)
+    logger = logging.getLogger(MetaPromptGraph.__name__) if config.verbose else None
+    log_handler = logging.StreamHandler(log_stream) if logger else None
+    if log_handler:
         log_handler.setFormatter(jsonlogger.JsonFormatter(
             '%(asctime)s %(name)s %(levelname)s %(message)s'))
         logger.addHandler(log_handler)
 
-    meta_prompt_graph = MetaPromptGraph(
-
-
+    meta_prompt_graph = MetaPromptGraph(llms=llms, verbose=config.verbose, logger=logger)
+    try:
+        output_state = meta_prompt_graph(input_state, recursion_limit=recursion_limit)
+    except Exception as e:
+        if isinstance(e, gr.Error):
+            raise e
+        else:
+            raise gr.Error(f"Error: {e}")
 
-    if config.verbose:
+    if log_handler:
         log_handler.close()
         log_output = log_stream.getvalue()
     else:
         log_output = None
 
-
-
-
-    analysis = ''
-
-    if 'best_system_message' in output_state and output_state['best_system_message'] is not None:
-        system_message = output_state['best_system_message']
-    else:
-        system_message = "Error: The output state does not contain a valid 'best_system_message'"
-
-    if 'best_output' in output_state and output_state['best_output'] is not None:
-        output = output_state["best_output"]
-    else:
-        output = "Error: The output state does not contain a valid 'best_output'"
+    system_message = output_state.get('best_system_message', "Error: The output state does not contain a valid 'best_system_message'")
+    output = output_state.get('best_output', "Error: The output state does not contain a valid 'best_output'")
+    analysis = output_state.get('analysis', "Error: The output state does not contain a valid 'analysis'")
 
-    if 'analysis' in output_state and output_state['analysis'] is not None:
-        analysis = output_state['analysis']
-    else:
-        analysis = "Error: The output state does not contain a valid 'analysis'"
 
-    return (system_message, output, analysis,
-            chat_log_2_chatbot_list(log_output))
+    return (system_message, output, analysis, chat_log_2_chatbot_list(log_output))
 
 
 def process_message_with_single_llm(user_message, expected_output, acceptance_criteria, initial_system_message,
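Two small patterns do most of the work in this refactor: `dict.get` with a fallback collapses the `if key in state and state[key] is not None` ladders, and a re-raise wrapper surfaces unexpected exceptions as Gradio's user-visible `gr.Error`. A minimal standalone sketch of both (the `run_safely` helper and the sample dict are hypothetical, for illustration only):

```python
import gradio as gr

output_state = {"best_output": "Hello!", "analysis": None}

# Pattern 1: dict.get() with a fallback replaces the if/else chains.
# Note: unlike the old `key in d and d[key] is not None` check, .get()
# still returns None when the key is present but holds None.
best_output = output_state.get("best_output", "Error: missing 'best_output'")
analysis = output_state.get("analysis", "Error: missing 'analysis'")

# Pattern 2: re-raise gr.Error as-is, wrap anything else so Gradio
# shows the failure in the UI instead of a raw stack trace.
def run_safely(fn, *args):
    try:
        return fn(*args)
    except gr.Error:
        raise
    except Exception as e:
        raise gr.Error(f"Error: {e}")
```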
meta_prompt/consts.py CHANGED

@@ -28,7 +28,8 @@ You are an expert prompt engineer tasked with creating system messages for AI assistants.
 2. Ensure the system message can handle similar user messages.
 3. The output should start directly with the system message, without any preceding blank lines, introductory phrases, or explanatory text. Do not include extra lines at the beginning or end of the output.
 4. Expected Output text should not appear in the System Message as an example. But it's OK to use some similar text as an example instead.
-5.
+5. In the System Message, do not use `Expected Output` to refer to the example you want to illustrate. Instead, directly describe the specific features you need.
+6. Format the system message well: it should be in the form of instructions for the AI assistant, such as "You should...". Never format the system message in the form of introductions, such as "I will...".
 
 ## Output
 
@@ -59,8 +60,9 @@ You are an expert prompt engineer tasked with updating system messages for AI assistants.
 4. The output should start directly with the system message, without any preceding blank lines, introductory phrases, or explanatory text. Do not include extra lines at the beginning or end of the output.
 5. Avoiding the behavior should be explicitly requested (e.g. `Don't ...`) in the System Message, if the behavior is asked to be avoided by the Suggestions but not mentioned in the Current System Message.
 6. Expected Output text should not appear in the System Message as an example. But it's OK to use some similar text as an example instead.
-7.
-8.
+7. In the System Message, do not use `Expected Output` to refer to the example you want to illustrate. Instead, directly describe the specific features you need.
+8. Remove the Expected Output text, or text highly similar to the Expected Output, from the System Message if it's present.
+9. Format the system message well: it should be in the form of instructions for the AI assistant, such as "You should...". Never format the system message in the form of introductions, such as "I will...".
 
 ## Output
 
@@ -91,12 +93,12 @@ Provide only the updated System Message, adhering to the above guidelines.
         ("human", "{user_message}")
     ]),
     NODE_OUTPUT_HISTORY_ANALYZER: ChatPromptTemplate.from_messages([
-        ("system", """You are a text comparing program. You read the Acceptance Criteria, compare the
+        ("system", """You are a text comparing program. You read the Acceptance Criteria, compare the Expected Output with two different outputs, and decide which one is more consistent with the Expected Output. When comparing the outputs, ignore the differences which are acceptable or ignorable according to the Acceptance Criteria.
 
 You output the following analysis according to the Acceptance Criteria:
 
 * Your analysis in a Markdown list.
-* Indicates an output ID that is more consistent with the
+* Indicate the output ID that is more consistent with the Expected Output, in the following format:
 
 ```
 # Analysis
@@ -106,7 +108,7 @@ You output the following analysis according to the Acceptance Criteria:
 # Preferred Output ID: [ID]
 ```
 
-If both outputs are equally similar to the
+If both outputs are equally similar to the Expected Output, output the following:
 
 ```
 # Analysis
@@ -191,6 +193,7 @@ Provide your analysis in the following format:
 * Provide your suggestions in a Markdown list, nothing else. Output only the suggestions related to Unacceptable Differences.
 * Start every suggestion with `The System Message should ...`.
 * Figure out the contexts of the System Message that conflict with the suggestions, and suggest modification or deletion.
+* Do not simply describe the output as being the same/similar/different from the Expected Output, such as `the output should not use a different format and style compared to the Expected Output` or `the output should match the expected output exactly`; instead, describe the expected characteristics specifically and suggest a detailed example.
 * Avoiding the behavior should be explicitly requested (e.g. `The System Message should explicitly state that the output should not ...`) in the System Message, if the behavior is asked to be removed by the Suggestions, appears in the Actual Output, but is not mentioned in the Current System Message.
 * Expected Output text should not appear in the System Message as an example. But it's OK to use some similar but distinct text as an example instead.
 * Ask to remove the Expected Output text, or text highly similar to the Expected Output, from the System Message if it's present.
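For context, each NODE_* key above maps to a `ChatPromptTemplate`; a minimal sketch of how such a template is built and rendered (shortened system text, hypothetical `user_message` value):

```python
from langchain_core.prompts import ChatPromptTemplate

# Cut-down version of the NODE_OUTPUT_HISTORY_ANALYZER template above.
template = ChatPromptTemplate.from_messages([
    ("system", "You are a text comparing program. You read the Acceptance "
               "Criteria and decide which of two outputs is more consistent "
               "with the Expected Output."),
    ("human", "{user_message}"),
])

# Render the template into concrete chat messages.
messages = template.format_messages(user_message="Compare Output 1 and Output 2.")
for message in messages:
    print(type(message).__name__, "->", message.content)
```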
tests/meta_prompt_graph_test.py CHANGED

@@ -4,11 +4,11 @@ import logging
 from unittest.mock import MagicMock
 from unittest.mock import patch
 
-# Assuming the necessary imports are made for the classes and functions used in meta_prompt_graph.py
-from meta_prompt import AgentState, MetaPromptGraph
-
 from langchain_openai import ChatOpenAI
 
+# Assuming the necessary imports are made for the classes and functions used in meta_prompt_graph.py
+from meta_prompt import *
+
 class TestMetaPromptGraph(unittest.TestCase):
     def setUp(self):
         # logging.basicConfig(level=logging.DEBUG)
@@ -16,7 +16,7 @@ class TestMetaPromptGraph(unittest.TestCase):
 
     def test_prompt_node(self):
         llms = {
-
+            NODE_PROMPT_INITIAL_DEVELOPER: MagicMock(
                 invoke=MagicMock(return_value=MagicMock(content="Mocked response content"))
             )
         }
@@ -29,7 +29,7 @@ class TestMetaPromptGraph(unittest.TestCase):
 
         # Invoke the _prompt_node method with the mock node, target attribute, and state
         updated_state = graph._prompt_node(
-
+            NODE_PROMPT_INITIAL_DEVELOPER, "output", state
         )
 
         # Assertions
@@ -70,7 +70,7 @@ class TestMetaPromptGraph(unittest.TestCase):
 
     def test_prompt_analyzer_accept(self):
         llms = {
-
+            NODE_PROMPT_ANALYZER: MagicMock(
                 invoke=lambda prompt: MagicMock(content="Accept: Yes"))
         }
         meta_prompt_graph = MetaPromptGraph(llms)
@@ -78,6 +78,11 @@ class TestMetaPromptGraph(unittest.TestCase):
         updated_state = meta_prompt_graph._prompt_analyzer(state)
         assert updated_state.accepted == True
 
+    def test_get_node_names(self):
+        graph = MetaPromptGraph()
+        node_names = graph.get_node_names()
+        self.assertEqual(node_names, META_PROMPT_NODES)
+
     def test_workflow_execution(self):
         # MODEL_NAME = "anthropic/claude-3.5-sonnet:beta"
         # MODEL_NAME = "meta-llama/llama-3-70b-instruct"
@@ -120,12 +125,12 @@ class TestMetaPromptGraph(unittest.TestCase):
         executor_llm = ChatOpenAI(model_name="meta-llama/llama-3-8b-instruct", temperature=0.01)
 
         llms = {
-
-
-
-
-
-
+            NODE_PROMPT_INITIAL_DEVELOPER: optimizer_llm,
+            NODE_PROMPT_DEVELOPER: optimizer_llm,
+            NODE_PROMPT_EXECUTOR: executor_llm,
+            NODE_OUTPUT_HISTORY_ANALYZER: optimizer_llm,
+            NODE_PROMPT_ANALYZER: optimizer_llm,
+            NODE_PROMPT_SUGGESTER: optimizer_llm
         }
 
         meta_prompt_graph = MetaPromptGraph(llms=llms)
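The mocks in these tests rely on duck typing: anything whose `invoke` method returns an object with a `.content` attribute can stand in for a chat model. A minimal sketch of the pattern outside the test suite (illustrative only):

```python
from unittest.mock import MagicMock

# A stand-in "LLM": invoke() returns an object whose .content is the reply.
fake_llm = MagicMock(
    invoke=MagicMock(return_value=MagicMock(content="Mocked response content"))
)

response = fake_llm.invoke("any prompt")
print(response.content)                          # -> "Mocked response content"
fake_llm.invoke.assert_called_once_with("any prompt")
```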