import json
from openai import BadRequestError
import yaml
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import JsonOutputParser

# Define prompt strings as constants
DESCRIPTION_PROMPT = [
    ("system", """{{
  "task_description": "Given a JSON example for a task type, provide a concise description of the task type, including the format and style of the input and output. If there are multiple examples, provide an overall description and ignore unique parts. Output a JSON object.",
  "requirements": [
    "Analyze the provided JSON example(s) to understand the task type",
    "Focus on the common format and style across examples, if multiple are given",
    "Ignore any unique parts that do not generalize across examples",
    "Provide a concise description summarizing the key aspects of the task type"
  ],
  "output_format": {{
    "type": "object",
    "properties": {{
      "description": {{
        "type": "string",
        "description": "A concise description of the task type, including input and output format and style"
      }}
    }},
    "required": ["description"]
  }},
  "output_example": {{
    "description": "This task involves analyzing financial reports in JSON format to calculate key metrics and generate a summary report. The input JSON contains fields like revenue, expenses, and dates, while the output is a JSON object with the calculated metrics and summary."
  }}
}}
"""),
    ("user", """{raw_example}""")
]

DESCRIPTION_UPDATING_PROMPT = [
    ("system", """{{
"task_description": "Given the task type description and suggestions, update the task type description according to the suggestions. Output a JSON object.",
"requirements": [
"Carefully read and understand the provided task type description and suggestions",
"Identify the core elements and characteristics of the task",
"Consider possible generalization dimensions such as task domain, complexity, input/output format, application scenarios, etc.",
"Apply the suggestions to update the task description without changing anything that is not suggested",
"Ensure the updated description is clear, specific, and directly related to the task",
"Provide at least 5 specification suggestions across different dimensions"
],
"output_format": {{
"type": "object",
"properties": {{
"description": {{
"type": "string",
"description": "The updated task type description based on the provided suggestions"
}}
}},
"required": ["description"]
}},
"output_example": {{
"description": "An example of an updated task type description based on the provided suggestions"
}}
}}
"""),
    ("user", """{{
"task_description": "{description}",
"suggestions": "{suggestions}"
}}
""")
]


SPECIFICATION_SUGGESTIONS_PROMPT = [
    ("system", """{{
  "prompt": "Generate suggestions to narrow the task scope for a given task type and example:\n\n1. Analyze the task description and input/output examples.\n2. Identify 3~5 relevant dimensions (e.g., purpose, input/output format, language, steps, criteria, constraints).\n3. Create 3~5 actionable suggestions (no more than 20 words for each) to narrow the task scope based on the above dimensions. Make sure the suggestions are compatible with the provided example.\n4. Start each suggestion with a verb.\n5. Output in JSON format, following `output_format`.\n", 
  "output_format": "{{\n  \"dimensions\": [\n    {{ \"dimension\": \"...\" }},\n    {{ \"dimension\": \"...\" }}\n  ],\n  \"suggestions\": [\n    {{ \"suggestion\": \"...\" }},\n    {{ \"suggestion\": \"...\" }}\n  ]\n}}\n", 
  "task_description": "\n{description}\n", 
  "examples": "\n{raw_example}\n"
}}
""")
]

GENERALIZATION_SUGGESTIONS_PROMPT = [
    ("system", """{{
  "prompt": "Generate task generalization suggestions for a given task type and example:\n\n1. Analyze the task description and input/output examples.\n2. Identify 3~5 relevant dimensions (e.g., purpose, input/output format, language, steps, criteria, constraints).\n3. Create 3~5 actionable suggestions (no more than 20 words for each) to expand the scope of the task based on the above dimensions. Make sure the suggestions are compatible with the provided example.\n4. Start each suggestion with a verb.\n5. Output in JSON format, following `output_format`.\n", 
  "output_format": "{{\n  \"dimensions\": [\n    {{ \"dimension\": \"...\" }},\n    {{ \"dimension\": \"...\" }}\n  ],\n  \"suggestions\": [\n    {{ \"suggestion\": \"...\" }},\n    {{ \"suggestion\": \"...\" }}\n  ]\n}}\n", 
  "task_description": "\n{description}\n", 
  "examples": "\n{raw_example}\n"
}}
""")
]

INPUT_ANALYSIS_PROMPT = [
    ("system", """For the specific task type, analyze the possible task inputs across multiple dimensions.
     
Conduct a detailed analysis and enumerate:

1. Core Attributes: Identify the fundamental properties or characteristics of this input type.
2. Variation Dimensions: For each dimension that may vary, specify:
   - Dimension name
   - Possible range of values or options
   - Impact on input nature or task difficulty
3. Constraints: List any rules or limitations that must be adhered to.
4. Edge Cases: Describe extreme or special scenarios that may test the robustness of task processing.
5. External Factors: Enumerate factors that might influence input generation or task completion.
6. Potential Extensions: Propose ways to expand or modify this input type to create new variants.

Format your response as follows:
Input Analysis: [Your analysis here]
"""),
    ("user", """Task Description:

{description}

""")
]

BRIEFS_PROMPT = [
    ("system", """{{
  "prompt": "Given the task type description, and input analysis, generate descriptions for {generating_batch_size} new examples with detailed attributes based on this task type. But don't provide any detailed task output.\n\nUse the input analysis to create diverse and comprehensive example briefs that cover various input dimensions and attribute ranges.\n\nFormat your response as a JSON object following `output_format`.",
  "output_format": "{{
    "new_example_briefs": [
      {{
        "example_brief": "..."
      }},
      {{
        "example_brief": "..."
      }},
      ...
    ]
  }},
  "task_description": "{description}",
  "input_analysis": "{input_analysis}",
  "generating_batch_size": "{generating_batch_size}"
}}
""")
]

EXAMPLES_FROM_BRIEFS_PROMPT = [
    ("system", """{{
  "prompt": "Given the task type description, brief descriptions for new examples, and JSON example(s), generate {generating_batch_size} more input/output examples for this task type, strictly following the brief descriptions and task type description. Ensure that the new examples are consistent with the brief descriptions and do not introduce any new information not present in the briefs. Output in JSON format, following `output_format`. Validate the generated new examples with the task type description and brief descriptions.",
  "output_format": "{{
    "examples": [
      {{
        "input": "...",
        "output": "..."
      }},
      {{
        "input": "...",
        "output": "..."
      }},
      ...
    ]
  }},
  "task_description": "{description}",
  "new_example_briefs": {new_example_briefs},
  "raw_example": "{raw_example}"
}}
""")
]

EXAMPLES_DIRECTLY_PROMPT = [
    ("system", """{{
  "prompt": "Given the task type description, and input/output example(s), generate {generating_batch_size} new input/output examples for this task type. Output in JSON format, following `output_format`.",
  "output_format": "{{
    "examples": [
      {{
        "input": "...",
        "output": "..."
      }},
      {{
        "input": "...",
        "output": "..."
      }},
      ...
    ]
  }},
  "task_description": "{description}",
  "examples": "{raw_example}"
}}
""")
]

class TaskDescriptionGenerator:
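    """Generate a task-type description, scope-adjustment suggestions, and
    additional input/output examples from one or more seed examples, using
    JSON-oriented LangChain prompt chains built around the given chat model.
    """
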
    def __init__(self, model):        
        self.description_prompt = ChatPromptTemplate.from_messages(DESCRIPTION_PROMPT)
        self.description_updating_prompt = ChatPromptTemplate.from_messages(DESCRIPTION_UPDATING_PROMPT)
        self.specification_suggestions_prompt = ChatPromptTemplate.from_messages(SPECIFICATION_SUGGESTIONS_PROMPT)
        self.generalization_suggestions_prompt = ChatPromptTemplate.from_messages(GENERALIZATION_SUGGESTIONS_PROMPT)
        self.input_analysis_prompt = ChatPromptTemplate.from_messages(INPUT_ANALYSIS_PROMPT)
        self.briefs_prompt = ChatPromptTemplate.from_messages(BRIEFS_PROMPT)
        self.examples_from_briefs_prompt = ChatPromptTemplate.from_messages(EXAMPLES_FROM_BRIEFS_PROMPT)
        self.examples_directly_prompt = ChatPromptTemplate.from_messages(EXAMPLES_DIRECTLY_PROMPT)

        json_model = model.bind(response_format={"type": "json_object"})
        # json_model = model

        output_parser = StrOutputParser()
        json_parse = JsonOutputParser()

        self.description_chain = (self.description_prompt | json_model | json_parse).with_retry(
            retry_if_exception_type=(BadRequestError,), # Retry only on BadRequestError
            wait_exponential_jitter=True, # Add jitter to the exponential backoff
            stop_after_attempt=2 # Try twice
        ).with_fallbacks([RunnableLambda(lambda x: {"description": ""})]) | (lambda x: x["description"])
        self.description_updating_chain = (self.description_updating_prompt | json_model | json_parse).with_retry(
            retry_if_exception_type=(BadRequestError,), # Retry only on BadRequestError
            wait_exponential_jitter=True, # Add jitter to the exponential backoff
            stop_after_attempt=2 # Try twice
        ).with_fallbacks([RunnableLambda(lambda x: {"description": ""})]) | (lambda x: x["description"])
        self.specification_suggestions_chain = (self.specification_suggestions_prompt | json_model | json_parse).with_retry(
            retry_if_exception_type=(BadRequestError,), # Retry only on BadRequestError
            wait_exponential_jitter=True, # Add jitter to the exponential backoff
            stop_after_attempt=2 # Try twice
        ).with_fallbacks([RunnableLambda(lambda x: {"dimensions": [], "suggestions": []})])
        self.generalization_suggestions_chain = (self.generalization_suggestions_prompt | json_model | json_parse).with_retry(
            retry_if_exception_type=(BadRequestError,), # Retry only on BadRequestError
            wait_exponential_jitter=True, # Add jitter to the exponential backoff
            stop_after_attempt=2 # Try twice
        ).with_fallbacks([RunnableLambda(lambda x: {"dimensions": [], "suggestions": []})])
        self.input_analysis_chain = self.input_analysis_prompt | model | output_parser
        # self.briefs_chain = self.briefs_prompt | model | output_parser
        self.briefs_chain = self.briefs_prompt | json_model | json_parse | RunnableLambda(lambda x: x["new_example_briefs"])
        self.examples_from_briefs_chain = (self.examples_from_briefs_prompt | json_model | json_parse).with_retry(
            retry_if_exception_type=(BadRequestError,), # Retry only on BadRequestError
            wait_exponential_jitter=True, # Add jitter to the exponential backoff
            stop_after_attempt=2 # Try twice
        ).with_fallbacks([RunnableLambda(lambda x: {"examples": []})])
        self.examples_directly_chain = (self.examples_directly_prompt | json_model | json_parse).with_retry(
            retry_if_exception_type=(BadRequestError,), # Retry only on BadRequestError
            wait_exponential_jitter=True, # Add jitter to the exponential backoff
            stop_after_attempt=2 # Try twice
        ).with_fallbacks([RunnableLambda(lambda x: {"examples": []})])

        # New sub-chain for loading and validating input
        self.input_loader = RunnableLambda(self.load_and_validate_input)

        self.chain = (
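            # Full pipeline: parse/validate the raw input, derive a task
            # description, then fan out into (a) input analysis -> example
            # briefs -> examples, (b) direct example generation, and
            # (c) specification/generalization suggestions; finally merge
            # both example sources into `additional_examples`.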
            self.input_loader
            | RunnablePassthrough.assign(raw_example=lambda x: json.dumps(x["example"], ensure_ascii=False))
            | RunnablePassthrough.assign(description=self.description_chain)
            | {
                "description": lambda x: x["description"],
                "examples_from_briefs": RunnablePassthrough.assign(input_analysis=self.input_analysis_chain)
                | RunnablePassthrough.assign(new_example_briefs=self.briefs_chain)
                | RunnablePassthrough.assign(examples=self.examples_from_briefs_chain | (lambda x: x["examples"])),
                "examples_directly": self.examples_directly_chain,
                "suggestions": {
                    "specification": self.specification_suggestions_chain,
                    "generalization": self.generalization_suggestions_chain
                } | RunnableLambda(lambda x: [item['suggestion'] for sublist in [v['suggestions'] for v in x.values()] for item in sublist])
            }
            | RunnablePassthrough.assign(
                additional_examples=lambda x: (
                    list(x["examples_from_briefs"]["examples"])
                    + list(x["examples_directly"]["examples"])
                )
            )
        )

    def parse_input_str(self, input_str):
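        """Parse a JSON or YAML string into a single {'input', 'output'} dict
        or a list of such dicts; list items missing either field are dropped.
        """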
        try:
            example_dict = json.loads(input_str)
        except ValueError:
            try:
                example_dict = yaml.safe_load(input_str)
            except yaml.YAMLError as e:
                raise ValueError("Invalid input format. Expected a JSON or YAML object.") from e
        
        # If example_dict is a list, filter out invalid items
        if isinstance(example_dict, list):
            example_dict = [item for item in example_dict if isinstance(item, dict) and 'input' in item and 'output' in item]

        # If example_dict is not a list, check if it's a valid dict
        elif not isinstance(example_dict, dict) or 'input' not in example_dict or 'output' not in example_dict:
            raise ValueError("Invalid input format. Expected an object with 'input' and 'output' fields.")
        
        return example_dict

    def load_and_validate_input(self, input_dict):
        input_str = input_dict["input_str"]
        generating_batch_size = input_dict.get("generating_batch_size")

        try:
            example_dict = self.parse_input_str(input_str)
            # Move the original content to a key named 'example'
            input_dict = {"example": example_dict}
            if generating_batch_size is not None:
                input_dict["generating_batch_size"] = generating_batch_size

            return input_dict

        except Exception as e:
            raise RuntimeError(f"An error occurred during processing: {str(e)}") from e

    def process(self, input_str, generating_batch_size=3):
        input_dict = {"input_str": input_str, "generating_batch_size": generating_batch_size}
        result = self.chain.invoke(input_dict)
        return result

    def generate_description(self, input_str, generating_batch_size=3):
        chain = (
            self.input_loader 
            | RunnablePassthrough.assign(raw_example=lambda x: json.dumps(x["example"], ensure_ascii=False))
            | RunnablePassthrough.assign(description=self.description_chain)
            | {
                "description": lambda x: x["description"],
                "suggestions": {
                    "specification": self.specification_suggestions_chain,
                    "generalization": self.generalization_suggestions_chain
                } | RunnableLambda(lambda x: [item['suggestion'] for sublist in [v['suggestions'] for v in x.values() if 'suggestions' in v] for item in sublist if 'suggestion' in item])
            }
        )
        return chain.invoke({
            "input_str": input_str,
            "generating_batch_size": generating_batch_size
        })
    
    def update_description(self, input_str, description, suggestions):
        # package array suggestions into a JSON array
        suggestions_str = json.dumps(suggestions, ensure_ascii=False)

        # return the updated description with new suggestions
        chain = (
            RunnablePassthrough.assign(
                description=self.description_updating_chain
            )
            | {
                "description": lambda x: x["description"],
                "suggestions": {
                    "specification": self.specification_suggestions_chain,
                    "generalization": self.generalization_suggestions_chain
                } | RunnableLambda(lambda x: [item['suggestion'] for sublist in [v['suggestions'] for v in x.values() if 'suggestions' in v] for item in sublist if 'suggestion' in item])
            }
        )
        return chain.invoke({
            "raw_example": input_str,
            "description": description,
            "suggestions": suggestions_str
        })

    def generate_suggestions(self, input_str, description):
        chain = RunnablePassthrough.assign(
            suggestions={
                "specification": self.specification_suggestions_chain,
                "generalization": self.generalization_suggestions_chain
            } | RunnableLambda(lambda x: [item['suggestion'] for sublist in [v['suggestions'] for v in x.values() if 'suggestions' in v] for item in sublist if 'suggestion' in item])
        )
        return chain.invoke({
            "description": description,
            "raw_example": input_str
        })

    def analyze_input(self, description):
        return self.input_analysis_chain.invoke(description)

    def generate_briefs(self, description, input_analysis, generating_batch_size):
        return self.briefs_chain.invoke({
            "description": description,
            "input_analysis": input_analysis,
            "generating_batch_size": generating_batch_size
        })

    def generate_examples_from_briefs(self, description, new_example_briefs, input_str, generating_batch_size=3):
        chain = (
            self.input_loader
            | RunnablePassthrough.assign(
                raw_example = lambda x: json.dumps(x["example"], ensure_ascii=False),
                description = lambda x: description,
                new_example_briefs = lambda x: new_example_briefs
            )
            | self.examples_from_briefs_chain
        )
        return chain.invoke({
            "description": description,
            "new_example_briefs": new_example_briefs,
            "input_str": input_str,
            "generating_batch_size": generating_batch_size
        })

    def generate_examples_directly(self, description, raw_example, generating_batch_size):
        return self.examples_directly_chain.invoke({
            "description": description,
            "raw_example": raw_example,
            "generating_batch_size": generating_batch_size
        })
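

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module). It assumes an
# OpenAI-compatible chat model via langchain_openai.ChatOpenAI and an
# OPENAI_API_KEY in the environment; any chat model that supports
# bind(response_format={"type": "json_object"}) should work the same way.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from langchain_openai import ChatOpenAI  # assumed provider, not required by the class

    model = ChatOpenAI(model="gpt-4o-mini", temperature=0.7)
    generator = TaskDescriptionGenerator(model)

    # A single seed input/output example, passed as a JSON string.
    seed_example = json.dumps({
        "input": "Revenue was $1.2M and expenses were $0.9M this quarter.",
        "output": "Profit: $0.3M (25% margin)"
    })

    result = generator.process(seed_example, generating_batch_size=3)
    print(result["description"])
    print(json.dumps(result["additional_examples"], ensure_ascii=False, indent=2))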