File size: 9,163 Bytes
76adccc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
import json
import yaml
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel, RunnableLambda
from langchain_core.output_parsers import JsonOutputParser
from langchain.output_parsers import YamlOutputParser

# Define prompt strings as constants

# Summarizes the seed JSON example(s) into a task-type description.
# Template variable: {raw_example}.
DESCRIPTION_PROMPT = [
    ("system", """Given the JSON example(s) for a task type:
     
{raw_example}

Provide a concise description of the task type, including the format and style
of the input and output. If there are multiple examples, provide an overall
description and ignore unique parts.

Format your response as follows:
Task Description: [Your description here]
""")
]

# Analyzes the task's input space (attributes, variation dimensions,
# constraints, edge cases, external factors, extensions).
# Template variable: {description}.
INPUT_ANALYSIS_PROMPT = [
    ("system", """For the specific task type, analyze the possible task inputs across multiple dimensions.
     
Conduct a detailed analysis and enumerate:

1. Core Attributes: Identify the fundamental properties or characteristics of this input type.
1. Variation Dimensions: For each dimension that may vary, specify:
   - Dimension name
   - Possible range of values or options
   - Impact on input nature or task difficulty
1. Constraints: List any rules or limitations that must be adhered to.
1. Edge Cases: Describe extreme or special scenarios that may test the robustness of task processing.
1. External Factors: Enumerate factors that might influence input generation or task completion.
1. Potential Extensions: Propose ways to expand or modify this input type to create new variants.

Format your response as follows:
Input Analysis: [Your analysis here]
"""),
    ("user", """Task Description:

{description}

""")
]

# Generates input-only example briefs as YAML (key 'new_example_briefs').
# Template variables: {generating_batch_size}, {description}, {input_analysis}.
BRIEFS_PROMPT = [
    ("system", """Given the task type description, and input analysis, generate
descriptions for {generating_batch_size} new examples with detailed attributes
based on this task type. But don't provide any detailed task output.

Use the input analysis to create diverse and comprehensive example briefs that
cover various input dimensions and attribute ranges.

Format your response as a valid YAML object with a single key 'new_example_briefs'
containing a YAML array of {generating_batch_size} objects, each with a
'example_brief' field.
"""),
    ("user", """Task Description:

{description}

Input Analysis:

{input_analysis}

""")
]

# Expands example briefs into full input/output examples as JSON (key 'examples').
# Template variables: {generating_batch_size}, {description}, {new_example_briefs}, {raw_example}.
EXAMPLES_FROM_BRIEFS_PROMPT = [
    ("system", """Given the task type description, brief descriptions for new examples, 
and JSON example(s), generate {generating_batch_size} more input/output examples for this task type,
strictly based on the brief descriptions. Ensure that the new examples are
consistent with the brief descriptions and do not introduce any new information
not present in the briefs.

Format your response as a valid JSON object with a single key 'examples' 
containing a JSON array of {generating_batch_size} objects, each with 'input' and 'output' fields.
"""),
    ("user", """Task Description:

{description}

New Example Briefs: 

{new_example_briefs}

Example(s):

{raw_example}

""")
]

# Generates input/output examples directly (no briefs) as JSON (key 'examples').
# Template variables: {generating_batch_size}, {description}, {raw_example}.
EXAMPLES_DIRECTLY_PROMPT = [
    ("system", """Given the task type description, and input/output example(s), generate {generating_batch_size}
new input/output examples for this task type.

Format your response as a valid JSON object with a single key 'examples' 
containing a JSON array of {generating_batch_size} objects, each with 'input' and 'output' fields.
"""),
    ("user", """Task Description:

{description}

Example(s):

{raw_example}

""")
]


class TaskDescriptionGenerator:
    """Generates a task description, input analysis, and new input/output
    examples from one or more seed examples, via composed LangChain chains.

    The seed input is a JSON or YAML string: either a single object with
    'input' and 'output' fields, or a list of such objects.
    """

    def __init__(self, model):
        """Build prompt templates and the composed runnable chains.

        Args:
            model: A LangChain chat model. It must support ``bind`` with
                ``response_format={"type": "json_object"}`` for the
                JSON-emitting chains.
        """
        self.description_prompt = ChatPromptTemplate.from_messages(DESCRIPTION_PROMPT)
        self.input_analysis_prompt = ChatPromptTemplate.from_messages(INPUT_ANALYSIS_PROMPT)
        self.briefs_prompt = ChatPromptTemplate.from_messages(BRIEFS_PROMPT)
        self.examples_from_briefs_prompt = ChatPromptTemplate.from_messages(EXAMPLES_FROM_BRIEFS_PROMPT)
        self.examples_directly_prompt = ChatPromptTemplate.from_messages(EXAMPLES_DIRECTLY_PROMPT)

        # Same model, constrained to emit a JSON object, for chains whose
        # output is parsed with JsonOutputParser.
        json_model = model.bind(response_format={"type": "json_object"})

        output_parser = StrOutputParser()
        json_parser = JsonOutputParser()

        self.description_chain = self.description_prompt | model | output_parser
        self.input_analysis_chain = self.input_analysis_prompt | model | output_parser
        self.briefs_chain = self.briefs_prompt | model | output_parser
        self.examples_from_briefs_chain = self.examples_from_briefs_prompt | json_model | json_parser
        self.examples_directly_chain = self.examples_directly_prompt | json_model | json_parser

        # Sub-chain for loading and validating the raw input string.
        self.input_loader = RunnableLambda(self.load_and_validate_input)

        # Full pipeline: load input -> describe -> (analyze + briefs ->
        # examples-from-briefs) in parallel with direct example generation,
        # then merge both example lists into 'additional_examples'.
        self.chain = (
            self.input_loader
            | RunnablePassthrough.assign(raw_example = lambda x: json.dumps(x["example"], ensure_ascii=False))
            | RunnablePassthrough.assign(description = self.description_chain)
            | {
                "description": lambda x: x["description"],
                "examples_from_briefs": RunnablePassthrough.assign(input_analysis = self.input_analysis_chain)
                    | RunnablePassthrough.assign(new_example_briefs = self.briefs_chain)
                    | RunnablePassthrough.assign(examples = self.examples_from_briefs_chain | (lambda x: x["examples"])),
                "examples_directly": self.examples_directly_chain
            }
            | RunnablePassthrough.assign(
                additional_examples=lambda x: (
                    list(x["examples_from_briefs"]["examples"])
                    + list(x["examples_directly"]["examples"])
                )
            )
        )

    def load_and_validate_input(self, input_dict):
        """Parse the raw input string (JSON, falling back to YAML) and
        validate that it holds 'input'/'output' example(s).

        Args:
            input_dict: Dict with 'input_str' (raw JSON/YAML text) and
                'generating_batch_size'.

        Returns:
            Dict with 'example' (the parsed example dict, or a list of the
            valid example dicts) and 'generating_batch_size'.

        Raises:
            RuntimeError: If the input cannot be parsed or contains no
                valid example(s); the original error is kept as __cause__.
        """
        input_str = input_dict["input_str"]
        generating_batch_size = input_dict["generating_batch_size"]

        try:
            try:
                example_dict = json.loads(input_str)
            except ValueError:
                try:
                    example_dict = yaml.safe_load(input_str)
                except yaml.YAMLError as e:
                    raise ValueError("Invalid input format. Expected a JSON or YAML object.") from e

            if isinstance(example_dict, list):
                # Best-effort: keep only well-formed examples, but reject
                # the input outright if nothing valid remains.
                example_dict = [item for item in example_dict if isinstance(item, dict) and 'input' in item and 'output' in item]
                if not example_dict:
                    raise ValueError("Invalid input format. Expected an object with 'input' and 'output' fields.")

            # A non-list input must be a single valid example object.
            elif not isinstance(example_dict, dict) or 'input' not in example_dict or 'output' not in example_dict:
                raise ValueError("Invalid input format. Expected an object with 'input' and 'output' fields.")

            # Move the original content to a key named 'example'
            return {"example": example_dict, "generating_batch_size": generating_batch_size}

        except Exception as e:
            # Chain the cause so the original parse/validation error is
            # visible in the traceback.
            raise RuntimeError(f"An error occurred during processing: {str(e)}") from e

    def process(self, input_str, generating_batch_size=3):
        """Run the full pipeline; returns the merged result dict."""
        input_dict = {"input_str": input_str, "generating_batch_size": generating_batch_size}
        return self.chain.invoke(input_dict)

    def generate_description(self, input_str, generating_batch_size=3):
        """Generate only the task description for the given seed input."""
        chain = (
            self.input_loader
            | RunnablePassthrough.assign(raw_example = lambda x: json.dumps(x["example"], ensure_ascii=False))
            | self.description_chain
        )
        return chain.invoke({
            "input_str": input_str,
            "generating_batch_size": generating_batch_size
        })

    def analyze_input(self, description):
        """Analyze the input space for a given task description string."""
        return self.input_analysis_chain.invoke(description)

    def generate_briefs(self, description, input_analysis, generating_batch_size):
        """Generate example briefs from a description and input analysis."""
        return self.briefs_chain.invoke({
            "description": description,
            "input_analysis": input_analysis,
            "generating_batch_size": generating_batch_size
        })

    def generate_examples_from_briefs(self, description, new_example_briefs, input_str, generating_batch_size=3):
        """Expand example briefs into full input/output examples."""
        chain = (
            self.input_loader
            | RunnablePassthrough.assign(
                raw_example = lambda x: json.dumps(x["example"], ensure_ascii=False),
                description = lambda x: description,
                new_example_briefs = lambda x: new_example_briefs
            )
            | self.examples_from_briefs_chain
        )
        return chain.invoke({
            "description": description,
            "new_example_briefs": new_example_briefs,
            "input_str": input_str,
            "generating_batch_size": generating_batch_size
        })

    def generate_examples_directly(self, description, raw_example, generating_batch_size):
        """Generate input/output examples directly from the seed example(s)."""
        return self.examples_directly_chain.invoke({
            "description": description,
            "raw_example": raw_example,
            "generating_batch_size": generating_batch_size
        })