import re
from src.models.workflow_graph import Edge, Node, Graph

class HelperClass:

    @staticmethod
    def _build_prompt(project_desc: str, modules: list) -> str:
        return f'''
                You are an advanced AI tasked with constructing a directed graph/flow based on a set of available modules and a project description. Each module in the flow represents a node, and each edge defines the task connecting these nodes. 
                Your output must adhere strictly to the following rules: Don't give me any code and don't mention 'json' at the top of the response.
                There should not be any extra output (even a single word) besides the required output.

                The flow of nodes and tasks must be determined by analyzing the provided project description.
                The modules chosen must form a complete pipeline suitable for the tasks in the project description.



                -Steps-
                1. Parse the project description to identify the tasks and operations required to form the flow.
                - For each task, determine which module (from the available list) best fits the task description.
                - Assign a unique identifier to every instance of a module.
                - Example: If the "Train" module is used twice for lets say training 2 different model, name them both Train with a unique id to both of them.
                - For each identified node:
                - Node ID: Generate a unique identifier for the module instance (a random 5-character lowercase string combining letters and digits).
                - Module Name: Name of the module from the available list.

                Format each node as:
                <unique Node ID>Module Name</unique Node ID>

                2. Construct Edges Between Nodes
                - Determine the logical sequence of tasks from the project description.
                - Identify source and target modules for each transition based on the task flow.
                - For each connection, output the following information:
                - Source Node: The unique ID of the starting module. (Use the IDs generated for each module in Step 1)
                - Target Node: The unique ID of the destination module. (Use the IDs generated for each module in Step 1)
                - Task Description: A short description of what is happening during the transition.

                Format each edge as:
                <Edge index>( sourceNode="<Node ID>" | targetNode="<Node ID>" | task="<Task Description>" )</Edge index>


                ######################
                -Examples-
                ######################
                Example 1:

                Input: Project Description: 

                This project implements an automated quality control system for manufacturing using a modular machine learning pipeline. Data from high-resolution product images and metadata is ingested and augmented to enhance diversity and balance. 
                Task A trains a CNN for defect detection, while Task B trains a transformer-based model for quality classification. Both models are rigorously evaluated and compared against predefined benchmarks. Successful models are deployed for real-time defect monitoring and automated grading via integration with production and ERP systems. 

                Input: Available Modules: 

                ['IngestData',
                'AugmentData',
                'GenerateData',
                'SearchData',
                'Train',
                'Evaluate',
                'TriggerDeployment',
                'ComparePerformance']

                -------------------------------------

                Flow Generated by LLM: ( This will not be Input )

                IngestData -> AugmentData
                AugmentData -> Train (for Task A)
                AugmentData -> Train (for Task B)
                Train (model A) -> Evaluate (test model A)
                Train (model B) -> Evaluate (test model B)
                Evaluate (test model A) -> ComparePerformance
                Evaluate (test model B) -> ComparePerformance
                ComparePerformance -> TriggerDeployment


                ################

                Output:

                <p83fd>IngestData</p83fd>
                <sb9ba>AugmentData</sb9ba>
                <bxt2w>Train A</bxt2w>
                <d1ep3>Train B</d1ep3>
                <b9lca>Evaluate A</b9lca>
                <5w01f>Evaluate B</5w01f>
                <z4bun>ComparePerformance</z4bun>
                <zj2pb>TriggerDeployment</zj2pb>
                <Edge 1>( sourceNode="<p83fd>" | targetNode="<sb9ba>" | task = "Ingesting Data to Augment Data"  )</Edge 1>
                <Edge 2>( sourceNode="<sb9ba>" | targetNode="<bxt2w>" | task = "Augmenting Data to Train model A"  )</Edge 2>
                <Edge 3>( sourceNode="<sb9ba>" | targetNode="<d1ep3>" | task = "Augmenting Data to Train model B"  )</Edge 3>
                <Edge 4>( sourceNode="<bxt2w>" | targetNode="<b9lca>" | task = "Training A to Evaluate model A"  )</Edge 4>
                <Edge 5>( sourceNode="<d1ep3>" | targetNode="<5w01f>" | task = "Training B to Evaluate model B"  )</Edge 5>
                <Edge 6>( sourceNode="<b9lca>" | targetNode="<z4bun>" | task = "Evaluate model A to Compare Performance"  )</Edge 6>
                <Edge 7>( sourceNode="<5w01f>" | targetNode="<z4bun>" | task = "Evaluate model B to Compare Performance"  )</Edge 7>
                <Edge 8>( sourceNode="<z4bun>" | targetNode="<zj2pb>" | task = "Compare Performance to Trigger Deployment"  )</Edge 8>

                #############################

                Example 2:

                Input: Project Description: 

                This project develops an automated crop health monitoring system using a modular machine learning pipeline. Data from satellite and drone imagery is ingested and preprocessed, followed by augmentation techniques to increase diversity and balance. 
                Synthetic data is generated to simulate various crop conditions, enhancing model robustness. The pipeline trains a deep learning model to classify crop health, evaluates its performance on key metrics such as accuracy and recall, and identifies areas for improvement. 
                Once performance benchmarks are met, the system is deployed for real-time crop monitoring, enabling farmers to make informed decisions and optimize agricultural productivity efficiently.

                Input: Available Modules: 

                ['IngestData',
                'AugmentData',
                'GenerateData',
                'SearchData',
                'Train',
                'Evaluate',
                'TriggerDeployment',
                'ComparePerformance']

                -------------------------------------

                Flow Generated by LLM: ( This will not be Input )

                IngestData -> AugmentData
                AugmentData -> GenerateData
                GenerateData -> Train
                Train -> Evaluate
                Evaluate -> TriggerDeployment

                ################

                Output:

                <p001>IngestData</p001>  
                <p002>AugmentData</p002>  
                <p003>GenerateData</p003>  
                <p004>Train</p004>  
                <p005>Evaluate</p005>  
                <p006>TriggerDeployment</p006>  
                <Edge 1>( sourceNode="<p001>" | targetNode="<p002>" | task="Ingesting Data to Augmenting Data" )</Edge 1>  
                <Edge 2>( sourceNode="<p002>" | targetNode="<p003>" | task="Augmenting Data to Generating Synthetic Data" )</Edge 2>  
                <Edge 3>( sourceNode="<p003>" | targetNode="<p004>" | task="Generating Data to Training Model" )</Edge 3>  
                <Edge 4>( sourceNode="<p004>" | targetNode="<p005>" | task="Training Model to Evaluating Performance" )</Edge 4>  
                <Edge 5>( sourceNode="<p005>" | targetNode="<p006>" | task="Evaluating Model to Triggering Deployment" )</Edge 5>  

                #############################

                Example 3:

                Input: Project Description:

                This project implements a robust machine learning pipeline for iterative model improvement. Data is ingested and preprocessed, followed by augmentation to enhance diversity and balance. 
                An initial model is trained on the augmented data. The pipeline then applies further data augmentation techniques tailored to improve underperforming areas, followed by retraining the model for enhanced accuracy. 
                The improved model is rigorously evaluated on a test dataset to ensure it meets predefined performance benchmarks. Upon achieving the desired metrics, the best-performing model is deployed to production, ensuring reliable and efficient real-world performance tailored to the project's objectives.

                Input: Available Modules: 

                ['IngestData',
                'AugmentData',
                'GenerateData',
                'SearchData',
                'Train',
                'Evaluate',
                'TriggerDeployment',
                'ComparePerformance']

                -------------------------------------

                Flow Generated by LLM: ( This will not be Input )

                IngestData -> AugmentData (Stage 1)
                AugmentData (Stage 1) -> Train (Stage 1)
                Train (Stage 1) -> AugmentData (Stage 2)
                AugmentData (Stage 2) -> Train (Stage 2)
                Train (Stage 2) -> Evaluate
                Evaluate -> TriggerDeployment

                ################    

                Output:

                <m001>IngestData</m001>  
                <m002>AugmentData Stage 1</m002>  
                <m003>Train Stage 1</m003>  
                <m004>AugmentData Stage 2</m004>  
                <m005>Train Stage 2</m005>  
                <m006>Evaluate</m006>  
                <m007>TriggerDeployment</m007>  
                <Edge 1>( sourceNode="<m001>" | targetNode="<m002>" | task="Ingesting Data to Augmenting Data Stage 1" )</Edge 1>  
                <Edge 2>( sourceNode="<m002>" | targetNode="<m003>" | task="Augmenting Data Stage 1 to Training Stage 1" )</Edge 2>  
                <Edge 3>( sourceNode="<m003>" | targetNode="<m004>" | task="Training Stage 1 to Augmenting Data Stage 2" )</Edge 3>  
                <Edge 4>( sourceNode="<m004>" | targetNode="<m005>" | task="Augmenting Data Stage 2 to Training Stage 2" )</Edge 4>  
                <Edge 5>( sourceNode="<m005>" | targetNode="<m006>" | task="Training Stage 2 to Evaluating Model" )</Edge 6>  
                <Edge 6>( sourceNode="<m006>" | targetNode="<m007>" | task="Evaluating Model to Triggering Deployment" )</Edge 6>  

                #############################


                When you give the output, don't mention anything like 'Here is the list of Nodes and Edges extracted from the text:'. Just give the response straight away.

                

                -Real Data-
                ######################

                Input: Project Description: {project_desc}

                **Instructions**

                1. Below is the list of modules available for building the pipeline. You must only use these modules to form the flow.
                2. Do not generate new names for modules. Only use the names exactly as they appear in the list.

                Input: Available Modules: 

                {modules}

                ######################

                Output:
        '''

    @staticmethod
    def _parse_llm_response(raw_response: str) -> Graph:

        # Matches either a node declaration (<id>Name</id>) or an edge body
        # (sourceNode="<id>" | targetNode="<id>" | task="...").
        pattern = r'<([a-zA-Z0-9]+)>([^<]+)<\/\1>|sourceNode="<([^"]+)>"\s*\|\s*targetNode="<([^"]+)>"\s*\|\s*task="([^"]+)"'
        nodes, edges = [], []

        for line in raw_response.split('\n'):
            matches = re.findall(pattern, line)

            try:
                for match in matches:
                    if match[0]:
                        # Groups 1-2 matched: a node declaration.
                        nodes.append(Node(node_id=match[0], name=match[1]))

                    elif match[2]:
                        # Groups 3-5 matched: an edge with source, target and task.
                        edges.append(Edge(source=match[2], target=match[3], desc=match[4]))

            except Exception as e:
                print(f"Error parsing line: {line}, error: {e}")

        return Graph(nodes=nodes, edges=edges)

    @staticmethod
    def _store_graph(graph_data: Graph) -> dict:

        nodes, edges = [], []
        id_to_name = {}

        for node in graph_data.nodes:
            id_to_name[node.node_id] = node.name

            nodes.append({
                'node_id': node.node_id,
                'name': node.name
            })

        # Implicit start node that edges are allowed to reference.
        id_to_name['Start'] = 'StartNode'

        for edge in graph_data.edges:
            # These lookups validate that both endpoints were declared as nodes;
            # an unknown ID raises a KeyError. The looked-up names are not used further.
            source_node = id_to_name[edge.source]
            target_node = id_to_name[edge.target]
            edges.append({
                'source': edge.source,
                'target': edge.target,
                'desc': edge.desc
            })

        return {'Nodes': nodes, 'Edges': edges}

helper = HelperClass()
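
# --- Usage sketch (illustrative only) ---
# A minimal example of how these helpers might be chained together. The
# `call_llm(prompt) -> str` client referenced in the comments is a hypothetical
# placeholder and is not part of this module; the sample response below mirrors
# the node/edge format the prompt asks for.
if __name__ == "__main__":
    sample_response = '\n'.join([
        '<p001>IngestData</p001>',
        '<p002>Train</p002>',
        '<Edge 1>( sourceNode="<p001>" | targetNode="<p002>" | task="Ingesting Data to Training Model" )</Edge 1>',
    ])

    # prompt = helper._build_prompt("Train a model on ingested data.", ['IngestData', 'Train'])
    # raw_response = call_llm(prompt)  # hypothetical LLM client, not defined here
    graph = helper._parse_llm_response(sample_response)
    print(helper._store_graph(graph))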