mszel committed on
Commit
76e89e6
·
1 Parent(s): 2148b2a

Adding the Task Solver box and an example workspace

Browse files
examples/LynxScribe Data Cleaning.lynxkite.json ADDED
@@ -0,0 +1,345 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "edges": [
3
+ {
4
+ "id": "LynxScribe Task Solver 1 View DataFrame 1",
5
+ "source": "LynxScribe Task Solver 1",
6
+ "sourceHandle": "output",
7
+ "target": "View DataFrame 1",
8
+ "targetHandle": "input"
9
+ },
10
+ {
11
+ "id": "Read Excel 1 LynxScribe Task Solver 1",
12
+ "source": "Read Excel 1",
13
+ "sourceHandle": "output",
14
+ "target": "LynxScribe Task Solver 1",
15
+ "targetHandle": "dataframe"
16
+ },
17
+ {
18
+ "id": "LynxScribe Message 3 LynxScribe Task Solver 1",
19
+ "source": "LynxScribe Message 3",
20
+ "sourceHandle": "output",
21
+ "target": "LynxScribe Task Solver 1",
22
+ "targetHandle": "system_prompt"
23
+ },
24
+ {
25
+ "id": "LynxScribe Message 1 LynxScribe Task Solver 1",
26
+ "source": "LynxScribe Message 1",
27
+ "sourceHandle": "output",
28
+ "target": "LynxScribe Task Solver 1",
29
+ "targetHandle": "instruction_prompt"
30
+ }
31
+ ],
32
+ "env": "LynxScribe",
33
+ "nodes": [
34
+ {
35
+ "data": {
36
+ "__execution_delay": 0.0,
37
+ "collapsed": null,
38
+ "display": null,
39
+ "error": null,
40
+ "input_metadata": null,
41
+ "meta": {
42
+ "inputs": {
43
+ "dataframe": {
44
+ "name": "dataframe",
45
+ "position": "left",
46
+ "type": {
47
+ "type": "<class 'inspect._empty'>"
48
+ }
49
+ },
50
+ "instruction_prompt": {
51
+ "name": "instruction_prompt",
52
+ "position": "bottom",
53
+ "type": {
54
+ "type": "<class 'inspect._empty'>"
55
+ }
56
+ },
57
+ "system_prompt": {
58
+ "name": "system_prompt",
59
+ "position": "bottom",
60
+ "type": {
61
+ "type": "<class 'inspect._empty'>"
62
+ }
63
+ }
64
+ },
65
+ "name": "LynxScribe Task Solver",
66
+ "outputs": {
67
+ "output": {
68
+ "name": "output",
69
+ "position": "right",
70
+ "type": {
71
+ "type": "None"
72
+ }
73
+ }
74
+ },
75
+ "params": {
76
+ "llm_interface": {
77
+ "default": "openai",
78
+ "name": "llm_interface",
79
+ "type": {
80
+ "type": "<class 'str'>"
81
+ }
82
+ },
83
+ "llm_model_name": {
84
+ "default": "gpt-4o",
85
+ "name": "llm_model_name",
86
+ "type": {
87
+ "type": "<class 'str'>"
88
+ }
89
+ },
90
+ "new_column_names": {
91
+ "default": "processed_field",
92
+ "name": "new_column_names",
93
+ "type": {
94
+ "type": "<class 'str'>"
95
+ }
96
+ }
97
+ },
98
+ "type": "basic"
99
+ },
100
+ "params": {
101
+ "llm_interface": "openai",
102
+ "llm_model_name": "gpt-4o",
103
+ "new_column_names": "zip_code, country, state_or_county, city, district, street_name, house_number, floor, flat_number, additional_info"
104
+ },
105
+ "status": "done",
106
+ "title": "LynxScribe Task Solver"
107
+ },
108
+ "dragHandle": ".bg-primary",
109
+ "height": 292.0,
110
+ "id": "LynxScribe Task Solver 1",
111
+ "position": {
112
+ "x": 576.0,
113
+ "y": 209.0
114
+ },
115
+ "type": "basic",
116
+ "width": 432.0
117
+ },
118
+ {
119
+ "data": {
120
+ "display": null,
121
+ "error": null,
122
+ "input_metadata": null,
123
+ "meta": {
124
+ "inputs": {
125
+ "input": {
126
+ "name": "input",
127
+ "position": "left",
128
+ "type": {
129
+ "type": "<class 'inspect._empty'>"
130
+ }
131
+ }
132
+ },
133
+ "name": "View DataFrame",
134
+ "outputs": {},
135
+ "params": {},
136
+ "position": {
137
+ "x": 2162.0,
138
+ "y": 266.0
139
+ },
140
+ "type": "table_view"
141
+ },
142
+ "params": {},
143
+ "status": "done",
144
+ "title": "View DataFrame"
145
+ },
146
+ "dragHandle": ".bg-primary",
147
+ "height": 309.0,
148
+ "id": "View DataFrame 1",
149
+ "position": {
150
+ "x": 1350.0,
151
+ "y": 210.0
152
+ },
153
+ "type": "table_view",
154
+ "width": 662.0
155
+ },
156
+ {
157
+ "data": {
158
+ "__execution_delay": 0.0,
159
+ "collapsed": null,
160
+ "display": null,
161
+ "error": null,
162
+ "input_metadata": null,
163
+ "meta": {
164
+ "inputs": {},
165
+ "name": "Read Excel",
166
+ "outputs": {
167
+ "output": {
168
+ "name": "output",
169
+ "position": "right",
170
+ "type": {
171
+ "type": "None"
172
+ }
173
+ }
174
+ },
175
+ "params": {
176
+ "columns": {
177
+ "default": "",
178
+ "name": "columns",
179
+ "type": {
180
+ "type": "<class 'str'>"
181
+ }
182
+ },
183
+ "file_path": {
184
+ "default": null,
185
+ "name": "file_path",
186
+ "type": {
187
+ "type": "<class 'str'>"
188
+ }
189
+ },
190
+ "sheet_name": {
191
+ "default": "Sheet1",
192
+ "name": "sheet_name",
193
+ "type": {
194
+ "type": "<class 'str'>"
195
+ }
196
+ }
197
+ },
198
+ "position": {
199
+ "x": 429.0,
200
+ "y": 234.0
201
+ },
202
+ "type": "basic"
203
+ },
204
+ "params": {
205
+ "columns": "",
206
+ "file_path": "uploads/task_solver_examples.xlsx",
207
+ "sheet_name": "address_example"
208
+ },
209
+ "status": "done",
210
+ "title": "Read Excel"
211
+ },
212
+ "dragHandle": ".bg-primary",
213
+ "height": 296.0,
214
+ "id": "Read Excel 1",
215
+ "position": {
216
+ "x": 11.0,
217
+ "y": 207.0
218
+ },
219
+ "type": "basic",
220
+ "width": 400.0
221
+ },
222
+ {
223
+ "data": {
224
+ "__execution_delay": 0.0,
225
+ "collapsed": null,
226
+ "display": null,
227
+ "error": null,
228
+ "input_metadata": null,
229
+ "meta": {
230
+ "inputs": {},
231
+ "name": "LynxScribe Message",
232
+ "outputs": {
233
+ "output": {
234
+ "name": "output",
235
+ "position": "top",
236
+ "type": {
237
+ "type": "None"
238
+ }
239
+ }
240
+ },
241
+ "params": {
242
+ "prompt_content": {
243
+ "default": null,
244
+ "name": "prompt_content",
245
+ "type": {
246
+ "format": "textarea"
247
+ }
248
+ },
249
+ "prompt_role": {
250
+ "default": null,
251
+ "name": "prompt_role",
252
+ "type": {
253
+ "enum": [
254
+ "SYSTEM",
255
+ "USER"
256
+ ]
257
+ }
258
+ }
259
+ },
260
+ "position": {
261
+ "x": 653.0,
262
+ "y": 954.0
263
+ },
264
+ "type": "basic"
265
+ },
266
+ "params": {
267
+ "prompt_content": "You are an AI assistant designed to clean and extract structured address information from raw text.\nYour goal is to identify and extract relevant address components while ignoring any unrelated information.\nThe output must be formatted as a structured dictionary.\n\nYour task is to parse an address from raw text and return a dictionary with the following keys:\n - zip_code: The postal or ZIP code.\n - country: The country name.\n - state_or_county: The state, province, or county (if applicable).\n - city: The city or town name.\n - district: The district or borough name (if mentioned).\n - street_type: The type of public space (e.g., street, avenue, boulevard, square).\n - street_name: The name of the public space (e.g., Main, Baker, Champs-\u00c9lys\u00e9es).\n - house_number: The house or building number.\n - floor: The floor number (if mentioned).\n - flat_number: The apartment or unit number (if mentioned).\n - additional_info: Any other useful details, such as building names, or known landmarks.\n\nIf any information is missing from the input, leave the corresponding key as an empty string.\n\nYou must return only a python dictionary with the following keys:\n`zip_code`, `country`, `state_or_county`, `city`, `district`, `street_name`, \n`house_number`, `floor`, `flat_number`, `additional_info`.\n\nDo not include any extra text, comments, or explanations\u2014only return the dictionary.",
268
+ "prompt_role": null
269
+ },
270
+ "status": "done",
271
+ "title": "LynxScribe Message"
272
+ },
273
+ "dragHandle": ".bg-primary",
274
+ "height": 354.0,
275
+ "id": "LynxScribe Message 3",
276
+ "position": {
277
+ "x": 36.0,
278
+ "y": 561.0
279
+ },
280
+ "type": "basic",
281
+ "width": 740.0
282
+ },
283
+ {
284
+ "data": {
285
+ "__execution_delay": 0.0,
286
+ "collapsed": null,
287
+ "display": null,
288
+ "error": null,
289
+ "input_metadata": null,
290
+ "meta": {
291
+ "inputs": {},
292
+ "name": "LynxScribe Message",
293
+ "outputs": {
294
+ "output": {
295
+ "name": "output",
296
+ "position": "top",
297
+ "type": {
298
+ "type": "None"
299
+ }
300
+ }
301
+ },
302
+ "params": {
303
+ "prompt_content": {
304
+ "default": null,
305
+ "name": "prompt_content",
306
+ "type": {
307
+ "format": "textarea"
308
+ }
309
+ },
310
+ "prompt_role": {
311
+ "default": null,
312
+ "name": "prompt_role",
313
+ "type": {
314
+ "enum": [
315
+ "SYSTEM",
316
+ "USER"
317
+ ]
318
+ }
319
+ }
320
+ },
321
+ "position": {
322
+ "x": 1498.0,
323
+ "y": 660.0
324
+ },
325
+ "type": "basic"
326
+ },
327
+ "params": {
328
+ "prompt_content": "Extract structured address information from the following text: {message_parts}",
329
+ "prompt_role": "USER"
330
+ },
331
+ "status": "done",
332
+ "title": "LynxScribe Message"
333
+ },
334
+ "dragHandle": ".bg-primary",
335
+ "height": 347.0,
336
+ "id": "LynxScribe Message 1",
337
+ "position": {
338
+ "x": 817.0,
339
+ "y": 566.0
340
+ },
341
+ "type": "basic",
342
+ "width": 498.0
343
+ }
344
+ ]
345
+ }
examples/LynxScribe Image Search.lynxkite.json CHANGED
@@ -292,14 +292,10 @@
292
  }
293
  }
294
  },
295
- "position": {
296
- "x": 1260.0,
297
- "y": 166.0
298
- },
299
  "type": "basic"
300
  },
301
  "params": {
302
- "chat": "Show me a picture about doctors and patients!"
303
  },
304
  "status": "done",
305
  "title": "Input chat"
@@ -355,10 +351,6 @@
355
  }
356
  }
357
  },
358
- "position": {
359
- "x": 1987.0,
360
- "y": 365.0
361
- },
362
  "type": "basic"
363
  },
364
  "params": {
@@ -379,7 +371,7 @@
379
  },
380
  {
381
  "data": {
382
- "display": "https://storage.googleapis.com/lynxkite_public_data/lynxscribe-images/image-rag-test/bethesda-naval-medical-center-80380_1280.jpg",
383
  "error": null,
384
  "input_metadata": null,
385
  "meta": {
@@ -395,10 +387,6 @@
395
  "name": "LynxScribe Image Result Viewer",
396
  "outputs": {},
397
  "params": {},
398
- "position": {
399
- "x": 2326.0,
400
- "y": 319.0
401
- },
402
  "type": "image"
403
  },
404
  "params": {},
 
292
  }
293
  }
294
  },
 
 
 
 
295
  "type": "basic"
296
  },
297
  "params": {
298
+ "chat": "Show me a picture about fruits"
299
  },
300
  "status": "done",
301
  "title": "Input chat"
 
351
  }
352
  }
353
  },
 
 
 
 
354
  "type": "basic"
355
  },
356
  "params": {
 
371
  },
372
  {
373
  "data": {
374
+ "display": "https://storage.googleapis.com/lynxkite_public_data/lynxscribe-images/image-rag-test/food-405521_1280.jpg",
375
  "error": null,
376
  "input_metadata": null,
377
  "meta": {
 
387
  "name": "LynxScribe Image Result Viewer",
388
  "outputs": {},
389
  "params": {},
 
 
 
 
390
  "type": "image"
391
  },
392
  "params": {},
examples/uploads/task_solver_examples.xlsx ADDED
Binary file (11.3 kB). View file
 
lynxkite-lynxscribe/src/lynxkite_lynxscribe/lynxscribe_ops.py CHANGED
@@ -28,7 +28,7 @@ from lynxscribe.components.chat.processors import (
28
  TruncateHistory,
29
  )
30
  from lynxscribe.components.chat.api import ChatAPI
31
- from lynxscribe.core.models.prompts import ChatCompletionPrompt
32
  from lynxscribe.components.rag.loaders import FAQTemplateLoader
33
 
34
  from lynxkite.core import ops
@@ -56,6 +56,11 @@ class RAGVersion(Enum):
56
  V2 = "v2"
57
 
58
 
 
 
 
 
 
59
  class RAGTemplate(BaseModel):
60
  """
61
  Model for RAG templates consisting of three tables: they are connected via scenario names.
@@ -672,6 +677,113 @@ def chat_processor(processor, *, _ctx: one_by_one.Context):
672
  return {"chat_processor": chat_processor, **cfg}
673
 
674
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
675
  @output_on_top
676
  @op("Truncate history")
677
  def truncate_history(*, max_tokens=10000):
@@ -718,6 +830,28 @@ def input_chat(*, chat: str):
718
  return {"text": chat}
719
 
720
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
721
  @op("View", view="table_view")
722
  def view(input):
723
  columns = [str(c) for c in input.keys() if not str(c).startswith("_")]
 
28
  TruncateHistory,
29
  )
30
  from lynxscribe.components.chat.api import ChatAPI
31
+ from lynxscribe.core.models.prompts import ChatCompletionPrompt, Message
32
  from lynxscribe.components.rag.loaders import FAQTemplateLoader
33
 
34
  from lynxkite.core import ops
 
56
  V2 = "v2"
57
 
58
 
59
class MessageRole(Enum):
    """Roles offered for the ``prompt_role`` parameter of the "LynxScribe Message" op."""

    # The member value (not the member name) is what lynxscribe_message passes
    # as ``role`` when it builds the Message object.
    SYSTEM = "system"
    USER = "user"
62
+
63
+
64
  class RAGTemplate(BaseModel):
65
  """
66
  Model for RAG templates consisting of three tables: they are connected via scenario names.
 
677
  return {"chat_processor": chat_processor, **cfg}
678
 
679
 
680
@output_on_top
@op("LynxScribe Message")
def lynxscribe_message(*, prompt_role: MessageRole, prompt_content: ops.LongStr):
    """
    Build a single prompt message from a role and a text body.

    The text is stripped of leading/trailing whitespace and the role's enum
    value is used as the message role. The message is returned wrapped in a
    dictionary under the "prompt_message" key.
    """
    role = prompt_role.value
    content = prompt_content.strip()
    return {"prompt_message": Message(role=role, content=content)}
685
+
686
+
687
@op("Read Excel")
def read_excel(*, file_path: str, sheet_name: str = "Sheet1", columns: str = ""):
    """
    Load one sheet of an Excel file into a DataFrame.

    ``columns`` is an optional comma-separated list of column names to keep.
    Names are trimmed; empty entries and names that are not present in the
    sheet are ignored. When ``columns`` is empty, every column is returned.

    Raises ValueError when ``columns`` is given but none of the requested
    names exists in the sheet.

    TODO: more general: several input/output versions.
    """
    sheet = pd.read_excel(file_path, sheet_name=sheet_name)
    if not columns:
        return sheet

    requested = (name.strip() for name in columns.split(","))
    selected = [name for name in requested if name and name in sheet.columns]
    if not selected:
        raise ValueError("No valid columns specified.")
    return sheet[selected].copy()
704
+
705
+
706
@ops.input_position(system_prompt="bottom", instruction_prompt="bottom", df="left")
@op("LynxScribe Task Solver")
@mem.cache
async def ls_task_solver(
    system_prompt,
    instruction_prompt,
    df,
    *,
    llm_interface: str = "openai",
    llm_model_name: str = "gpt-4o",
    new_column_names: str = "processed_field",
):
    """
    Solve the described task for every row of a data frame and store the answers in new columns.

    For each row, every ``{column_name}`` placeholder found in the instruction
    message is replaced by that row's value (converted with ``str``), and the
    system message plus the filled-in instruction message are sent to the LLM.
    All rows are requested concurrently.

    Parameters:
        system_prompt: sequence whose first item holds the system message
            under the "prompt_message" key.
        instruction_prompt: sequence whose first item holds the instruction
            message template under the "prompt_message" key.
        df: the input data frame; it is not modified, a copy is returned.
        llm_interface: name passed to ``get_llm_engine``.
        llm_model_name: model name put into every completion prompt.
        new_column_names: comma-separated names of the output column(s).
            With one name, the raw answer text is stored in that column.
            With several names, each answer is passed through
            ``dictionary_corrector`` and split into one column per name.

    Returns:
        A copy of ``df`` extended with the new column(s).

    Raises:
        ValueError: if ``new_column_names`` contains no usable name. This is
            checked before any LLM request is made.
    """
    import re

    # Validate the cheap, user-supplied parameter first so that a typo does not
    # cost one LLM call per row before the error surfaces.
    col_list = [_c.strip() for _c in new_column_names.split(",") if _c.strip()]
    if not col_list:
        raise ValueError("No valid column names specified.")

    # handling inputs
    system_message = system_prompt[0]["prompt_message"]
    instruction_message = instruction_prompt[0]["prompt_message"]
    template = instruction_message.content

    # preparing output
    out_df = df.copy()

    # connecting to the LLM
    # TODO: allow selecting the API key (e.g. via an `api_key_name` parameter).
    llm = get_llm_engine(name=llm_interface)

    # Collect the placeholders that actually occur in the template, together
    # with the column values that fill them. Columns are read by position and
    # labels are stringified, so non-string or duplicated labels do not break
    # the lookup (for duplicates the first matching column wins).
    placeholder_values = {}
    for position, column in enumerate(df.columns):
        placeholder = "{" + str(column) + "}"
        if placeholder in template and placeholder not in placeholder_values:
            placeholder_values[placeholder] = df.iloc[:, position].tolist()

    # One regex pass over the template per row: substituted cell values are
    # never re-scanned, so a cell that happens to contain "{other_column}" is
    # inserted verbatim instead of being expanded again.
    pattern = None
    if placeholder_values:
        pattern = re.compile("|".join(re.escape(p) for p in placeholder_values))

    # generate a list of instruction messages: one per row of the df
    # TODO: make it fast for large dataframes
    instruction_messages = []
    for row in range(len(df)):
        instruction_message_i = deepcopy(instruction_message)
        if pattern is not None:
            instruction_message_i.content = pattern.sub(
                lambda match, row=row: str(placeholder_values[match.group(0)][row]),
                template,
            )
        instruction_messages.append(instruction_message_i)

    # generate completion prompts
    completion_prompts = [
        ChatCompletionPrompt(
            model=llm_model_name,
            messages=[system_message, instruction_message_j],
        )
        for instruction_message_j in instruction_messages
    ]

    # get the answers
    tasks = [llm.acreate_completion(completion_prompt=_prompt) for _prompt in completion_prompts]
    out_completions = await asyncio.gather(*tasks)
    contents = [result.choices[0].message.content for result in out_completions]

    # answer post-processing: 1 vs more columns
    if len(col_list) == 1:
        out_df[col_list[0]] = contents
    else:
        answers = [dictionary_corrector(content, expected_keys=col_list) for content in contents]
        for col in col_list:
            out_df[col] = [answer[col] for answer in answers]

    return out_df
785
+
786
+
787
  @output_on_top
788
  @op("Truncate history")
789
  def truncate_history(*, max_tokens=10000):
 
830
  return {"text": chat}
831
 
832
 
833
@op("View DataFrame", view="table_view")
def view_df(input, *, _ctx: one_by_one.Context):
    """
    Collect the incoming data into the table structure shown by the table view.

    The result has the shape
    ``{"dataframes": {"df": {"columns": [...], "data": [[...], ...]}}}``.
    If ``_ctx.last_result`` already holds such a table, the new rows are
    appended to it (in place) using its existing column list; otherwise a new
    table is created from the input's keys, skipping names starting with "_".

    A pandas DataFrame input contributes one table row per DataFrame row.
    Any other input is treated as a single mapping-like record and contributes
    exactly one row.

    NOTE(review): the DataFrame branch was added because indexing a DataFrame
    by column name yields a whole column, not a cell, which looks like the
    reason this box was marked as "not working" -- confirm against the
    executor. Cell values are passed through as-is; whether the table view can
    display every value type (e.g. NaN, timestamps) is not verified here.
    """
    if isinstance(input, pd.DataFrame):
        available = input.columns

        def rows_for(columns):
            # itertuples yields one plain tuple of cell values per DataFrame row.
            return [list(row) for row in input[columns].itertuples(index=False, name=None)]
    else:
        available = input.keys()

        def rows_for(columns):
            return [[input[c] for c in columns]]

    v = _ctx.last_result
    if v:
        table = v["dataframes"]["df"]
        table["data"].extend(rows_for(table["columns"]))
    else:
        columns = [str(c) for c in available if not str(c).startswith("_")]
        v = {
            "dataframes": {
                "df": {
                    "columns": columns,
                    "data": rows_for(columns),
                }
            }
        }
    return v
853
+
854
+
855
  @op("View", view="table_view")
856
  def view(input):
857
  columns = [str(c) for c in input.keys() if not str(c).startswith("_")]