|
|
|
import os |
|
from openai import AsyncOpenAI |
|
|
|
# Shared asynchronous OpenAI client used by the code below. The API key is
# looked up in the OPENAI_API_KEY environment variable (None when it is unset).
client = AsyncOpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
|
|
|
class TaskDecomposer:
    """Break a multimodal user goal into web-searchable subtasks via an LLM.

    The context is rendered into a prompt, sent to the module-level ``client``
    as a chat completion, and the reply is decoded as JSON.
    """

    # (context key, label shown in the prompt), in the order they are rendered.
    _MODALITY_LABELS = (
        ("text", "Text"),
        ("image_summary", "Image summary"),
        ("video_summary", "Video summary"),
    )

    def __init__(self):
        """Create a decomposer; no per-instance state is kept."""
        pass

    async def decompose(self, context):
        """Ask the model for the subtasks implied by ``context``.

        Args:
            context: Mapping that may contain ``"text"``, ``"image_summary"``
                and/or ``"video_summary"`` entries; missing keys are skipped.

        Returns:
            The decoded JSON reply (the prompt asks for an array of objects
            with ``query``, ``language`` and ``modality``), or ``[]`` when the
            reply cannot be decoded.
        """
        base_prompt = self._build_prompt(context)

        response = await client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "system",
                    "content": "You are a task planner that breaks down multimodal user goals into executable subtasks.",
                },
                {"role": "user", "content": base_prompt},
            ],
        )

        # ``content`` may be None; _parse_subtasks maps that to [].
        content = response.choices[0].message.content
        return self._parse_subtasks(content)

    def _build_prompt(self, context):
        """Render the modalities present in ``context`` into the user prompt.

        Args:
            context: Mapping with optional ``"text"``, ``"image_summary"`` and
                ``"video_summary"`` entries.

        Returns:
            The instruction text followed by one ``Label: value`` line per
            modality present in ``context``.
        """
        description = [
            f"{label}: {context[key]}"
            for key, label in self._MODALITY_LABELS
            if key in context
        ]
        combined = "\n".join(description)

        # Built from explicit pieces so the prompt text does not depend on how
        # this source file happens to be indented.
        return (
            "\n"
            "Given the following multimodal input, generate a list of clear, "
            "web-searchable subtasks needed to achieve the user's goal. "
            "Output the list in JSON array format, with each item as an object "
            "containing 'query', 'language', and 'modality'.\n"
            "\n"
            f"{combined}\n"
        )

    def _parse_subtasks(self, llm_output):
        """Decode the model reply as JSON, falling back to ``[]``.

        The raw reply is tried first. If that fails and the reply is wrapped
        in a Markdown code fence (for example ```` ```json ... ``` ````), the
        fenced body is tried as well.

        Args:
            llm_output: Reply text from the model; may be ``None``.

        Returns:
            The decoded JSON value, or ``[]`` when nothing could be decoded.
            NOTE(review): a valid non-array reply (e.g. a JSON object) is
            returned as decoded -- callers presumably expect a list; confirm.
        """
        import json

        candidates = [llm_output]
        if isinstance(llm_output, str):
            unfenced = self._strip_code_fence(llm_output)
            if unfenced != llm_output:
                candidates.append(unfenced)

        for candidate in candidates:
            try:
                return json.loads(candidate)
            except (TypeError, ValueError, RecursionError):
                # TypeError: None / non-text input; ValueError: malformed JSON
                # (JSONDecodeError is a subclass); RecursionError: absurdly
                # deep nesting. Best effort: try the next candidate.
                continue
        return []

    @staticmethod
    def _strip_code_fence(text):
        """Return ``text`` without a surrounding Markdown code fence, if any."""
        stripped = text.strip()
        if not stripped.startswith("```"):
            return text
        # Drop the opening fence line, which may carry a language tag ("json").
        _, _, body = stripped.partition("\n")
        body = body.rstrip()
        if body.endswith("```"):
            body = body[:-3]
        return body.strip()
|
|