File size: 1,590 Bytes
21f9bfa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# modules/task_decomposer.py
import os
from openai import AsyncOpenAI

# Module-level async OpenAI client, created once at import time. The API key
# is looked up in the OPENAI_API_KEY environment variable (None when unset).
client = AsyncOpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

class TaskDecomposer:
    """Break a multimodal user goal into web-searchable subtasks via a chat model.

    The chat completion is requested through the module-level ``client``.
    """

    # Chat model used when no explicit model is passed to the constructor.
    model = "gpt-4o"

    # (context key, label used in the prompt) pairs, in prompt order.
    _CONTEXT_FIELDS = (
        ("text", "Text"),
        ("image_summary", "Image summary"),
        ("video_summary", "Video summary"),
    )

    def __init__(self, model=None):
        """Create a decomposer.

        Args:
            model: Optional chat model name; when omitted the class default
                (``"gpt-4o"``) is used.
        """
        if model is not None:
            self.model = model

    async def decompose(self, context):
        """Ask the chat model for subtasks describing how to reach the goal.

        Args:
            context: Mapping that may contain the keys ``"text"``,
                ``"image_summary"`` and ``"video_summary"``.

        Returns:
            The list decoded from the model's JSON reply, or ``[]`` when the
            reply is missing or is not a JSON array.
        """
        base_prompt = self._build_prompt(context)

        response = await client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": "You are a task planner that breaks down multimodal user goals into executable subtasks."},
                {"role": "user", "content": base_prompt}
            ]
        )

        # Guard against a reply without any choices instead of raising IndexError.
        if not response.choices:
            return []

        content = response.choices[0].message.content
        return self._parse_subtasks(content)

    def _build_prompt(self, context):
        """Return the user prompt listing whichever modalities ``context`` has."""
        description = [
            f"{label}: {context[key]}"
            for key, label in self._CONTEXT_FIELDS
            if key in context
        ]

        combined = "\n".join(description)
        prompt = f"""
Given the following multimodal input, generate a list of clear, web-searchable subtasks needed to achieve the user's goal. Output the list in JSON array format, with each item as an object containing 'query', 'language', and 'modality'.

{combined}
"""
        return prompt

    @staticmethod
    def _strip_code_fence(text):
        """Remove a surrounding Markdown code fence (``` or ```json) if present."""
        stripped = text.strip()
        if not stripped.startswith("```"):
            return stripped
        # Drop the opening fence line, which may carry a language tag.
        _, _, remainder = stripped.partition("\n")
        remainder = remainder.rstrip()
        if remainder.endswith("```"):
            remainder = remainder[:-3]
        return remainder.strip()

    def _parse_subtasks(self, llm_output):
        """Decode the model reply into a list of subtasks.

        Args:
            llm_output: Raw reply text from the model; may be ``None``.

        Returns:
            The decoded JSON array, or ``[]`` when the reply is missing, is
            not valid JSON, or decodes to something other than a list.
        """
        import json

        # Replies are often wrapped in a Markdown fence, which json.loads rejects.
        if isinstance(llm_output, str):
            llm_output = self._strip_code_fence(llm_output)

        try:
            parsed = json.loads(llm_output)
        except (TypeError, ValueError):
            # TypeError: non-text input such as None.
            # ValueError: malformed JSON (json.JSONDecodeError is a subclass).
            return []

        # The prompt asks for a JSON array; anything else is treated as no subtasks.
        return parsed if isinstance(parsed, list) else []