computer_use_ootb / install_tools /test_ui-tars_server.py
baqr's picture
Upload folder using huggingface_hub
d73c58e verified
from openai import OpenAI
from computer_use_demo.gui_agent.llm_utils.oai import encode_image
_NAV_SYSTEM_GROUNDING = """
You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
## Output Format
```Action: ...```
## Action Space
click(start_box='<|box_start|>(x1,y1)<|box_end|>')
hotkey(key='')
type(content='') #If you want to submit your input, use \"\" at the end of `content`.
scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
wait() #Sleep for 5s and take a screenshot to check for any changes.
finished()
call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.
## Note
- Do not generate any other text.
"""
def get_prompt_grounding(task):
return f"""{task}"""
task = """
```json
{{ "Observation": "I am on the google homepage of the Chrome browser.",
"Thinking": "The user wants to buy a lap-top on Amazon.com, so I need to click on the address (search) bar of Chrome for entering the 'Amazon.com'.",
"Next Action": ["I need to click DSML"],
"Expectation": "The search button is activated after being clicked, ready to input."
}}```
"""
task = """
```json
{{
"Observation": "I am on the google homepage of the Chrome browser.",
"Thinking": "The user wants to click DSML",
"Next Action": ["I need to click DSML"],
}}```
"""
task = """
```json
{{
"Observation": "I am on the google homepage of the Chrome browser.",
"Thinking": "The user wants to click Youtube",
"Next Action": ["I need to click Youtube"],
}}```
"""
if __name__ == "__main__":
ui_tars_url = "https://your_api_to_uitars.com/v1"
ui_tars_client = OpenAI(base_url=ui_tars_url, api_key="")
grounding_system_prompt = _NAV_SYSTEM_GROUNDING.format()
screenshot_base64 = encode_image("./chrome.png")
prompted_message = get_prompt_grounding(task)
print(f"grounding_system_prompt, {grounding_system_prompt}, \
prompted_message: {prompted_message}")
response = ui_tars_client.chat.completions.create(
model="ui-tars",
messages=[
{"role": "user", "content": grounding_system_prompt},
{"role": "user", "content": [
{"type": "text", "text": prompted_message},
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{screenshot_base64}"}}
]
},
],
max_tokens=128,
temperature=0
)
ui_tars_action = response.choices[0].message.content
print(response.choices[0].message.content)