阳渠 committed on
Commit
1a06ab4
·
1 Parent(s): 9a3f6f1
Files changed (2) hide show
  1. app.py +7 -4
  2. requirements.txt +8 -1
app.py CHANGED
@@ -12,11 +12,12 @@ import gradio as gr
12
  from datetime import datetime
13
  from modelscope.pipelines import pipeline
14
  from modelscope import snapshot_download
 
15
  from PIL import Image, ImageDraw, ImageFont
16
 
17
  from PCAgent.api import inference_chat
18
  from PCAgent.icon_localization import det
19
- from PCAgent.text_localization import ocr
20
  from PCAgent.prompt_qwen import get_subtask_prompt as get_subtask_prompt
21
  from PCAgent.chat import init_action_chat, init_memory_chat, add_response
22
  from PCAgent.prompt_qwen import get_action_prompt, get_process_prompt, get_memory_prompt
@@ -26,8 +27,10 @@ vl_model_version = os.environ.get('vl_model_version')
26
  llm_model_version = os.environ.get('llm_model_version')
27
  API_url = os.environ.get('API_url')
28
  token = os.environ.get('token')
29
- os.environ["OCR_ACCESS_KEY_ID"] = os.environ.get('OCR_ACCESS_KEY_ID')
30
- os.environ["OCR_ACCESS_KEY_SECRET"] = os.environ.get('OCR_ACCESS_KEY_SECRET')
 
 
31
  tff_file = os.environ.get('tff_file')
32
  radius = 100
33
 
@@ -127,7 +130,7 @@ def get_perception_infos(screenshot_file, screenshot_som_file, font_path):
127
 
128
  for i, img in enumerate(img_list):
129
  width, height = Image.open(img).size
130
- sub_text, sub_coordinates = ocr(img) # for api
131
  for coordinate in sub_coordinates:
132
  coordinate[0] = int(max(0, img_x_list[i] + coordinate[0] - padding))
133
  coordinate[2] = int(min(total_width, img_x_list[i] + coordinate[2] + padding))
 
12
  from datetime import datetime
13
  from modelscope.pipelines import pipeline
14
  from modelscope import snapshot_download
15
+ from modelscope.utils.constant import Tasks
16
  from PIL import Image, ImageDraw, ImageFont
17
 
18
  from PCAgent.api import inference_chat
19
  from PCAgent.icon_localization import det
20
+ from PCAgent.text_localization_old import ocr
21
  from PCAgent.prompt_qwen import get_subtask_prompt as get_subtask_prompt
22
  from PCAgent.chat import init_action_chat, init_memory_chat, add_response
23
  from PCAgent.prompt_qwen import get_action_prompt, get_process_prompt, get_memory_prompt
 
27
  llm_model_version = os.environ.get('llm_model_version')
28
  API_url = os.environ.get('API_url')
29
  token = os.environ.get('token')
30
+ # os.environ["OCR_ACCESS_KEY_ID"] = os.environ.get('OCR_ACCESS_KEY_ID')
31
+ # os.environ["OCR_ACCESS_KEY_SECRET"] = os.environ.get('OCR_ACCESS_KEY_SECRET')
32
+ ocr_detection = pipeline(Tasks.ocr_detection, model='damo/cv_resnet18_ocr-detection-line-level_damo')
33
+ ocr_recognition = pipeline(Tasks.ocr_recognition, model='damo/cv_convnextTiny_ocr-recognition-document_damo')
34
  tff_file = os.environ.get('tff_file')
35
  radius = 100
36
 
 
130
 
131
  for i, img in enumerate(img_list):
132
  width, height = Image.open(img).size
133
+ sub_text, sub_coordinates = ocr(img, ocr_detection, ocr_recognition) # for api
134
  for coordinate in sub_coordinates:
135
  coordinate[0] = int(max(0, img_x_list[i] + coordinate[0] - padding))
136
  coordinate[2] = int(min(total_width, img_x_list[i] + coordinate[2] + padding))
requirements.txt CHANGED
@@ -11,4 +11,11 @@ transformers
11
  torchvision
12
  pycocotools
13
  timm
14
- termcolor
 
 
 
 
 
 
 
 
11
  torchvision
12
  pycocotools
13
  timm
14
+ termcolor
15
+ TensorFlow==2.9.1
16
+ keras==2.9.0
17
+ SentencePiece
18
+ tf_slim
19
+ tf_keras==2.15.0
20
+ pyclipper
21
+ numpy==1.26.4