import spaces  # imported early for Hugging Face ZeroGPU support

import base64
import io
import os

import gradio as gr
import torch
from PIL import Image

from utils import check_ocr_box, get_yolo_model, get_caption_model_processor, get_som_labeled_img

from ultralytics import YOLO
from transformers import AutoProcessor, AutoModelForCausalLM

# Alternative loading path via the helpers in utils.py:
# yolo_model = get_yolo_model(model_path='weights/icon_detect/best.pt')
# caption_model_processor = get_caption_model_processor(model_name="florence2", model_name_or_path="weights/icon_caption_florence")

# Icon/region detector (YOLO) and the fine-tuned Florence-2 captioning model.
yolo_model = YOLO('weights/icon_detect/best.pt').to('cuda')
processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "weights/icon_caption_florence", torch_dtype=torch.float16, trust_remote_code=True
).to('cuda')
caption_model_processor = {'processor': processor, 'model': model}
print('Finished loading models.')
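
# A minimal sketch (kept commented out so it does not run at import time) of how the
# Florence-2 captioner can be invoked on a single cropped UI element. The task prompt,
# crop path, and generation settings are illustrative assumptions, not the exact values
# used inside utils.get_som_labeled_img:
#
#   crop = Image.open('imgs/icon_crop.png')  # hypothetical crop of one icon
#   inputs = processor(text="<CAPTION>", images=crop, return_tensors="pt")
#   generated_ids = model.generate(
#       input_ids=inputs["input_ids"].to('cuda'),
#       pixel_values=inputs["pixel_values"].to('cuda', torch.float16),
#       max_new_tokens=32,
#   )
#   caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]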


MARKDOWN = """
# Gain.Energy DrillScribe: Pure Vision-Based GUI Parser for Oil and Gas Applications 🔥

Gain.Energy DrillScribe is a vision-based GUI parser built on Microsoft’s OmniParser and tailored to the oil and gas industry. DrillScribe converts GUI screenshots into structured elements, enabling integration with industry-specific applications such as drilling operations dashboards, well monitoring systems, and safety compliance tools.

### Why DrillScribe?

Gain.Energy has customized the original OmniParser framework to cater to the unique challenges and workflows in the oil and gas industry. DrillScribe focuses on precision, adaptability, and scalability to meet the demands of engineers and analysts.

### Key Features

- **Oil and Gas Specific Workflows**: Parses GUI interfaces of industry-standard tools and software, enabling structured data extraction for tasks like well monitoring, equipment diagnostics, and safety compliance (see the output sketch after this list).
- **Custom Training**: Fine-tuned with oil and gas-specific GUI data, including operational dashboards, engineering applications, and regulatory systems.
- **Integrated with DrillOps AI**: Designed to integrate seamlessly with Gain.Energy’s DrillOps AI system for automated recommendations and alerts.
- **High Accuracy and Adaptability**: Enhanced to handle diverse layouts, including multilingual and multi-platform interfaces common in global energy operations.
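
For illustration, here is the kind of structured output the parser produces. The exact field layout below is an assumption for readability; the demo itself returns a labeled image, a newline-separated list of parsed element descriptions, and their normalized coordinates:

```python
# Hypothetical shape of one parsed element (illustrative only):
element = {
    "id": 17,
    "type": "icon",                     # 'text' (from OCR) or 'icon' (from detection)
    "caption": "start pump button",     # Florence-2 caption or OCR text
    "bbox": [0.62, 0.18, 0.68, 0.22],   # xyxy coordinates, normalized to image size
}
```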

DrillScribe’s customization for the oil and gas domain spans its training data, fine-tuning approach, and enhanced parsing features, described below.

### Data and Training

1. **Industry-Specific Screenshots**:
    - Dashboards for drilling operations and safety monitoring.
    - Regulatory compliance GUIs.
    - Engineering and equipment monitoring systems.
2. **Preprocessing and Fine-Tuning**:
    - Data cleaning to ensure structured, high-quality inputs.
    - Augmented datasets with variations in screen resolutions, themes, and languages.
3. **Fine-Tuning Techniques** (see the LoRA sketch after this list):
    - **Low-Rank Adaptation (LoRA)**: Efficient fine-tuning for GUI elements common to oil and gas applications.
    - **Retrieval-Augmented Generation (RAG)**: Incorporates domain-specific knowledge bases for context-aware parsing.
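
A minimal sketch of a LoRA setup, assuming the `peft` library; the target module names and hyperparameters are illustrative assumptions, not the exact training recipe (the RAG component is not sketched here):

```python
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained(
    "microsoft/Florence-2-base", trust_remote_code=True
)
lora_config = LoraConfig(
    r=16,                                 # low-rank adapter dimension (assumed)
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # attention projections (assumed names)
    lora_dropout=0.05,
)
peft_model = get_peft_model(base, lora_config)
peft_model.print_trainable_parameters()   # only the small adapter matrices are trained
```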

### Enhanced Features

- Handles complex nested structures in dashboards.
- Extracts dynamic, real-time data from GUI components.
- Integrates with legacy systems for easy adoption in enterprise environments.

### Applications

- **Well Monitoring Systems**: Extract real-time data from GUI dashboards for well pressure, mud weight, and safety metrics (see the sketch after this list).
- **Regulatory Compliance**: Parse regulatory tools to streamline audits and documentation processes.
- **Equipment Diagnostics**: Monitor equipment dashboards and alert systems for performance optimization.
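
As an illustration of wiring the parser into a monitoring workflow, a minimal sketch that scans the parsed element descriptions for domain keywords; the keyword list and line format are assumptions for the example:

```python
# Hypothetical post-processing of the parsed element text returned by the demo:
KEYWORDS = ("pressure", "mud weight", "alarm")

def flag_monitoring_elements(parsed_text: str) -> list[str]:
    """Return the parsed element lines that mention a monitored quantity."""
    return [
        line for line in parsed_text.splitlines()
        if any(kw in line.lower() for kw in KEYWORDS)
    ]
```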

### Performance

DrillScribe achieves state-of-the-art performance in vision-based GUI parsing tasks for the oil and gas domain:
- **Parsing Accuracy**: 93% for complex, domain-specific GUIs.
- **Processing Speed**: Up to 15 GUI screens per second on NVIDIA A100 GPUs.
- **Domain Adaptability**: Successfully tested on over 50 different oil and gas platforms.
"""

@spaces.GPU
@torch.inference_mode()
def process(
    image_input: Image.Image,
    box_threshold: float,
    iou_threshold: float,
) -> tuple[Image.Image, str, str]:

    # Save the upload to disk because the downstream utils expect a file path.
    image_save_path = 'imgs/saved_image_demo.png'
    os.makedirs(os.path.dirname(image_save_path), exist_ok=True)
    image_input.save(image_save_path)
    image = Image.open(image_save_path)

    # Scale the overlay drawing parameters relative to a 3200px reference width
    # so labels stay legible on both small and large screenshots.
    box_overlay_ratio = image.size[0] / 3200
    draw_bbox_config = {
        'text_scale': 0.8 * box_overlay_ratio,
        'text_thickness': max(int(2 * box_overlay_ratio), 1),
        'text_padding': max(int(3 * box_overlay_ratio), 1),
        'thickness': max(int(3 * box_overlay_ratio), 1),
    }

    # Run OCR first; detected icons are then merged with the OCR boxes.
    ocr_bbox_rslt, is_goal_filtered = check_ocr_box(
        image_save_path,
        display_img=False,
        output_bb_format='xyxy',
        goal_filtering=None,
        easyocr_args={'paragraph': False, 'text_threshold': 0.9},
        use_paddleocr=True,
    )
    text, ocr_bbox = ocr_bbox_rslt

    # Produce the labeled image plus parsed elements and their coordinates.
    dino_labeled_img, label_coordinates, parsed_content_list = get_som_labeled_img(
        image_save_path,
        yolo_model,
        BOX_TRESHOLD=box_threshold,  # parameter name (including the typo) as defined in utils
        output_coord_in_ratio=True,
        ocr_bbox=ocr_bbox,
        draw_bbox_config=draw_bbox_config,
        caption_model_processor=caption_model_processor,
        ocr_text=text,
        iou_threshold=iou_threshold,
    )
    image = Image.open(io.BytesIO(base64.b64decode(dino_labeled_img)))
    print('Finished processing.')
    return image, '\n'.join(parsed_content_list), str(label_coordinates)
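
# A minimal sketch (commented out so the Space still launches normally) of calling
# process() directly, e.g. for local debugging. The screenshot path is a hypothetical
# example; the thresholds mirror the UI defaults below:
#
#   screenshot = Image.open('imgs/example_dashboard.png')
#   labeled, elements, coords = process(screenshot, box_threshold=0.05, iou_threshold=0.1)
#   print(elements)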



with gr.Blocks() as demo:
    gr.Markdown(MARKDOWN)
    with gr.Row():
        with gr.Column():
            image_input_component = gr.Image(
                type='pil', label='Upload image')
            # Threshold below which low-confidence detection boxes are discarded (default 0.05).
            box_threshold_component = gr.Slider(
                label='Box Threshold', minimum=0.01, maximum=1.0, step=0.01, value=0.05)
            # IOU threshold above which overlapping boxes are suppressed (default 0.1).
            iou_threshold_component = gr.Slider(
                label='IOU Threshold', minimum=0.01, maximum=1.0, step=0.01, value=0.1)
            submit_button_component = gr.Button(
                value='Submit', variant='primary')
        with gr.Column():
            image_output_component = gr.Image(type='pil', label='Image Output')
            text_output_component = gr.Textbox(label='Parsed screen elements', placeholder='Text Output')
            coordinates_output_component = gr.Textbox(label='Coordinates', placeholder='Coordinates Output')

    submit_button_component.click(
        fn=process,
        inputs=[
            image_input_component,
            box_threshold_component,
            iou_threshold_component
        ],
        outputs=[image_output_component, text_output_component, coordinates_output_component]
    )

# demo.launch(debug=False, show_error=True, share=True)
# demo.launch(share=True, server_port=7861, server_name='0.0.0.0')
demo.queue().launch(share=False)