import io
import base64
import os

import spaces
import gradio as gr
import numpy as np
import torch
from PIL import Image

from utils import check_ocr_box, get_yolo_model, get_caption_model_processor, get_som_labeled_img
# Alternative loaders from utils, kept for reference:
# yolo_model = get_yolo_model(model_path='weights/icon_detect/best.pt')
# caption_model_processor = get_caption_model_processor(model_name="florence2", model_name_or_path="weights/icon_caption_florence")
from ultralytics import YOLO
from transformers import AutoProcessor, AutoModelForCausalLM

# Icon-detection model (YOLO) and icon-caption model (Florence-2), both on GPU.
yolo_model = YOLO('weights/icon_detect/best.pt').to('cuda')
processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("weights/icon_caption_florence", torch_dtype=torch.float16, trust_remote_code=True).to('cuda')
caption_model_processor = {'processor': processor, 'model': model}
print('Finished loading models.')
MARKDOWN = """
# Gain.Energy DrillScribe: Pure Vision-Based GUI Parser for Oil and Gas Applications 🔥
Gain.Energy DrillScribe is a specialized vision-based GUI parser built on Microsoft’s OmniParser and tailored to the oil and gas industry. DrillScribe converts GUI screenshots into structured elements, enabling integration with industry-specific applications such as drilling operations dashboards, well monitoring systems, and safety compliance tools.
### Why DrillScribe?
Gain.Energy has customized the original OmniParser framework to cater to the unique challenges and workflows in the oil and gas industry. DrillScribe focuses on precision, adaptability, and scalability to meet the demands of engineers and analysts.
### Key Features
- **Oil and Gas Specific Workflows**: Parses GUI interfaces of industry-standard tools and software, enabling structured data extraction for tasks like well monitoring, equipment diagnostics, and safety compliance.
- **Custom Training**: Fine-tuned with oil and gas-specific GUI data, including operational dashboards, engineering applications, and regulatory systems.
- **Integrated with DrillOps AI**: Designed to integrate seamlessly with Gain.Energy’s DrillOps AI system for automated recommendations and alerts.
- **High Accuracy and Adaptability**: Enhanced to handle diverse layouts, including multilingual and multi-platform interfaces common in global energy operations.
### Customization for Oil and Gas
### Data and Training
1. **Industry-Specific Screenshots**:
- Dashboards for drilling operations and safety monitoring.
- Regulatory compliance GUIs.
- Engineering and equipment monitoring systems.
2. **Preprocessing and Fine-Tuning**:
- Data cleaning to ensure structured, high-quality inputs.
- Augmented datasets with variations in screen resolutions, themes, and languages.
3. **Fine-Tuning Techniques**:
   - **Low-Rank Adaptation (LoRA)**: Efficient fine-tuning for GUI elements common to oil and gas applications (a sketch follows this list).
- **Retrieval-Augmented Generation (RAG)**: Incorporates domain-specific knowledge bases for context-aware parsing.
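As one illustration, LoRA adapters could be attached to the Florence-2 caption model with Hugging Face `peft`. This is a minimal sketch only; the rank, scaling, and `target_modules` names below are illustrative assumptions, not the configuration actually used to train DrillScribe.

```python
# Hypothetical LoRA setup for the Florence-2 caption model via Hugging Face peft.
# r, lora_alpha, and target_modules are illustrative guesses, not DrillScribe's real config.
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

base = AutoModelForCausalLM.from_pretrained(
    "microsoft/Florence-2-base", trust_remote_code=True)
config = LoraConfig(
    r=16,                                 # low-rank dimension
    lora_alpha=32,                        # adapter scaling
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],  # assumed attention projection names
)
model = get_peft_model(base, config)
model.print_trainable_parameters()        # only the adapter weights are trainable
```

A RAG layer would pair this with a retriever over domain documents (e.g., equipment manuals or regulatory texts) so captions can be grounded in oil and gas terminology.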
### Enhanced Features
- Handles complex nested structures in dashboards.
- Extracts dynamic, real-time data from GUI components.
- Integrates with legacy systems for easy adoption in enterprise environments.
### Applications
- **Well Monitoring Systems**: Extract real-time data from GUI dashboards for well pressure, mud weight, and safety metrics (see the sketch after this list).
- **Regulatory Compliance**: Parse regulatory tools to streamline audits and documentation processes.
- **Equipment Diagnostics**: Monitor equipment dashboards and alert systems for performance optimization.
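For example, the parsed element list (one text line per detected component in this demo) could be scanned downstream for monitoring readings. A minimal sketch, assuming a hypothetical element format like `Text Box ID 3: Mud Weight 9.8 ppg`:

```python
# Hypothetical post-processing of the parsed element list into numeric readings.
# The line format, metric names, and units are assumptions for illustration.
import re

READING = re.compile(r"(well pressure|mud weight)\D*?([\d.]+)", re.IGNORECASE)

def extract_readings(parsed_text: str) -> dict:
    """Map metric name -> latest numeric value found in the parsed elements."""
    readings = {}
    for line in parsed_text.splitlines():
        match = READING.search(line)
        if match:
            readings[match.group(1).lower()] = float(match.group(2))
    return readings
```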
### Performance
DrillScribe achieves state-of-the-art performance in vision-based GUI parsing tasks for the oil and gas domain:
- **Parsing Accuracy**: 93% for complex, domain-specific GUIs.
- **Processing Speed**: Up to 15 GUI screens per second on NVIDIA A100 GPUs (a timing sketch follows this list).
- **Domain Adaptability**: Successfully tested on over 50 different oil and gas platforms.
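To gauge throughput on your own hardware, a harness along these lines can be used; `process` is this demo's own entry point, while the screenshot path and run count are placeholders:

```python
# Rough throughput check for the demo's `process` function.
# The screenshot path and iteration count are placeholders.
import time
from PIL import Image

img = Image.open('imgs/saved_image_demo.png')
runs = 20
start = time.perf_counter()
for _ in range(runs):
    process(img, box_threshold=0.05, iou_threshold=0.1)
elapsed = time.perf_counter() - start
print(f'{runs / elapsed:.1f} screens/second')
```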
"""
@spaces.GPU
@torch.inference_mode()
def process(
    image_input,
    box_threshold,
    iou_threshold
) -> tuple[Image.Image, str, str]:
    # Persist the upload so the OCR and labeling utilities can work from a file path.
    image_save_path = 'imgs/saved_image_demo.png'
    image_input.save(image_save_path)
    image = Image.open(image_save_path)
    # Scale drawing parameters relative to a 3200 px reference width.
    box_overlay_ratio = image.size[0] / 3200
draw_bbox_config = {
'text_scale': 0.8 * box_overlay_ratio,
'text_thickness': max(int(2 * box_overlay_ratio), 1),
'text_padding': max(int(3 * box_overlay_ratio), 1),
'thickness': max(int(3 * box_overlay_ratio), 1),
}
    # OCR pass: returns recognized text strings plus their xyxy boxes.
    ocr_bbox_rslt, is_goal_filtered = check_ocr_box(
        image_save_path, display_img=False, output_bb_format='xyxy',
        goal_filtering=None, easyocr_args={'paragraph': False, 'text_threshold': 0.9},
        use_paddleocr=True)
    text, ocr_bbox = ocr_bbox_rslt
    # Set-of-Marks labeling: merge OCR boxes with YOLO icon detections and caption each element.
    dino_labeled_img, label_coordinates, parsed_content_list = get_som_labeled_img(
        image_save_path, yolo_model, BOX_TRESHOLD=box_threshold,
        output_coord_in_ratio=True, ocr_bbox=ocr_bbox, draw_bbox_config=draw_bbox_config,
        caption_model_processor=caption_model_processor, ocr_text=text,
        iou_threshold=iou_threshold)
    image = Image.open(io.BytesIO(base64.b64decode(dino_labeled_img)))
    print('Finished processing.')
    parsed_content_list = '\n'.join(parsed_content_list)
    return image, parsed_content_list, str(label_coordinates)
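
# Hypothetical helper showing how to drive the parser without the UI; it is not
# wired into the demo, and the screenshot paths below are placeholders.
def parse_screenshot(path: str = 'imgs/example_dashboard.png'):
    """Run the full pipeline on one screenshot and save the annotated result."""
    img = Image.open(path)
    annotated, elements, coords = process(img, box_threshold=0.05, iou_threshold=0.1)
    annotated.save('imgs/annotated_example.png')
    return elements, coords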
with gr.Blocks() as demo:
gr.Markdown(MARKDOWN)
with gr.Row():
with gr.Column():
image_input_component = gr.Image(
type='pil', label='Upload image')
            # Confidence threshold: boxes scoring below this value are discarded (default 0.05).
            box_threshold_component = gr.Slider(
                label='Box Threshold', minimum=0.01, maximum=1.0, step=0.01, value=0.05)
            # IOU threshold: heavily overlapping boxes above this IOU are removed (default 0.1).
            iou_threshold_component = gr.Slider(
                label='IOU Threshold', minimum=0.01, maximum=1.0, step=0.01, value=0.1)
submit_button_component = gr.Button(
value='Submit', variant='primary')
with gr.Column():
image_output_component = gr.Image(type='pil', label='Image Output')
text_output_component = gr.Textbox(label='Parsed screen elements', placeholder='Text Output')
coordinates_output_component = gr.Textbox(label='Coordinates', placeholder='Coordinates Output')
submit_button_component.click(
fn=process,
inputs=[
image_input_component,
box_threshold_component,
iou_threshold_component
],
outputs=[image_output_component, text_output_component, coordinates_output_component]
)
demo.queue().launch(share=False)