prithivMLmods committed
Commit ebca0ae · verified · 1 Parent(s): c8414ba

Update app.py

Files changed (1): app.py (+57 -21)
app.py CHANGED
@@ -1,13 +1,16 @@
 import gradio as gr
+import spaces
 from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, TextIteratorStreamer
 from transformers.image_utils import load_image
 from threading import Thread
 import time
 import torch
-import spaces
+from PIL import Image
+import uuid
+import io
 
 # Fine-tuned for OCR-based tasks from Qwen's [ Qwen/Qwen2-VL-2B-Instruct ]
-MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
+MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model = Qwen2VLForConditionalGeneration.from_pretrained(
     MODEL_ID,
@@ -15,31 +18,65 @@ model = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to("cuda").eval()
 
+# Supported media extensions
+image_extensions = Image.registered_extensions()
+video_extensions = ("avi", "mp4", "mov", "mkv", "flv", "wmv", "mjpeg", "wav", "gif", "webm", "m4v", "3gp")
+
+def identify_and_save_blob(blob_path):
+    """Identifies if the blob is an image or video and saves it accordingly."""
+    try:
+        with open(blob_path, 'rb') as file:
+            blob_content = file.read()
+
+        # Try to identify if it's an image
+        try:
+            Image.open(io.BytesIO(blob_content)).verify()  # Check if it's a valid image
+            extension = ".png"  # Default to PNG for saving
+            media_type = "image"
+        except (IOError, SyntaxError):
+            # If it's not a valid image, assume it's a video
+            extension = ".mp4"  # Default to MP4 for saving
+            media_type = "video"
+
+        # Create a unique filename
+        filename = f"temp_{uuid.uuid4()}_media{extension}"
+        with open(filename, "wb") as f:
+            f.write(blob_content)
+
+        return filename, media_type
+
+    except FileNotFoundError:
+        raise ValueError(f"The file {blob_path} was not found.")
+    except Exception as e:
+        raise ValueError(f"An error occurred while processing the file: {e}")
+
 @spaces.GPU
 def model_inference(input_dict, history):
     text = input_dict["text"]
     files = input_dict["files"]
 
-    # Load images if provided
-    if len(files) > 1:
-        images = [load_image(image) for image in files if image.endswith(('png', 'jpg', 'jpeg'))]
-        videos = [video for video in files if video.endswith(('mp4', 'avi', 'mov'))]
-    elif len(files) == 1:
-        if files[0].endswith(('png', 'jpg', 'jpeg')):
-            images = [load_image(files[0])]
-            videos = []
+    # Process media files (images or videos)
+    media_paths = []
+    media_types = []
+    for file in files:
+        if file.endswith(tuple([i for i, f in image_extensions.items()])):
+            media_type = "image"
+        elif file.endswith(video_extensions):
+            media_type = "video"
         else:
-            images = []
-            videos = [files[0]]
-    else:
-        images = []
-        videos = []
+            try:
+                file, media_type = identify_and_save_blob(file)
+            except Exception as e:
+                gr.Error(f"Unsupported media type: {e}")
+                return
+        media_paths.append(file)
+        media_types.append(media_type)
 
     # Validate input
-    if text == "" and not images and not videos:
+    if text == "" and not media_paths:
         gr.Error("Please input a query and optionally image(s) or video(s).")
         return
-    if text == "" and (images or videos):
+    if text == "" and media_paths:
         gr.Error("Please input a text query along with the image(s) or video(s).")
         return
 
@@ -48,8 +85,7 @@ def model_inference(input_dict, history):
         {
             "role": "user",
             "content": [
-                *[{"type": "image", "image": image} for image in images],
-                *[{"type": "video", "video": video} for video in videos],
+                *[{"type": media_type, media_type: media_path} for media_path, media_type in zip(media_paths, media_types)],
                 {"type": "text", "text": text},
             ],
         }
@@ -59,8 +95,8 @@ def model_inference(input_dict, history):
     prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
         text=[prompt],
-        images=images if images else None,
-        videos=videos if videos else None,
+        images=[load_image(path) for path, media_type in zip(media_paths, media_types) if media_type == "image"],
+        videos=[path for path, media_type in zip(media_paths, media_types) if media_type == "video"],
         return_tensors="pt",
         padding=True,
     ).to("cuda")
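
To sanity-check the routing this commit introduces without loading the model, the sketch below may help (it is an illustration, not part of the commit; `classify` and `sniff_blob` are hypothetical helpers that mirror the committed logic, and only Pillow is required). Two details worth noting: `Image.registered_extensions()` returns a dict whose keys already include the leading dot (e.g. ".png"), so `tuple([i for i, f in image_extensions.items()])` is equivalent to `tuple(image_extensions)`; and because `str.endswith` does plain suffix matching, the bare entries in `video_extensions` ("mp4" rather than ".mp4") still match ordinary filenames.

# Offline sketch of the commit's media-routing logic (illustrative only).
# Assumes Pillow is installed; no GPU, model, or Gradio needed.
import io
from PIL import Image

image_extensions = Image.registered_extensions()  # dict: {".png": "PNG", ".jpg": "JPEG", ...}
video_extensions = ("avi", "mp4", "mov", "mkv", "flv", "wmv",
                    "mjpeg", "wav", "gif", "webm", "m4v", "3gp")

def classify(path):
    # Mirrors the extension checks at the top of model_inference.
    if path.endswith(tuple(image_extensions)):   # keys already carry the dot
        return "image"
    if path.endswith(video_extensions):          # bare suffixes still match via endswith
        return "video"
    return "blob"                                # would fall through to identify_and_save_blob

def sniff_blob(blob_content):
    # Mirrors identify_and_save_blob's PIL check, without the re-save to disk.
    try:
        Image.open(io.BytesIO(blob_content)).verify()
        return "image"
    except (IOError, SyntaxError):
        return "video"                           # anything PIL cannot parse is assumed video

assert classify("scan.png") == "image"
assert classify("clip.mp4") == "video"
assert classify("gradio_tmp_upload") == "blob"   # hypothetical extensionless temp path

buf = io.BytesIO()
Image.new("RGB", (8, 8)).save(buf, format="PNG") # tiny in-memory PNG
assert sniff_blob(buf.getvalue()) == "image"
assert sniff_blob(b"\x00\x01 definitely not an image") == "video"
print("routing checks passed")

The blob fallback presumably exists for uploads whose temp path carries no recognizable extension: `identify_and_save_blob` re-saves such files under a usable name, defaulting to .png for anything PIL can parse and .mp4 otherwise. Note also that "wav" (an audio container) appears in the committed `video_extensions` tuple, so .wav uploads are routed down the video path as written.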