whyumesh committed
Commit 07de076 · verified · 1 Parent(s): 48da597

Update app.py

Files changed (1):
  app.py  +22 -17
app.py CHANGED
@@ -1,31 +1,32 @@
+import gradio as gr
 import torch
 from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
 from qwen_vl_utils import process_vision_info
 from PIL import Image
 import cv2
 import numpy as np
-import gradio as gr
 import spaces
 
 # Load the model and processor
+@spaces.GPU
 def load_model():
     model = Qwen2VLForConditionalGeneration.from_pretrained(
         "Qwen/Qwen2-VL-2B-Instruct",
         torch_dtype=torch.float16
-    )
+    ).to("cuda")
     processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
     return model, processor
 
 model, processor = load_model()
 
 @spaces.GPU
-def process_image(image):
+def process_image(image, prompt):
     messages = [
         {
             "role": "user",
             "content": [
                 {"type": "image", "image": image},
-                {"type": "text", "text": "Describe this image."},
+                {"type": "text", "text": prompt},
             ],
         }
     ]
@@ -40,7 +41,7 @@ def process_image(image):
         padding=True,
         return_tensors="pt",
     ).to("cuda")
-
+
     with torch.no_grad():
         generated_ids = model.generate(**inputs, max_new_tokens=256)
         generated_ids_trimmed = [
@@ -53,7 +54,7 @@ def process_image(image):
     return output_text[0]
 
 @spaces.GPU
-def process_video(video_path, max_frames=16, frame_interval=30, max_resolution=224):
+def process_video(video_path, prompt, max_frames=16, frame_interval=30, max_resolution=224):
     cap = cv2.VideoCapture(video_path)
     frames = []
     frame_count = 0
@@ -83,7 +84,7 @@ def process_video(video_path, max_frames=16, frame_interval=30, max_resolution=224):
             "role": "user",
             "content": [
                 {"type": "video", "video": frames},
-                {"type": "text", "text": "Describe this video."},
+                {"type": "text", "text": prompt},
             ],
         }
     ]
@@ -99,6 +100,9 @@ def process_video(video_path, max_frames=16, frame_interval=30, max_resolution=224):
         return_tensors="pt",
     ).to("cuda")
 
+    del frames, image_inputs, video_inputs
+    torch.cuda.empty_cache()
+
     with torch.no_grad():
         generated_ids = model.generate(**inputs, max_new_tokens=256)
         generated_ids_trimmed = [
@@ -111,25 +115,26 @@ def process_video(video_path, max_frames=16, frame_interval=30, max_resolution=224):
     return output_text[0]
 
 @spaces.GPU
-def process_content(content):
+def process_content(content, prompt):
     if content is None:
         return "Please upload an image or video file."
 
-    if content.name.lower().endswith(('.png', '.jpg', '.jpeg')):
-        return process_image(Image.open(content.name))
+    if isinstance(content, Image.Image):
+        return process_image(content, prompt)
     elif content.name.lower().endswith(('.mp4', '.avi', '.mov')):
-        return process_video(content.name)
+        return process_video(content.name, prompt)
    else:
         return "Unsupported file type. Please provide an image or video file."
 
-# Gradio interface
 iface = gr.Interface(
     fn=process_content,
-    inputs=gr.File(label="Upload Image or Video"),
+    inputs=[
+        gr.File(label="Upload Image or Video", type="filepath"),
+        gr.Textbox(label="Enter your prompt or task description")
+    ],
     outputs="text",
-    title="Image and Video Description",
-    description="Upload an image or video to get a description.",
+    title="Image and Video Description with Custom Prompt",
+    description="Upload an image or video and specify a task to get a response.",
 )
 
-if __name__ == "__main__":
-    iface.launch()
+iface.launch()
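
A note on the updated dispatch: with type="filepath", recent Gradio versions pass the callback a plain str path, not a PIL.Image or a tempfile wrapper, so isinstance(content, Image.Image) can never be true and content.name would raise AttributeError on a str. A minimal sketch of a path-based version, assuming those Gradio semantics and reusing the extension tuples from the removed branch:

    # Sketch only, not part of this commit: gr.File(type="filepath") hands the
    # callback a str path, so dispatch on the file extension instead of the type.
    def process_content(content, prompt):
        if content is None:
            return "Please upload an image or video file."
        lower = content.lower()
        if lower.endswith(('.png', '.jpg', '.jpeg')):
            return process_image(Image.open(content), prompt)
        elif lower.endswith(('.mp4', '.avi', '.mov')):
            return process_video(content, prompt)
        else:
            return "Unsupported file type. Please provide an image or video file."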