whyumesh committed
Commit 48da597 · verified · 1 Parent(s): cb893dd

Update app.py

Files changed (1):
  1. app.py +108 -125
app.py CHANGED
@@ -5,147 +5,130 @@ from PIL import Image
 import cv2
 import numpy as np
 import gradio as gr
-
-# Check GPU availability
-if not torch.cuda.is_available():
-    raise RuntimeError("This application requires a GPU to run. No GPU detected.")
+import spaces
 
 # Load the model and processor
 def load_model():
-    try:
-        model = Qwen2VLForConditionalGeneration.from_pretrained(
-            "Qwen/Qwen2-VL-2B-Instruct",
-            torch_dtype=torch.float16  # Use float16 for GPU
-        ).to("cuda")  # Explicitly use CUDA
-        processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
-        return model, processor
-    except RuntimeError as e:
-        print(f"Error loading model: {e}")
-        raise
-
-try:
-    model, processor = load_model()
-except Exception as e:
-    print(f"Failed to load model: {e}")
-    raise
+    model = Qwen2VLForConditionalGeneration.from_pretrained(
+        "Qwen/Qwen2-VL-2B-Instruct",
+        torch_dtype=torch.float16
+    )
+    processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
+    return model, processor
+
+model, processor = load_model()
 
-def process_image(image):
-    try:
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "image", "image": image},
-                    {"type": "text", "text": "Describe this image."},
-                ],
-            }
-        ]
-
-        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        image_inputs, video_inputs = process_vision_info(messages)
-
-        inputs = processor(
-            text=[text],
-            images=image_inputs,
-            videos=video_inputs,
-            padding=True,
-            return_tensors="pt",
-        ).to("cuda")  # Explicitly use CUDA
-
-        with torch.no_grad():
-            generated_ids = model.generate(**inputs, max_new_tokens=256)
-            generated_ids_trimmed = [
-                out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-            ]
-            output_text = processor.batch_decode(
-                generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-            )
-
-        return output_text[0]
-    except Exception as e:
-        return f"An error occurred while processing the image: {str(e)}"
+@spaces.GPU
+def process_image(image):
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": "Describe this image."},
+            ],
+        }
+    ]
+
+    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    image_inputs, video_inputs = process_vision_info(messages)
+
+    inputs = processor(
+        text=[text],
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
+    ).to("cuda")
+
+    with torch.no_grad():
+        generated_ids = model.generate(**inputs, max_new_tokens=256)
+        generated_ids_trimmed = [
+            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        output_text = processor.batch_decode(
+            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
+
+    return output_text[0]
 
+@spaces.GPU
 def process_video(video_path, max_frames=16, frame_interval=30, max_resolution=224):
-    try:
-        cap = cv2.VideoCapture(video_path)
-        frames = []
-        frame_count = 0
-
-        while len(frames) < max_frames:
-            ret, frame = cap.read()
-            if not ret:
-                break
-
-            if frame_count % frame_interval == 0:
-                h, w = frame.shape[:2]
-                if h > w:
-                    new_h, new_w = max_resolution, int(w * max_resolution / h)
-                else:
-                    new_h, new_w = int(h * max_resolution / w), max_resolution
-                frame = cv2.resize(frame, (new_w, new_h))
-                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-                frame = Image.fromarray(frame)
-                frames.append(frame)
-
-            frame_count += 1
-
-        cap.release()
-
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "video", "video": frames},
-                    {"type": "text", "text": "Describe this video."},
-                ],
-            }
-        ]
-
-        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        image_inputs, video_inputs = process_vision_info(messages)
-
-        inputs = processor(
-            text=[text],
-            images=image_inputs,
-            videos=video_inputs,
-            padding=True,
-            return_tensors="pt",
-        ).to("cuda")  # Explicitly use CUDA
-
-        with torch.no_grad():
-            generated_ids = model.generate(**inputs, max_new_tokens=256)
-            generated_ids_trimmed = [
-                out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-            ]
-            output_text = processor.batch_decode(
-                generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-            )
-
-        return output_text[0]
-    except Exception as e:
-        return f"An error occurred while processing the video: {str(e)}"
-
+    cap = cv2.VideoCapture(video_path)
+    frames = []
+    frame_count = 0
+
+    while len(frames) < max_frames:
+        ret, frame = cap.read()
+        if not ret:
+            break
+
+        if frame_count % frame_interval == 0:
+            h, w = frame.shape[:2]
+            if h > w:
+                new_h, new_w = max_resolution, int(w * max_resolution / h)
+            else:
+                new_h, new_w = int(h * max_resolution / w), max_resolution
+            frame = cv2.resize(frame, (new_w, new_h))
+            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            frame = Image.fromarray(frame)
+            frames.append(frame)
+
+        frame_count += 1
+
+    cap.release()
+
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "video", "video": frames},
+                {"type": "text", "text": "Describe this video."},
+            ],
+        }
+    ]
+
+    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    image_inputs, video_inputs = process_vision_info(messages)
+
+    inputs = processor(
+        text=[text],
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
+    ).to("cuda")
+
+    with torch.no_grad():
+        generated_ids = model.generate(**inputs, max_new_tokens=256)
+        generated_ids_trimmed = [
+            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        output_text = processor.batch_decode(
+            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
+
+    return output_text[0]
+
+@spaces.GPU
 def process_content(content):
     if content is None:
         return "Please upload an image or video file."
 
-    try:
-        if content.name.lower().endswith(('.png', '.jpg', '.jpeg')):
-            return process_image(Image.open(content.name))
-        elif content.name.lower().endswith(('.mp4', '.avi', '.mov')):
-            return process_video(content.name)
-        else:
-            return "Unsupported file type. Please provide an image or video file."
-    except Exception as e:
-        return f"An error occurred while processing the content: {str(e)}"
+    if content.name.lower().endswith(('.png', '.jpg', '.jpeg')):
+        return process_image(Image.open(content.name))
+    elif content.name.lower().endswith(('.mp4', '.avi', '.mov')):
+        return process_video(content.name)
+    else:
+        return "Unsupported file type. Please provide an image or video file."
 
 # Gradio interface
 iface = gr.Interface(
     fn=process_content,
     inputs=gr.File(label="Upload Image or Video"),
     outputs="text",
-    title="Image and Video Description (GPU Version)",
-    description="Upload an image or video to get a description. This application requires GPU computation.",
+    title="Image and Video Description",
+    description="Upload an image or video to get a description.",
 )
 
 if __name__ == "__main__":
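
Context on the pattern this commit adopts: on Hugging Face ZeroGPU Spaces there is no GPU at process startup, so the old torch.cuda.is_available() guard would always fail. Instead, GPU-dependent functions are decorated with @spaces.GPU, and a device is attached only while a decorated call runs. Below is a minimal sketch of that pattern, not this app's exact code; the describe function is a condensed stand-in for process_image. One deliberate difference is flagged in the comments: the sketch moves the model to "cuda" at load time, following the usual ZeroGPU idiom, since model.generate expects the weights on the same device as the inputs, whereas the committed code leaves the model where from_pretrained put it.

import gradio as gr
import spaces
import torch
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

# Startup runs on CPU; under ZeroGPU the .to("cuda") call is intercepted and
# the weights are placed on a real GPU only while a @spaces.GPU-decorated
# function is executing. (The committed code omits this .to("cuda").)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct", torch_dtype=torch.float16
).to("cuda")
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

@spaces.GPU  # a GPU is held only for the duration of this call
def describe(image):
    # Same chat-template flow as the app's process_image, condensed.
    messages = [{"role": "user", "content": [
        {"type": "image", "image": image},
        {"type": "text", "text": "Describe this image."},
    ]}]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(text=[text], images=image_inputs, videos=video_inputs,
                       padding=True, return_tensors="pt").to("cuda")
    with torch.no_grad():
        generated = model.generate(**inputs, max_new_tokens=128)
    trimmed = [out[len(inp):] for inp, out in zip(inputs.input_ids, generated)]
    return processor.batch_decode(trimmed, skip_special_tokens=True)[0]

gr.Interface(fn=describe, inputs=gr.Image(type="pil"), outputs="text").launch()

For calls that need more than the default GPU window, the decorator also accepts a duration argument, e.g. @spaces.GPU(duration=120).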