whyumesh committed on
Commit
b3c78b4
·
verified ·
1 Parent(s): d15f3b3

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +133 -0
app.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
3
+ from qwen_vl_utils import process_vision_info
4
+ import torch
5
+ from PIL import Image
6
+ import cv2
7
+ import numpy as np
8
+ import os
9
+
10
def load_model():
    """Load the Qwen2-VL-2B-Instruct checkpoint and its matching processor.

    Returns:
        A ``(model, processor)`` tuple. The model is created with
        ``torch.float32`` weights.
    """
    checkpoint = "Qwen/Qwen2-VL-2B-Instruct"
    vl_model = Qwen2VLForConditionalGeneration.from_pretrained(
        checkpoint,
        torch_dtype=torch.float32,
    )
    vl_processor = AutoProcessor.from_pretrained(checkpoint)
    return vl_model, vl_processor
17
+
18
# Loaded once at import time; generate_response() reads these module globals.
model, processor = load_model()

# Instruction text sent as the "system" turn of every request.
SYSTEM_PROMPT = (
    "You are an expert technical analyst specializing in identifying bugs, "
    "fixing errors, and explaining code functions from visual inputs. "
    "When presented with an image or video:\n"
    "1. If you see code, analyze it for potential bugs or errors, and suggest fixes.\n"
    "2. If you see a function or algorithm, explain its purpose and how it works.\n"
    "3. If you see a technical diagram or flowchart, interpret its meaning and purpose.\n"
    "4. For any technical content, provide detailed explanations and insights.\n"
    "Always maintain a professional and technical tone in your responses."
)
26
+
27
def process_content(file, user_prompt):
    """Route an uploaded file to the image or video analyzer by extension.

    Args:
        file: The value produced by the Gradio ``File`` input. Either an
            object exposing the on-disk path as ``.name`` or the path itself
            (``str`` / ``os.PathLike``). ``None`` when nothing was uploaded.
        user_prompt: The question to ask about the uploaded content.

    Returns:
        The model's answer as text, or a human-readable message when no file
        was given, the extension is unsupported, or the image cannot be read.
    """
    if file is None:
        return "No content provided. Please upload an image or video of technical content."

    # NOTE(review): depending on the Gradio version / File ``type`` setting the
    # upload may arrive as a plain path rather than a wrapper with ``.name``;
    # accept both so the handler does not fail with AttributeError.
    if isinstance(file, (str, os.PathLike)):
        file_path = os.fspath(file)
    else:
        file_path = file.name
    file_extension = os.path.splitext(file_path)[1].lower()

    if file_extension in ('.jpg', '.jpeg', '.png', '.bmp'):
        try:
            # Image.open is lazy and keeps the file handle open; copy() forces
            # the pixel data to load so the handle can be closed right away.
            with Image.open(file_path) as opened:
                image = opened.copy()
        except OSError:
            # Covers missing files and PIL's UnidentifiedImageError.
            return "Unable to read the uploaded image. Please provide a valid image (jpg, jpeg, png, bmp)."
        return analyze_image(image, user_prompt)

    if file_extension in ('.mp4', '.avi', '.mov'):
        return analyze_video(file_path, user_prompt)

    return "Unsupported file type. Please provide an image (jpg, jpeg, png, bmp) or video (mp4, avi, mov) of technical content."
41
+
42
def analyze_image(image, prompt):
    """Ask the model a question about a single image.

    Args:
        image: The image to analyze; placed in the user turn as the
            ``"image"`` entry.
        prompt: The user's question, appended after a fixed lead-in phrase.

    Returns:
        Whatever ``generate_response`` returns for the assembled conversation.
    """
    system_turn = {"role": "system", "content": SYSTEM_PROMPT}
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": f"Based on the system instructions, {prompt}"},
        ],
    }
    return generate_response([system_turn, user_turn])
55
+
56
def analyze_video(video_path, prompt, max_frames=16, frame_interval=30, max_resolution=224):
    """Sample frames from a video and ask the model a question about them.

    Every ``frame_interval``-th frame, starting with the first, is kept until
    ``max_frames`` frames have been collected or the stream ends. Each kept
    frame is resized so that its longer side equals ``max_resolution`` pixels,
    converted from BGR to RGB and wrapped in a PIL image.

    Args:
        video_path: Path of the video file to open with OpenCV.
        prompt: The user's question, appended after a fixed lead-in phrase.
        max_frames: Upper bound on the number of sampled frames.
        frame_interval: Keep one frame out of this many decoded frames.
        max_resolution: Target size in pixels for the longer side of a frame.

    Returns:
        The model's answer as text, or an explanatory message when no frame
        could be read from the video.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    frame_count = 0

    try:
        while len(frames) < max_frames:
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count % frame_interval == 0:
                h, w = frame.shape[:2]
                if h > w:
                    new_h, new_w = max_resolution, int(w * max_resolution / h)
                else:
                    new_h, new_w = int(h * max_resolution / w), max_resolution
                # Extreme aspect ratios can truncate the short side to 0,
                # which cv2.resize rejects; keep at least one pixel.
                new_h, new_w = max(1, new_h), max(1, new_w)
                frame = cv2.resize(frame, (new_w, new_h))

                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(Image.fromarray(frame))

            frame_count += 1
    finally:
        # Release the capture even if resizing or conversion raises.
        cap.release()

    if not frames:
        # An unreadable or empty video yields no frames; report that instead
        # of handing an empty frame list to the model pipeline.
        return "Could not read any frames from the video. Please provide a valid video (mp4, avi, mov) of technical content."

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": [
                {"type": "video", "video": frames},
                {"type": "text", "text": f"Based on the system instructions, {prompt}"},
            ],
        },
    ]

    return generate_response(messages)
95
+
96
def generate_response(messages):
    """Run a chat conversation through the model and return the decoded reply.

    Args:
        messages: The conversation as a list of role/content dicts; passed to
            the processor's chat template and to ``process_vision_info``.

    Returns:
        The decoded text of the first (and only) sequence in the batch.
    """
    prompt_text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)

    inputs = processor(
        text=[prompt_text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )

    # The raw vision inputs are no longer needed once the processor has run.
    del image_inputs, video_inputs

    with torch.no_grad():
        # Increased token limit for more detailed responses
        generated_ids = model.generate(**inputs, max_new_tokens=512)

    # Each output sequence is treated as starting with its prompt tokens, so
    # slice those off and keep only the newly generated part.
    completions = []
    for prompt_ids, full_ids in zip(inputs.input_ids, generated_ids):
        completions.append(full_ids[len(prompt_ids):])

    decoded = processor.batch_decode(
        completions,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return decoded[0]
120
+
121
# Gradio interface: a file upload plus a free-text question, answered as text.
_upload_input = gr.File(label="Upload Image or Video of Technical Content")
_question_input = gr.Textbox(
    label="Enter your technical question",
    placeholder="e.g., Identify any bugs in this code and suggest fixes",
    value="Analyze this technical content and provide insights.",
)

iface = gr.Interface(
    fn=process_content,
    inputs=[_upload_input, _question_input],
    outputs="text",
    title="Technical Content Analysis",
    description="Upload an image or video of code, diagrams, or technical content. Ask questions about bugs, errors, or explanations of functions.",
)

# Start the web UI; share=True asks Gradio for a public share link.
iface.launch(share=True)