Spaces:

whyumesh
/

eterniq_vision_to_code

Sleeping

App Files Files Community

whyumesh committed on Oct 1, 2024

Commit

b3c78b4

verified ·

1 Parent(s): d15f3b3

Create app.py

Browse files

Files changed (1) hide show

app.py +133 -0

app.py ADDED Viewed

	@@ -0,0 +1,133 @@

+import gradio as gr
+from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
+from qwen_vl_utils import process_vision_info
+import torch
+from PIL import Image
+import cv2
+import numpy as np
+import os
+def load_model():
+    model = Qwen2VLForConditionalGeneration.from_pretrained(
+        "Qwen/Qwen2-VL-2B-Instruct",
+        torch_dtype=torch.float32
+    )
+    processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
+    return model, processor
+# Load the model and processor once at import time; generate_response()
+# reads these two module-level names on every request.
+model, processor = load_model()
+# System message placed first in the conversation built by analyze_image()
+# and analyze_video().
+SYSTEM_PROMPT = """You are an expert technical analyst specializing in identifying bugs, fixing errors, and explaining code functions from visual inputs. When presented with an image or video:
+1. If you see code, analyze it for potential bugs or errors, and suggest fixes.
+2. If you see a function or algorithm, explain its purpose and how it works.
+3. If you see a technical diagram or flowchart, interpret its meaning and purpose.
+4. For any technical content, provide detailed explanations and insights.
+Always maintain a professional and technical tone in your responses."""
+def process_content(file, user_prompt):
+    if file is None:
+        return "No content provided. Please upload an image or video of technical content."
+    file_path = file.name
+    file_extension = os.path.splitext(file_path)[1].lower()
+    if file_extension in ['.jpg', '.jpeg', '.png', '.bmp']:
+        image = Image.open(file_path)
+        return analyze_image(image, user_prompt)
+    elif file_extension in ['.mp4', '.avi', '.mov']:
+        return analyze_video(file_path, user_prompt)
+    else:
+        return "Unsupported file type. Please provide an image (jpg, jpeg, png, bmp) or video (mp4, avi, mov) of technical content."
+def analyze_image(image, prompt):
+    messages = [
+        {"role": "system", "content": SYSTEM_PROMPT},
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": f"Based on the system instructions, {prompt}"},
+            ],
+        }
+    ]
+    return generate_response(messages)
+def analyze_video(video_path, prompt, max_frames=16, frame_interval=30, max_resolution=224):
+    cap = cv2.VideoCapture(video_path)
+    frames = []
+    frame_count = 0
+    while len(frames) < max_frames:
+        ret, frame = cap.read()
+        if not ret:
+            break
+        if frame_count % frame_interval == 0:
+            h, w = frame.shape[:2]
+            if h > w:
+                new_h, new_w = max_resolution, int(w * max_resolution / h)
+            else:
+                new_h, new_w = int(h * max_resolution / w), max_resolution
+            frame = cv2.resize(frame, (new_w, new_h))
+            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            frame = Image.fromarray(frame)
+            frames.append(frame)
+        frame_count += 1
+    cap.release()
+    messages = [
+        {"role": "system", "content": SYSTEM_PROMPT},
+        {
+            "role": "user",
+            "content": [
+                {"type": "video", "video": frames},
+                {"type": "text", "text": f"Based on the system instructions, {prompt}"},
+            ],
+        }
+    ]
+    return generate_response(messages)
+def generate_response(messages):
+    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    image_inputs, video_inputs = process_vision_info(messages)
+    inputs = processor(
+        text=[text],
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
+    )
+    del image_inputs, video_inputs
+    with torch.no_grad():
+        generated_ids = model.generate(**inputs, max_new_tokens=512)  # Increased token limit for more detailed responses
+    generated_ids_trimmed = [
+        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+    ]
+    output_text = processor.batch_decode(
+        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )
+    return output_text[0]
+# Gradio interface
+iface = gr.Interface(
+    fn=process_content,
+    inputs=[
+        gr.File(label="Upload Image or Video of Technical Content"),
+        gr.Textbox(label="Enter your technical question", placeholder="e.g., Identify any bugs in this code and suggest fixes", value="Analyze this technical content and provide insights.")
+    ],
+    outputs="text",
+    title="Technical Content Analysis",
+    description="Upload an image or video of code, diagrams, or technical content. Ask questions about bugs, errors, or explanations of functions.",
+)
+iface.launch(share=True)