prithivMLmods committed on
Commit
fe53594
·
verified ·
1 Parent(s): 95550be

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -11
app.py CHANGED
@@ -5,7 +5,6 @@ from threading import Thread
5
  import time
6
  import torch
7
  import spaces
8
- from qwen_vl_utils import process_vision_info
9
 
10
  # Fine-tuned for OCR-based tasks from Qwen's [ Qwen/Qwen2-VL-2B-Instruct ]
11
  MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
@@ -56,18 +55,14 @@ def model_inference(input_dict, history):
56
  }
57
  ]
58
 
59
- # Process vision info (images and videos)
60
- image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)
61
-
62
  # Apply chat template and process inputs
63
  prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
64
  inputs = processor(
65
  text=[prompt],
66
- images=image_inputs,
67
- videos=video_inputs,
68
- padding=True,
69
  return_tensors="pt",
70
- **video_kwargs,
71
  ).to("cuda")
72
 
73
  # Set up streamer for real-time output
@@ -90,7 +85,6 @@ def model_inference(input_dict, history):
90
 
91
  # Example inputs
92
  examples = [
93
- [{"text": "Describe the video.", "files": ["examples/demo.mp4"]}],
94
  [{"text": "Extract JSON from the image", "files": ["example_images/document.jpg"]}],
95
  [{"text": "summarize the letter", "files": ["examples/1.png"]}],
96
  [{"text": "Describe the photo", "files": ["examples/3.png"]}],
@@ -101,7 +95,7 @@ examples = [
101
  [{"text": "Can you describe this image?", "files": ["example_images/newyork.jpg"]}],
102
  [{"text": "Can you describe this image?", "files": ["example_images/dogs.jpg"]}],
103
  [{"text": "Where do the severe droughts happen according to this diagram?", "files": ["example_images/examples_weather_events.png"]}],
104
-
105
  ]
106
 
107
  demo = gr.ChatInterface(
@@ -114,4 +108,4 @@ demo = gr.ChatInterface(
114
  cache_examples=False,
115
  )
116
 
117
- demo.launch(debug=True)
 
5
  import time
6
  import torch
7
  import spaces
 
8
 
9
  # Fine-tuned for OCR-based tasks from Qwen's [ Qwen/Qwen2-VL-2B-Instruct ]
10
  MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 
55
  }
56
  ]
57
 
 
 
 
58
  # Apply chat template and process inputs
59
  prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
60
  inputs = processor(
61
  text=[prompt],
62
+ images=images if images else None,
63
+ videos=videos if videos else None,
 
64
  return_tensors="pt",
65
+ padding=True,
66
  ).to("cuda")
67
 
68
  # Set up streamer for real-time output
 
85
 
86
  # Example inputs
87
  examples = [
 
88
  [{"text": "Extract JSON from the image", "files": ["example_images/document.jpg"]}],
89
  [{"text": "summarize the letter", "files": ["examples/1.png"]}],
90
  [{"text": "Describe the photo", "files": ["examples/3.png"]}],
 
95
  [{"text": "Can you describe this image?", "files": ["example_images/newyork.jpg"]}],
96
  [{"text": "Can you describe this image?", "files": ["example_images/dogs.jpg"]}],
97
  [{"text": "Where do the severe droughts happen according to this diagram?", "files": ["example_images/examples_weather_events.png"]}],
98
+ [{"text": "Describe the video.", "files": ["example_videos/sample.mp4"]}],
99
  ]
100
 
101
  demo = gr.ChatInterface(
 
108
  cache_examples=False,
109
  )
110
 
111
+ demo.launch(debug=True, share=True)