prithivMLmods committed (verified)
Commit 905acb6 · 1 Parent(s): 1f4c6de

Update app.py

Files changed (1)
  1. app.py +242 -303
app.py CHANGED
@@ -2,329 +2,268 @@ import os
  import random
  import uuid
  import json

  import gradio as gr
- import numpy as np
- from PIL import Image
  import spaces
  import torch

- from diffusers import DiffusionPipeline
- from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler # EulerAncestralDiscreteScheduler not explicitly used but imported
- from typing import Tuple
-
- bad_words = json.loads(os.getenv('BAD_WORDS', "[]"))
- bad_words_negative = json.loads(os.getenv('BAD_WORDS_NEGATIVE', "[]"))
- default_negative = os.getenv("default_negative","")
-
- def check_text(prompt, negative=""):
-     for i in bad_words:
-         if i in prompt:
-             return True
-     for i in bad_words_negative:
-         if i in negative:
-             return True
-     return False
-
- style_list = [
-     {
-         "name": "Photo",
-         "prompt": "cinematic photo {prompt}. 35mm photograph, film, bokeh, professional, 4k, highly detailed",
-         "negative_prompt": "drawing, painting, crayon, sketch, graphite, impressionist, noisy, blurry, soft, deformed, ugly",
-     },
-     {
-         "name": "Cinematic",
-         "prompt": "cinematic still {prompt}. emotional, harmonious, vignette, highly detailed, high budget, bokeh, cinemascope, moody, epic, gorgeous, film grain, grainy",
-         "negative_prompt": "anime, cartoon, graphic, text, painting, crayon, graphite, abstract, glitch, deformed, mutated, ugly, disfigured",
-     },
-     {
-         "name": "Anime",
-         "prompt": "anime artwork {prompt}. anime style, key visual, vibrant, studio anime, highly detailed",
-         "negative_prompt": "photo, deformed, black and white, realism, disfigured, low contrast",
-     },
-     {
-         "name": "3D Model",
-         "prompt": "professional 3d model {prompt}. octane render, highly detailed, volumetric, dramatic lighting",
-         "negative_prompt": "ugly, deformed, noisy, low poly, blurry, painting",
-     },
-     {
-         "name": "(No style)",
-         "prompt": "{prompt}",
-         "negative_prompt": "",
-     },
- ]
-
- DESCRIPTION = """##
-
-
- """
-
- styles = {k["name"]: (k["prompt"], k["negative_prompt"]) for k in style_list}
- STYLE_NAMES = list(styles.keys())
- DEFAULT_STYLE_NAME = "Photo"
-
- def apply_style(style_name: str, positive: str, negative: str = "") -> Tuple[str, str]:
-     p, n = styles.get(style_name, styles[DEFAULT_STYLE_NAME])
-     if not negative:
-         negative = ""
-     return p.replace("{prompt}", positive), n + negative
-
- if not torch.cuda.is_available():
-     DESCRIPTION += "\n<p>⚠️Running on CPU, This may not work on CPU.</p>"

- MAX_SEED = np.iinfo(np.int32).max
- CACHE_EXAMPLES = torch.cuda.is_available() and os.getenv("CACHE_EXAMPLES", "0") == "1"
- MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "2048"))
- USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
- ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

- NUM_IMAGES_PER_PROMPT = 1
-
- if torch.cuda.is_available():
-     pipe = StableDiffusionXLPipeline.from_pretrained(
-         "SG161222/RealVisXL_V5.0_Lightning",
-         torch_dtype=torch.float16,
-         use_safetensors=True,
-         add_watermarker=False,
-         variant="fp16"
-     )
-     pipe2 = StableDiffusionXLPipeline.from_pretrained(
-         "SG161222/RealVisXL_V4.0_Lightning",
-         torch_dtype=torch.float16,
-         use_safetensors=True,
-         add_watermarker=False,
-         variant="fp16"
-     )
-     if ENABLE_CPU_OFFLOAD:
-         pipe.enable_model_cpu_offload()
-         pipe2.enable_model_cpu_offload()
      else:
-         pipe.to(device)
-         pipe2.to(device)
-         print("Loaded on Device!")
-
-     if USE_TORCH_COMPILE:
-         pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
-         pipe2.unet = torch.compile(pipe2.unet, mode="reduce-overhead", fullgraph=True)
-         print("Model Compiled!")
-
- def save_image(img):
-     unique_name = str(uuid.uuid4()) + ".png"
-     img.save(unique_name)
-     return unique_name
-
- def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
-     if randomize_seed:
-         seed = random.randint(0, MAX_SEED)
-     return seed
-
- @spaces.GPU(duration=30)
- @torch.no_grad()
- def generate(
-     prompt: str,
-     negative_prompt: str = "",
-     use_negative_prompt: bool = False,
-     style: str = DEFAULT_STYLE_NAME,
-     seed: int = 0,
-     width: int = 1024,
-     height: int = 1024,
-     guidance_scale: float = 3,
-     randomize_seed: bool = False,
-     use_resolution_binning: bool = True, # This parameter is not exposed in the UI by default
-     progress=gr.Progress(track_tqdm=True),
- ):
-     if check_text(prompt, negative_prompt):
-         raise ValueError("Prompt contains restricted words.")
-
-     prompt, negative_prompt_from_style = apply_style(style, prompt, "") # Apply style positive first
-
-     # Combine negative prompts
-     if use_negative_prompt:
-         final_negative_prompt = negative_prompt_from_style + " " + negative_prompt + " " + default_negative
      else:
-         final_negative_prompt = negative_prompt_from_style + " " + default_negative
-     final_negative_prompt = final_negative_prompt.strip()
-
-
-     seed = int(randomize_seed_fn(seed, randomize_seed))
-     generator = torch.Generator(device=device).manual_seed(seed) # Ensure generator is on the correct device
-
-     options = {
-         "prompt": prompt,
-         "negative_prompt": final_negative_prompt,
-         "width": width,
-         "height": height,
-         "guidance_scale": guidance_scale,
-         "num_inference_steps": 25, # This is hardcoded, UI slider for steps is not connected
-         "generator": generator,
-         "num_images_per_prompt": NUM_IMAGES_PER_PROMPT, # UI slider for images is not connected to this
-         # "use_resolution_binning": use_resolution_binning, # This was in original code, but not defined. Diffusers handles it.
-         "output_type": "pil",
      }
-
-     # If on CPU, ensure generator is for CPU
-     if device.type == 'cpu':
-         generator = torch.Generator(device='cpu').manual_seed(seed)
-         options["generator"] = generator
-
-     images = []
-     if 'pipe' in globals(): # Check if pipes are loaded (i.e. on GPU)
-         images.extend(pipe(**options).images)
-         images.extend(pipe2(**options).images)
-     else: # Fallback for CPU or if pipes are not loaded (though the DESCRIPTION warns about CPU)
-         # This part would need a CPU-compatible pipeline if one isn't loaded.
-         # For now, it will likely error if pipe/pipe2 aren't available.
-         # Or, we can return a placeholder or raise a specific error.
-         # To prevent errors if running without GPU and models didn't load:
-         placeholder_image = Image.new('RGB', (width, height), color = 'grey')
-         draw = ImageDraw.Draw(placeholder_image)
-         draw.text((10, 10), "GPU models not loaded. Cannot generate image.", fill=(255,0,0))
-         images.append(placeholder_image)
-
-
-     image_paths = [save_image(img) for img in images]
-     return image_paths, seed
- examples = [
-     "3d image, cute girl, in the style of Pixar --ar 1:2 --stylize 750, 4K resolution highlights, Sharp focus, octane render, ray tracing, Ultra-High-Definition, 8k, UHD, HDR, (Masterpiece:1.5), (best quality:1.5)",
  ]

- css = '''
- .gradio-container {
-     max-width: 590px !important; /* Existing style */
-     margin: 0 auto !important; /* Existing style */
- }
- h1 {
-     text-align: center; /* Existing style */
  }
- footer {
-     visibility: hidden; /* Existing style */
  }
- '''
- with gr.Blocks(css=css) as demo:
-     gr.Markdown(DESCRIPTION)
-     with gr.Row():
-         prompt = gr.Text(
-             show_label=False,
-             max_lines=1,
-             placeholder="Enter your prompt",
-             container=False,
-         )
-         run_button = gr.Button("Run", scale=0, variant="primary")
-     result = gr.Gallery(label="Result", columns=1, preview=True) # columns=1 for single image below each other if multiple

-     with gr.Accordion("Advanced options", open=False):
-         style_selection = gr.Dropdown( # MODIFIED: Was gr.Radio, moved into accordion
-             label="Image Style",
-             choices=STYLE_NAMES,
-             value=DEFAULT_STYLE_NAME,
-             interactive=True,
-             show_label=True,
-             container=True,
-         )
-         use_negative_prompt = gr.Checkbox(label="Use negative prompt", value=True, visible=True)
-         negative_prompt = gr.Text(
-             label="Negative prompt",
-             max_lines=1,
-             placeholder="Enter a negative prompt (appended to style's negative)",
-             value="(deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime:1.4), text, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck",
-             visible=True,
-         )
-         # Note: num_inference_steps and num_images_per_prompt sliders are defined in UI
-         # but not wired to the generate function's parameters that control these aspects.
-         # Keeping them as is, per "Don't alter the remaining functionality".
-         with gr.Row():
-             num_inference_steps = gr.Slider( # This UI element is not connected to the backend
-                 label="Steps (Not Connected)",
-                 minimum=10,
-                 maximum=60,
-                 step=1,
-                 value=20, # Default value in UI
-             )
-         with gr.Row():
-             num_images_per_prompt = gr.Slider( # This UI element is not connected to the backend
-                 label="Images (Not Connected)",
-                 minimum=1,
-                 maximum=4,
-                 step=1,
-                 value=2, # Default value in UI (backend NUM_IMAGES_PER_PROMPT is 1, resulting in 2 total)
-             )
-         seed = gr.Slider(
-             label="Seed",
-             minimum=0,
-             maximum=MAX_SEED,
-             step=1,
-             value=0,
-             visible=True
-         )
-         randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
-         with gr.Row(visible=True):
-             width = gr.Slider(
-                 label="Width",
-                 minimum=512,
-                 maximum=MAX_IMAGE_SIZE, # Use MAX_IMAGE_SIZE
-                 step=8,
-                 value=1024,
-             )
-             height = gr.Slider(
-                 label="Height",
-                 minimum=512,
-                 maximum=MAX_IMAGE_SIZE, # Use MAX_IMAGE_SIZE
-                 step=8,
-                 value=1024,
-             )
-         with gr.Row():
-             guidance_scale = gr.Slider(
-                 label="Guidance Scale",
-                 minimum=0.1,
-                 maximum=20.0,
-                 step=0.1,
-                 value=3.0,
              )
-
-     # Original style_selection gr.Row has been removed from here.
-
-     gr.Examples(
-         examples=examples,
-         inputs=prompt,
-         outputs=[result, seed], # seed output is good for reproducibility
-         fn=generate,
-         cache_examples=CACHE_EXAMPLES,
      )
-
-     use_negative_prompt.change(
-         fn=lambda x: gr.update(visible=x),
-         inputs=use_negative_prompt,
-         outputs=negative_prompt,
-         api_name=False,
-     )
-
-     gr.on(
-         triggers=[
-             prompt.submit,
-             negative_prompt.submit, # Allow submitting negative prompt to trigger run
-             run_button.click,
-         ],
-         fn=generate,
-         inputs=[
-             prompt,
-             negative_prompt,
-             use_negative_prompt,
-             style_selection, # style_selection is correctly in inputs
-             seed,
-             width,
-             height,
-             guidance_scale,
-             randomize_seed,
-         ],
-         outputs=[result, seed],
-         api_name="run",
      )

  if __name__ == "__main__":
-
-     from PIL import ImageDraw # Add ImageDraw import for CPU placeholder
-
-     demo.queue(max_size=20).launch(ssr_mode=True, show_error=True, share=True)
  import random
  import uuid
  import json
+ import time
+ import asyncio
+ from threading import Thread

  import gradio as gr
  import spaces
  import torch
+ import numpy as np
+ from PIL import Image
+ import cv2

+ from transformers import (
+     Qwen2_5_VLForConditionalGeneration,
+     AutoProcessor,
+     TextIteratorStreamer,
+ )
+ from transformers.image_utils import load_image
+ # Constants for text generation
+ MAX_MAX_NEW_TOKENS = 2048
+ DEFAULT_MAX_NEW_TOKENS = 1024
+ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

+ # Load Cosmos-Reason1-7B
+ MODEL_ID_M = "nvidia/Cosmos-Reason1-7B"
+ processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
+ model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     MODEL_ID_M,
+     trust_remote_code=True,
+     torch_dtype=torch.float16
+ ).to(device).eval()
+
+ # Load DocScope
+ MODEL_ID_X = "prithivMLmods/docscopeOCR-7B-050425-exp"
+ processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
+ model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     MODEL_ID_X,
+     trust_remote_code=True,
+     torch_dtype=torch.float16
+ ).to(device).eval()
+
+ # Load Relaxed
+ MODEL_ID_Z = "Ertugrul/Qwen2.5-VL-7B-Captioner-Relaxed"
+ processor_z = AutoProcessor.from_pretrained(MODEL_ID_Z, trust_remote_code=True)
+ model_z = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     MODEL_ID_Z,
+     trust_remote_code=True,
+     torch_dtype=torch.float16
+ ).to(device).eval()
+
+ def downsample_video(video_path):
+     """
+     Downsamples the video to evenly spaced frames.
+     Each frame is returned as a PIL image along with its timestamp.
+     """
+     vidcap = cv2.VideoCapture(video_path)
+     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
+     fps = vidcap.get(cv2.CAP_PROP_FPS)
+     frames = []
+     frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
+     for i in frame_indices:
+         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
+         success, image = vidcap.read()
+         if success:
+             image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+             pil_image = Image.fromarray(image)
+             timestamp = round(i / fps, 2)
+             frames.append((pil_image, timestamp))
+     vidcap.release()
+     return frames
+
+ @spaces.GPU
+ def generate_image(model_name: str, text: str, image: Image.Image,
+                    max_new_tokens: int = 1024,
+                    temperature: float = 0.6,
+                    top_p: float = 0.9,
+                    top_k: int = 50,
+                    repetition_penalty: float = 1.2):
+     """
+     Generates responses using the selected model for image input.
+     """
+     if model_name == "Cosmos-Reason1-7B":
+         processor = processor_m
+         model = model_m
+     elif model_name == "docscopeOCR-7B-050425-exp":
+         processor = processor_x
+         model = model_x
+     elif model_name == "Captioner-7B":
+         processor = processor_z
+         model = model_z
      else:
+         yield "Invalid model selected."
+         return
+
+     if image is None:
+         yield "Please upload an image."
+         return
+
+     messages = [{
+         "role": "user",
+         "content": [
+             {"type": "image", "image": image},
+             {"type": "text", "text": text},
+         ]
+     }]
+     prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+     inputs = processor(
+         text=[prompt_full],
+         images=[image],
+         return_tensors="pt",
+         padding=True,
+         truncation=False,
+         max_length=MAX_INPUT_TOKEN_LENGTH
+     ).to(device)
+     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+     generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
+     thread = Thread(target=model.generate, kwargs=generation_kwargs)
+     thread.start()
+     buffer = ""
+     for new_text in streamer:
+         buffer += new_text
+         time.sleep(0.01)
+         yield buffer
+
+ @spaces.GPU
+ def generate_video(model_name: str, text: str, video_path: str,
+                    max_new_tokens: int = 1024,
+                    temperature: float = 0.6,
+                    top_p: float = 0.9,
+                    top_k: int = 50,
+                    repetition_penalty: float = 1.2):
+     """
+     Generates responses using the selected model for video input.
+     """
+     if model_name == "Cosmos-Reason1-7B":
+         processor = processor_m
+         model = model_m
+     elif model_name == "docscopeOCR-7B-050425-exp":
+         processor = processor_x
+         model = model_x
+     elif model_name == "Captioner-7B":
+         processor = processor_z
+         model = model_z
      else:
+         yield "Invalid model selected."
+         return
+
+     if video_path is None:
+         yield "Please upload a video."
+         return
+
+     frames = downsample_video(video_path)
+     messages = [
+         {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
+         {"role": "user", "content": [{"type": "text", "text": text}]}
+     ]
+     for frame in frames:
+         image, timestamp = frame
+         messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
+         messages[1]["content"].append({"type": "image", "image": image})
+     inputs = processor.apply_chat_template(
+         messages,
+         tokenize=True,
+         add_generation_prompt=True,
+         return_dict=True,
+         return_tensors="pt",
+         truncation=False,
+         max_length=MAX_INPUT_TOKEN_LENGTH
+     ).to(device)
+     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+     generation_kwargs = {
+         **inputs,
+         "streamer": streamer,
+         "max_new_tokens": max_new_tokens,
+         "do_sample": True,
+         "temperature": temperature,
+         "top_p": top_p,
+         "top_k": top_k,
+         "repetition_penalty": repetition_penalty,
      }
+     thread = Thread(target=model.generate, kwargs=generation_kwargs)
+     thread.start()
+     buffer = ""
+     for new_text in streamer:
+         buffer += new_text
+         time.sleep(0.01)
+         yield buffer
+
+ # Define examples for image and video inference
+ image_examples = [
+     ["type out the messy hand-writing as accurately as you can.", "images/1.jpg"],
+     ["count the number of birds and explain the scene in detail.", "images/2.jpg"]
+ ]
+
+ video_examples = [
+     ["give the highlights of the movie scene video.", "videos/1.mp4"],
+     ["explain the advertisement in detail.", "videos/2.mp4"]
  ]

+ css = """
+ .submit-btn {
+     background-color: #2980b9 !important;
+     color: white !important;
  }
+ .submit-btn:hover {
+     background-color: #3498db !important;
  }
+ """
+ # Create the Gradio Interface
+ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
+     gr.Markdown("# **VisionScope-R2**")
+     with gr.Row():
+         with gr.Column():
+             with gr.Tabs():
+                 with gr.TabItem("Image Inference"):
+                     image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
+                     image_upload = gr.Image(type="pil", label="Image")
+                     image_submit = gr.Button("Submit", elem_classes="submit-btn")
+                     gr.Examples(
+                         examples=image_examples,
+                         inputs=[image_query, image_upload]
+                     )
+                 with gr.TabItem("Video Inference"):
+                     video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
+                     video_upload = gr.Video(label="Video")
+                     video_submit = gr.Button("Submit", elem_classes="submit-btn")
+                     gr.Examples(
+                         examples=video_examples,
+                         inputs=[video_query, video_upload]
+                     )
+             with gr.Accordion("Advanced options", open=False):
+                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
+                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
+                 top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
+                 top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
+                 repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
+         with gr.Column():
+             output = gr.Textbox(label="Output", interactive=False, lines=2, scale=2)
+             model_choice = gr.Radio(
+                 choices=["Cosmos-Reason1-7B", "docscopeOCR-7B-050425-exp", "Captioner-7B"],
+                 label="Select Model",
+                 value="Cosmos-Reason1-7B"
              )
+
+             gr.Markdown("**Model Info**")
+             gr.Markdown("⤷ [Cosmos-Reason1-7B](https://huggingface.co/nvidia/Cosmos-Reason1-7B): understand physical common sense and generate appropriate embodied decisions.")
+             gr.Markdown("⤷ [docscopeOCR-7B-050425-exp](https://huggingface.co/prithivMLmods/docscopeOCR-7B-050425-exp): optimized for document-level optical character recognition, long-context vision-language understanding.")
+             gr.Markdown("⤷ [Captioner-Relaxed-7B](https://huggingface.co/Ertugrul/Qwen2.5-VL-7B-Captioner-Relaxed): build with hand-curated dataset for text-to-image models, providing significantly more detailed descriptions or captions of given images.")
+
+     image_submit.click(
+         fn=generate_image,
+         inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
+         outputs=output
      )
+     video_submit.click(
+         fn=generate_video,
+         inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
+         outputs=output
      )

  if __name__ == "__main__":
+     demo.queue(max_size=30).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)
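
For anyone who wants to exercise the new streaming path outside the Space UI, a minimal sketch follows. It is not part of this commit: it assumes the updated app.py is importable as a module named app, that the repo's example file images/1.jpg is present, and that the machine can hold the three 7B checkpoints, which are loaded at import time. All of those are assumptions about the runtime environment, not guarantees from this diff.

# Hypothetical local sanity check for the streaming generate_image() added in
# this commit; the module name "app" and the hardware assumptions above are
# illustrative only.
from PIL import Image

import app  # importing app.py loads all three 7B models, which is slow and memory-heavy

image = Image.open("images/1.jpg").convert("RGB")

final_text = ""
# generate_image is a generator: each yield is the full text accumulated so far.
for partial in app.generate_image(
    model_name="Cosmos-Reason1-7B",
    text="type out the messy hand-writing as accurately as you can.",
    image=image,
    max_new_tokens=512,
):
    final_text = partial
print(final_text)

The same pattern applies to generate_video, passing a video file path in place of the PIL image.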