prithivMLmods committed
Commit d9dde0d · verified · 1 Parent(s): 0b5bfb4

Update app.py

Files changed (1): app.py (+90 −43)
app.py CHANGED
@@ -6,8 +6,10 @@ import time
 import torch
 import spaces
 import subprocess
+
+# Ensure pyav is installed
+subprocess.run('pip install pyav', shell=True, check=True)
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
-subprocess.run("pip install av", shell=True) # Install pyav for video processing
 
 from io import BytesIO
 
@@ -17,50 +19,100 @@ model = AutoModelForImageTextToText.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B
     torch_dtype=torch.bfloat16).to("cuda:0")
 
 @spaces.GPU
-def model_inference(input_dict, history, max_tokens):
+def model_inference(
+    input_dict, history, max_tokens
+):
     text = input_dict["text"]
-    media_queue = []
+    images = []
     user_content = []
-
-    for file in input_dict.get("files", []):
-        if file.endswith((".png", ".jpg", ".jpeg", ".gif", ".bmp")):
-            media_queue.append({"type": "image", "path": file})
-        elif file.endswith((".mp4", ".mov", ".avi", ".mkv", ".flv")):
-            media_queue.append({"type": "video", "path": file})
+    media_queue = []
+    if history == []:
+        text = input_dict["text"].strip()
+
+        for file in input_dict.get("files", []):
+            if file.endswith((".png", ".jpg", ".jpeg", ".gif", ".bmp")):
+                media_queue.append({"type": "image", "path": file})
+            elif file.endswith((".mp4", ".mov", ".avi", ".mkv", ".flv")):
+                media_queue.append({"type": "video", "path": file})
 
-    if "<image>" in text or "<video>" in text:
-        parts = re.split(r'(<image>|<video>)', text)
-        for part in parts:
-            if part == "<image>" and media_queue:
-                user_content.append(media_queue.pop(0))
-            elif part == "<video>" and media_queue:
-                user_content.append(media_queue.pop(0))
-            elif part.strip():
-                user_content.append({"type": "text", "text": part.strip()})
-    else:
-        user_content.append({"type": "text", "text": text})
-        user_content.extend(media_queue)
+        if "<image>" in text or "<video>" in text:
+            parts = re.split(r'(<image>|<video>)', text)
+            for part in parts:
+                if part == "<image>" and media_queue:
+                    user_content.append(media_queue.pop(0))
+                elif part == "<video>" and media_queue:
+                    user_content.append(media_queue.pop(0))
+                elif part.strip():
+                    user_content.append({"type": "text", "text": part.strip()})
+        else:
+            user_content.append({"type": "text", "text": text})
+
+            for media in media_queue:
+                user_content.append(media)
 
-    resulting_messages = [{"role": "user", "content": user_content}]
-
-    if not text and not media_queue:
-        return "Please provide text and/or media input."
-
+        resulting_messages = [{"role": "user", "content": user_content}]
+
+    elif len(history) > 0:
+        resulting_messages = []
+        user_content = []
+        media_queue = []
+        for hist in history:
+            if hist["role"] == "user" and isinstance(hist["content"], tuple):
+                file_name = hist["content"][0]
+                if file_name.endswith((".png", ".jpg", ".jpeg")):
+                    media_queue.append({"type": "image", "path": file_name})
+                elif file_name.endswith(".mp4"):
+                    media_queue.append({"type": "video", "path": file_name})
+
+        for hist in history:
+            if hist["role"] == "user" and isinstance(hist["content"], str):
+                text = hist["content"]
+                parts = re.split(r'(<image>|<video>)', text)
+
+                for part in parts:
+                    if part == "<image>" and media_queue:
+                        user_content.append(media_queue.pop(0))
+                    elif part == "<video>" and media_queue:
+                        user_content.append(media_queue.pop(0))
+                    elif part.strip():
+                        user_content.append({"type": "text", "text": part.strip()})
+
+            elif hist["role"] == "assistant":
+                resulting_messages.append({
+                    "role": "user",
+                    "content": user_content
+                })
+                resulting_messages.append({
+                    "role": "assistant",
+                    "content": [{"type": "text", "text": hist["content"]}]
+                })
+                user_content = []
+
+    if text == "" and not images:
+        gr.Error("Please input a query and optionally image(s).")
+
+    if text == "" and images:
+        gr.Error("Please input a text query along the images(s).")
+    print("resulting_messages", resulting_messages)
     inputs = processor.apply_chat_template(
         resulting_messages,
         add_generation_prompt=True,
         tokenize=True,
         return_dict=True,
         return_tensors="pt",
-    ).to(model.device)
+    )
+
+    inputs = inputs.to(model.device)
 
+    # Generate
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_args = dict(inputs, streamer=streamer, max_new_tokens=max_tokens)
-
+    generated_text = ""
+
     thread = Thread(target=model.generate, kwargs=generation_args)
     thread.start()
-
-    yield "Generating response..."
+
+    yield "..."
     buffer = ""
 
     for new_text in streamer:
@@ -68,17 +120,12 @@ def model_inference(input_dict, history, max_tokens):
         time.sleep(0.01)
         yield buffer
 
-
-demo = gr.ChatInterface(
-    fn=model_inference,
-    title="SmolVLM2: The Smallest Video Model Ever 📺",
-    description="Play with SmolVLM2-2.2B-Instruct. Upload an image or video and ask a question.",
-    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", ".mp4"], file_count="multiple"),
-    stop_btn="Stop Generation",
-    multimodal=True,
-    cache_examples=False,
-    additional_inputs=[gr.Slider(minimum=100, maximum=500, step=50, value=200, label="Max Tokens")],
-    type="messages"
-)
-
-demo.launch(share=True, debug=True)
+demo = gr.ChatInterface(fn=model_inference, title="SmolVLM2: The Smollest Video Model Ever 📺",
+                        description="Play with [SmolVLM2-2.2B-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct) in this demo. To get started, upload an image and text. This demo doesn't use history for the chat, so every chat you start is a new conversation.",
+                        textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", ".mp4"], file_count="multiple"), stop_btn="Stop Generation", multimodal=True,
+                        cache_examples=False,
+                        additional_inputs=[gr.Slider(minimum=100, maximum=500, step=50, value=200, label="Max Tokens")],
+                        type="messages"
+                        )
 
+demo.launch(debug=True, share=True)
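
Note: the `<image>`/`<video>` placeholder handling above splits the prompt on the tags and consumes the uploaded files in upload order. A minimal standalone sketch of that behaviour (the prompt text and file paths here are made up for illustration):

import re

text = "Compare <image> with <video> and describe the differences."
media_queue = [
    {"type": "image", "path": "photo.jpg"},  # hypothetical upload
    {"type": "video", "path": "clip.mp4"},   # hypothetical upload
]

user_content = []
for part in re.split(r'(<image>|<video>)', text):
    if part in ("<image>", "<video>") and media_queue:
        # Each placeholder consumes the next queued file.
        user_content.append(media_queue.pop(0))
    elif part.strip():
        user_content.append({"type": "text", "text": part.strip()})

# user_content now interleaves text chunks with the image and video entries,
# which is the per-turn structure passed to processor.apply_chat_template.
print(user_content)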
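Note: the new `elif len(history) > 0:` branch rebuilds the conversation from Gradio's `type="messages"` history, queuing files from tuple-valued user turns, interleaving them with text turns, and closing each exchange when an assistant turn is reached. A rough sketch of what it produces for a made-up one-exchange history (paths and wording are invented):

history = [
    {"role": "user", "content": ("/tmp/cat.png",)},               # file upload turn (tuple content)
    {"role": "user", "content": "<image> What animal is this?"},  # text turn with a placeholder
    {"role": "assistant", "content": "It looks like a cat."},
]

# Following the logic in the diff, the first pass queues the image and the
# second pass produces roughly:
resulting_messages = [
    {"role": "user", "content": [
        {"type": "image", "path": "/tmp/cat.png"},
        {"type": "text", "text": "What animal is this?"},
    ]},
    {"role": "assistant", "content": [
        {"type": "text", "text": "It looks like a cat."},
    ]},
]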
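Note: in both versions, output is streamed by running model.generate in a background thread and iterating a TextIteratorStreamer. A self-contained sketch of that pattern with a small text-only stand-in model (gpt2 is only a placeholder here; the Space uses the SmolVLM2 processor and model loaded above):

from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tok("The smallest video model ever is", return_tensors="pt")
streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)

# generate() blocks, so it runs in a worker thread while this loop drains the streamer.
thread = Thread(target=model.generate, kwargs=dict(inputs, streamer=streamer, max_new_tokens=40))
thread.start()

buffer = ""
for new_text in streamer:
    buffer += new_text  # in the Gradio handler, `yield buffer` streams the partial reply
    print(new_text, end="", flush=True)
thread.join()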