echarlaix (HF Staff) committed
Commit bc0f349 · 1 Parent(s): b6f5018
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.jpg filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,8 +1,8 @@
  ---
  title: Vision Langage Openvino
- emoji: 👀
- colorFrom: yellow
- colorTo: pink
+ emoji: 📊
+ colorFrom: blue
+ colorTo: green
  sdk: gradio
  sdk_version: 5.35.0
  app_file: app.py
app.py ADDED
@@ -0,0 +1,139 @@
+ import gradio as gr
+ from transformers import AutoProcessor, TextIteratorStreamer
+ from threading import Thread
+ import re
+ import time
+
+ from optimum.intel import OVModelForVisualCausalLM
+
+
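+ # Pick the model export: the larger SmolVLM2-2.2B variant is commented out
+ # in favor of the 256M checkpoint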
+ # model_id = "echarlaix/SmolVLM2-2.2B-Instruct-openvino"
+ model_id = "echarlaix/SmolVLM-256M-Instruct-openvino"
+
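+ # Load the processor and the OpenVINO-exported vision-language model from the Hub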
+ processor = AutoProcessor.from_pretrained(model_id)
+ model = OVModelForVisualCausalLM.from_pretrained(model_id)
+
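+ # Assemble the chat messages from the current input (plus any history),
+ # run generation in a background thread, and stream the reply.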
+ def model_inference(input_dict, history, max_tokens):
+     text = input_dict["text"]
+     files = input_dict.get("files", [])
+     user_content = []
+     media_queue = []
+     if history == []:
+         text = input_dict["text"].strip()
+
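+         # Queue uploaded files as image or video entries, keeping upload order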
+         for file in files:
+             if file.endswith((".png", ".jpg", ".jpeg", ".gif", ".bmp")):
+                 media_queue.append({"type": "image", "path": file})
+             elif file.endswith((".mp4", ".mov", ".avi", ".mkv", ".flv")):
+                 media_queue.append({"type": "video", "path": file})
+
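+         # When the text contains <image>/<video> placeholders, interleave media at those positions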
+         if "<image>" in text or "<video>" in text:
+             parts = re.split(r'(<image>|<video>)', text)
+             for part in parts:
+                 if part == "<image>" and media_queue:
+                     user_content.append(media_queue.pop(0))
+                 elif part == "<video>" and media_queue:
+                     user_content.append(media_queue.pop(0))
+                 elif part.strip():
+                     user_content.append({"type": "text", "text": part.strip()})
+         else:
+             user_content.append({"type": "text", "text": text})
+
+             for media in media_queue:
+                 user_content.append(media)
+
+         resulting_messages = [{"role": "user", "content": user_content}]
+
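+     # With prior history, rebuild the whole conversation: first collect media from past turns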
+     elif len(history) > 0:
+         resulting_messages = []
+         user_content = []
+         media_queue = []
+         for hist in history:
+             if hist["role"] == "user" and isinstance(hist["content"], tuple):
+                 file_name = hist["content"][0]
+                 if file_name.endswith((".png", ".jpg", ".jpeg")):
+                     media_queue.append({"type": "image", "path": file_name})
+                 elif file_name.endswith(".mp4"):
+                     media_queue.append({"type": "video", "path": file_name})
+
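+         # Then walk the turns again, pairing user text (and its media) with assistant replies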
+         for hist in history:
+             if hist["role"] == "user" and isinstance(hist["content"], str):
+                 text = hist["content"]
+                 parts = re.split(r'(<image>|<video>)', text)
+
+                 for part in parts:
+                     if part == "<image>" and media_queue:
+                         user_content.append(media_queue.pop(0))
+                     elif part == "<video>" and media_queue:
+                         user_content.append(media_queue.pop(0))
+                     elif part.strip():
+                         user_content.append({"type": "text", "text": part.strip()})
+
+             elif hist["role"] == "assistant":
+                 resulting_messages.append({
+                     "role": "user",
+                     "content": user_content
+                 })
+                 resulting_messages.append({
+                     "role": "assistant",
+                     "content": [{"type": "text", "text": hist["content"]}]
+                 })
+                 user_content = []
+
+     if text == "" and not files:
+         raise gr.Error("Please input a query and optionally image(s).")
+
+     if text == "" and files:
+         raise gr.Error("Please input a text query along with the image(s).")
+
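+     # Log the assembled conversation and render it into model inputs via the chat template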
+     print("resulting_messages", resulting_messages)
+     inputs = processor.apply_chat_template(
+         resulting_messages,
+         add_generation_prompt=True,
+         tokenize=True,
+         return_dict=True,
+         return_tensors="pt",
+     )
+
+     # Generate
+     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+     generation_args = dict(inputs, streamer=streamer, max_new_tokens=max_tokens)
+
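+     # model.generate runs on a worker thread; the streamer yields text chunks as they decode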
+     thread = Thread(target=model.generate, kwargs=generation_args)
+     thread.start()
+
+     yield "..."
+     buffer = ""
+
+     for new_text in streamer:
+         buffer += new_text
+         time.sleep(0.01)
+         yield buffer
+
+
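+ # Example prompts; the <image> tags in the second example map to the two attached files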
+ examples = [
+     [{"text": "Where do the severe droughts happen according to this diagram?", "files": ["example_images/examples_weather_events.png"]}],
+     [{"text": "Which art era do this artpiece <image> and this artpiece <image> belong to?", "files": ["example_images/rococo.jpg", "example_images/rococo_1.jpg"]}],
+     [{"text": "Describe this image.", "files": ["example_images/mosque.jpg"]}],
+     [{"text": "When was this purchase made and how much did it cost?", "files": ["example_images/fiche.jpg"]}],
+     [{"text": "What is the date in this document?", "files": ["example_images/document.jpg"]}],
+     [{"text": "What is happening in the video?", "files": ["example_images/short.mp4"]}],
+ ]
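+ # Streaming multimodal chat UI around model_inference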
+ demo = gr.ChatInterface(
+     fn=model_inference,
+     title="SmolVLM2: The Smollest Video Model Ever 📺",
+     description="Play with [SmolVLM2-2.2B-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct) in this demo. To get started, upload an image and text or try one of the examples. This demo doesn't use history for the chat, so every chat you start is a new conversation.",
+     examples=examples,
+     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", ".mp4"], file_count="multiple"),
+     stop_btn="Stop Generation",
+     multimodal=True,
+     cache_examples=False,
+     additional_inputs=[gr.Slider(minimum=100, maximum=500, step=50, value=200, label="Max Tokens")],
+     type="messages",
+ )
+
+
+ demo.launch(debug=True)
example_images/campeones.jpg ADDED

Git LFS Details

  • SHA256: 0bb7318e890a7527f3c900531850d3f3b4786c6ae2c43939970e6884553e57ba
  • Pointer size: 131 Bytes
  • Size of remote file: 870 kB
example_images/document.jpg ADDED

Git LFS Details

  • SHA256: b1370554160136244d8aae0a75b6fa1e1dc4fd17a2834470c11578310fc6bbe3
  • Pointer size: 131 Bytes
  • Size of remote file: 171 kB
example_images/dogs.jpg ADDED

Git LFS Details

  • SHA256: f651e3c654f74995a96c2da477670e1574e65c51e29c54c90937f37f9d91ce67
  • Pointer size: 130 Bytes
  • Size of remote file: 91.2 kB
example_images/examples_invoice.png ADDED

Git LFS Details

  • SHA256: 8964e903fe124c791f52992df1046aca5c298b0128c1b93bd03465faa7a00ac2
  • Pointer size: 130 Bytes
  • Size of remote file: 50 kB
example_images/examples_weather_events.png ADDED

Git LFS Details

  • SHA256: 443e28cba26ab4a08e2d4bcc311129c5818608ff8d4976c444bfcdd9918225ca
  • Pointer size: 131 Bytes
  • Size of remote file: 310 kB
example_images/fiche.jpg ADDED

Git LFS Details

  • SHA256: c74cb48d470d058eea7aa79e699bfe542482c7c233b384556b66b111e2e41708
  • Pointer size: 132 Bytes
  • Size of remote file: 1.76 MB
example_images/math.jpg ADDED

Git LFS Details

  • SHA256: 1367078c5125dc80bd47805ef223658cc5df6fdc791ad310e0483ff86567080d
  • Pointer size: 130 Bytes
  • Size of remote file: 16.2 kB
example_images/mosque.jpg ADDED

Git LFS Details

  • SHA256: 3268d8a33260bde4b7731f0c1d5bdfe3aa6d2236eed99fa7047ea844c7af1806
  • Pointer size: 131 Bytes
  • Size of remote file: 944 kB
example_images/newyork.jpg ADDED

Git LFS Details

  • SHA256: 8eec301d6c61741bd4041965788f2f9efda12722f3672d1605d4c01055405c8a
  • Pointer size: 131 Bytes
  • Size of remote file: 557 kB
example_images/rococo.jpg ADDED

Git LFS Details

  • SHA256: a2ef3caa7c1214cb194d7b4cf6665e5f51617a0b15aa0c87c4db9d06b44f0241
  • Pointer size: 132 Bytes
  • Size of remote file: 1.03 MB
example_images/rococo_1.jpg ADDED

Git LFS Details

  • SHA256: cac67d79c545d03a2813fd717670469d394e8757d5090c45e536e3deee7ab791
  • Pointer size: 131 Bytes
  • Size of remote file: 849 kB
example_images/s2w_example.png ADDED

Git LFS Details

  • SHA256: 02fa71fb0761ccf26860a0476f7b66ff518421e4624e9132bb6654604eb8a0b6
  • Pointer size: 130 Bytes
  • Size of remote file: 82.8 kB
example_images/short.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3df3c33fe94db5e7406f86e7d94c4ec8804124167f772912ce58dff17dc79440
+ size 4028661
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ huggingface_hub
+ gradio
+ optimum-intel[openvino]==1.24
+ nncf