Try open source model
- requirements.txt +1 -0
- videoinput/query.py +28 -11
requirements.txt CHANGED

@@ -11,3 +11,4 @@ requests
 torch
 transformers
 pillow
+requests
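query.py now builds the processor and model at import time (see the diff below), so every package it imports has to resolve as soon as the Space starts; requests was the only one not already pinned. As a hypothetical sanity check (not part of this commit), the new dependencies can be exercised directly:

# Hypothetical smoke check (not part of this commit): confirm the packages that
# the new module-level imports in videoinput/query.py rely on are installed.
import importlib

for name in ("requests", "torch", "transformers", "PIL"):
    importlib.import_module(name)  # raises ImportError if a dependency is missing
print("all dependencies import cleanly")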
videoinput/query.py CHANGED

@@ -2,6 +2,17 @@ from typing import Callable, Optional
 
 import dotenv
 from openai import AsyncOpenAI
+import requests
+import torch
+from PIL import Image
+from io import BytesIO
+
+from transformers import AutoProcessor, AutoModelForVision2Seq
+from transformers.image_utils import load_image
+
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b")
+model = AutoModelForVision2Seq.from_pretrained("HuggingFaceM4/idefics2-8b").to(DEVICE)
 
 from .input import decode_input
 from .utils import NamedTemporaryFile, file_to_data_uri, timed
@@ -71,20 +82,14 @@ async def process_video(
     images = [file_to_data_uri(filename, "image/jpeg") for filename in input.images]
 
     callback("Querying")
-
-
-
+
+    prompt = processor.apply_chat_template(
+        [
             {
                 "role": "user",
                 "content": [
                     {"type": "text", "text": transcription.text},
-                    *[
-                        {
-                            "type": "image_url",
-                            "image_url": {"url": image, "detail": "auto"},
-                        }
-                        for image in images
-                    ],
+                    *[{"type": "image"} for image in images],
                 ],
             },
             {
@@ -97,13 +102,25 @@ async def process_video(
             ],
         },
     ],
+        add_generation_prompt=True,
+    )
+    inputs = processor(
+        text=prompt,
+        images=[load_image(image) for image in images],
+        return_tensors="pt",
+    )
+    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
+    generated_ids = model.generate(**inputs, max_new_tokens=500)
+    generated_texts = processor.batch_decode(
+        generated_ids, skip_special_tokens=True
     )
+    print("".join(generated_texts))
 
     callback("Converting to speech")
     audio = await client.audio.speech.create(
         model="tts-1",
         voice="nova",
-        input=
+        input="".join(generated_texts),
         response_format="mp3",
     )
 
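For context, the added code closely mirrors the published usage example for HuggingFaceM4/idefics2-8b: build a chat message with one {"type": "image"} placeholder per frame, render it with apply_chat_template, run the processor over the prompt text plus the images, generate, and batch_decode. The sketch below restates that flow as a standalone script so it can be tried outside the Space. Two details are assumptions of the sketch rather than part of this commit: the checkpoint is loaded in float16 when a GPU is available to roughly halve memory (the commit keeps the default dtype), and the echoed prompt tokens are sliced off before decoding, since batch_decode over the full generated sequence returns the prompt together with the answer. The image URL and question are placeholders.

# Standalone sketch of the Idefics2 flow added in videoinput/query.py.
# Assumptions (not in the commit): float16 weights on GPU, slicing off the
# prompt before decoding, and the placeholder image URL / question below.
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b")
model = AutoModelForVision2Seq.from_pretrained(
    "HuggingFaceM4/idefics2-8b",
    torch_dtype=torch.float16 if DEVICE.type == "cuda" else torch.float32,
).to(DEVICE)

# One {"type": "image"} placeholder per image, in the same order as the
# images handed to the processor.
images = [
    load_image(
        "https://cdn.britannica.com/61/93061-050-99147DCE/"
        "Statue-of-Liberty-Island-New-York-Bay.jpg"  # example image from the model card
    )
]
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What is shown in this frame?"},
            *[{"type": "image"} for _ in images],
        ],
    },
]

prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=images, return_tensors="pt")
inputs = {k: v.to(DEVICE) for k, v in inputs.items()}

generated_ids = model.generate(**inputs, max_new_tokens=500)

# generate() returns the prompt tokens followed by the new tokens; decode only
# the new ones so the answer is not prefixed with the rendered chat template.
new_tokens = generated_ids[:, inputs["input_ids"].shape[1]:]
answer = processor.batch_decode(new_tokens, skip_special_tokens=True)[0]
print(answer)

In the commit itself the images are data URIs produced by file_to_data_uri rather than URLs, and the joined decode output (prompt echo included) is what gets passed to client.audio.speech.create as the tts-1 input.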