Commit 7a5b32d by jcheng5
Parent: c89cb3e

Try open source model

Files changed (2):
  1. requirements.txt +1 -0
  2. videoinput/query.py +28 -11
requirements.txt CHANGED
@@ -11,3 +11,4 @@ requests
 torch
 transformers
 pillow
+requests
videoinput/query.py CHANGED
@@ -2,6 +2,17 @@ from typing import Callable, Optional
 
 import dotenv
 from openai import AsyncOpenAI
+import requests
+import torch
+from PIL import Image
+from io import BytesIO
+
+from transformers import AutoProcessor, AutoModelForVision2Seq
+from transformers.image_utils import load_image
+
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b")
+model = AutoModelForVision2Seq.from_pretrained("HuggingFaceM4/idefics2-8b").to(DEVICE)
 
 from .input import decode_input
 from .utils import NamedTemporaryFile, file_to_data_uri, timed
@@ -71,20 +82,14 @@ async def process_video(
     images = [file_to_data_uri(filename, "image/jpeg") for filename in input.images]
 
     callback("Querying")
-    response = await client.chat.completions.create(
-        model="gpt-4o",
-        messages=[
+
+    prompt = processor.apply_chat_template(
+        [
             {
                 "role": "user",
                 "content": [
                     {"type": "text", "text": transcription.text},
-                    *[
-                        {
-                            "type": "image_url",
-                            "image_url": {"url": image, "detail": "auto"},
-                        }
-                        for image in images
-                    ],
+                    *[{"type": "image"} for image in images],
                 ],
             },
             {
@@ -97,13 +102,25 @@ async def process_video(
                 ],
             },
         ],
+        add_generation_prompt=True,
+    )
+    inputs = processor(
+        text=prompt,
+        images=[load_image(image) for image in images],
+        return_tensors="pt",
+    )
+    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
+    generated_ids = model.generate(**inputs, max_new_tokens=500)
+    generated_texts = processor.batch_decode(
+        generated_ids, skip_special_tokens=True
     )
+    print("".join(generated_texts))
 
     callback("Converting to speech")
     audio = await client.audio.speech.create(
         model="tts-1",
         voice="nova",
-        input=response.choices[0].message.content,
+        input="".join(generated_texts),
         response_format="mp3",
     )
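
For reference, the idefics2 inference path introduced in this commit can be exercised outside the app. The sketch below is a minimal standalone version under stated assumptions: it uses the same HuggingFaceM4/idefics2-8b checkpoint and the same apply_chat_template → processor → generate → batch_decode flow as the diff, but the prompt text and image URL are placeholders, not values from the commit.

import torch
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image

# Same checkpoint and device selection as the diff.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b")
model = AutoModelForVision2Seq.from_pretrained("HuggingFaceM4/idefics2-8b").to(DEVICE)

# One user turn: a text part plus one {"type": "image"} placeholder per image,
# mirroring the message structure built in process_video.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe what is happening in this frame."},
            {"type": "image"},
        ],
    },
]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)

# load_image accepts a URL, local path, or data URI; the diff feeds it the JPEG
# data URIs produced by file_to_data_uri. The URL here is a placeholder.
image = load_image("https://example.com/frame.jpg")

inputs = processor(text=prompt, images=[image], return_tensors="pt")
inputs = {k: v.to(DEVICE) for k, v in inputs.items()}

generated_ids = model.generate(**inputs, max_new_tokens=500)
generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
print("".join(generated_texts))

One thing worth noting: model.generate returns the prompt tokens together with the newly generated ones, so the decoded string includes the rendered prompt as well as the model's answer, both here and in the diff's "".join(generated_texts) that is passed on to tts-1.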