Ngaima Sandiman
committed on
Commit
·
9cc3964
1
Parent(s):
749932e
Changed the transformers version to fix issues.
Browse files
- requirements.txt +5 -4
- src/model/modules/imagecraft.py +3 -8
- src/model/modules/imagecraftprocessor.py +0 -6
- src/utils/model_utils.py +1 -36
requirements.txt
CHANGED
@@ -1,14 +1,15 @@
|
|
1 |
-e git+https://github.com/facebookresearch/audiocraft.git@f83babff6b5e97f75562127c4cc8122229c8f099#egg=audiocraft
|
2 |
phonemizer
|
|
|
|
|
|
|
|
|
|
|
3 |
spaces
|
4 |
huggingface-hub
|
5 |
num2words
|
6 |
-
transformers
|
7 |
numpy
|
8 |
pillow
|
9 |
safetensors
|
10 |
tokenizers
|
11 |
-
torch==2.1.0
|
12 |
-
torchaudio
|
13 |
-
torchvision
|
14 |
aeneas
|
|
|
1 |
-e git+https://github.com/facebookresearch/audiocraft.git@f83babff6b5e97f75562127c4cc8122229c8f099#egg=audiocraft
|
2 |
phonemizer
|
3 |
+
transformers==4.43.1
|
4 |
+
torch==2.1.1
|
5 |
+
numpy==2.0.1
|
6 |
+
torchaudio
|
7 |
+
torchvision
|
8 |
spaces
|
9 |
huggingface-hub
|
10 |
num2words
|
|
|
11 |
numpy
|
12 |
pillow
|
13 |
safetensors
|
14 |
tokenizers
|
|
|
|
|
|
|
15 |
aeneas
|
src/model/modules/imagecraft.py
CHANGED
@@ -405,15 +405,10 @@ class ImageCraft(nn.Module):
|
|
405 |
max_tokens=30,
|
406 |
do_sample=False,
|
407 |
output_type="file",
|
408 |
-
return_output="speech",
|
409 |
):
|
410 |
-
|
411 |
-
|
412 |
-
|
413 |
-
return transcript, speech
|
414 |
-
else:
|
415 |
-
transcript = self._generate_caption(image, max_tokens, do_sample)
|
416 |
-
return transcript
|
417 |
|
418 |
@classmethod
|
419 |
def from_pretrained(
|
|
|
405 |
max_tokens=30,
|
406 |
do_sample=False,
|
407 |
output_type="file",
|
|
|
408 |
):
|
409 |
+
transcript = self._generate_caption(image, max_tokens, do_sample)
|
410 |
+
speech = self._generate_speech(transcript, output_type)
|
411 |
+
return transcript, speech
|
|
|
|
|
|
|
|
|
412 |
|
413 |
@classmethod
|
414 |
def from_pretrained(
|
src/model/modules/imagecraftprocessor.py
CHANGED
@@ -40,9 +40,6 @@ class ImageCraftProcessor:
|
|
40 |
tokenizer.add_eos_token = False
|
41 |
|
42 |
self.tokenizer = tokenizer
|
43 |
-
# self.image_processor = SiglipImageProcessor.from_pretrained(
|
44 |
-
# "google/siglip-so400m-patch14-384"
|
45 |
-
# )
|
46 |
|
47 |
def __call__(
|
48 |
self,
|
@@ -55,9 +52,6 @@ class ImageCraftProcessor:
|
|
55 |
len(images) == 1 and len(text) == 1
|
56 |
), f"Received {len(images)} images for {len(text)} prompts."
|
57 |
|
58 |
-
# pixel_values = self.image_processor(images=images, return_tensors="pt")[
|
59 |
-
# "pixel_values"
|
60 |
-
# ]
|
61 |
pixel_values = process_images(
|
62 |
images,
|
63 |
size=(self.image_size, self.image_size),
|
|
|
40 |
tokenizer.add_eos_token = False
|
41 |
|
42 |
self.tokenizer = tokenizer
|
|
|
|
|
|
|
43 |
|
44 |
def __call__(
|
45 |
self,
|
|
|
52 |
len(images) == 1 and len(text) == 1
|
53 |
), f"Received {len(images)} images for {len(text)} prompts."
|
54 |
|
|
|
|
|
|
|
55 |
pixel_values = process_images(
|
56 |
images,
|
57 |
size=(self.image_size, self.image_size),
|
src/utils/model_utils.py
CHANGED
@@ -19,13 +19,11 @@ def get_model_inputs(
|
|
19 |
processor: ImageCraftProcessor,
|
20 |
prompt: str,
|
21 |
image: Image,
|
22 |
-
suffix: Optional[str] = None,
|
23 |
device: str = "cuda",
|
24 |
):
|
25 |
images = [image]
|
26 |
prompts = [prompt]
|
27 |
-
|
28 |
-
suffix = [suffix]
|
29 |
model_inputs = processor(text=prompts, images=images)
|
30 |
model_inputs = move_inputs_to_device(model_inputs, device)
|
31 |
return model_inputs
|
@@ -38,36 +36,3 @@ def get_config(config_file="config.json"):
|
|
38 |
config = ImageCraftConfig(**model_config_file)
|
39 |
|
40 |
return config
|
41 |
-
|
42 |
-
|
43 |
-
# def load_hf_model(model_path: str, device: str) -> Tuple[ImageCraft, AutoTokenizer]:
|
44 |
-
|
45 |
-
# # Load the tokenizer
|
46 |
-
# tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side="right")
|
47 |
-
# assert tokenizer.padding_side == "right"
|
48 |
-
|
49 |
-
# # Find all the *.safetensors files
|
50 |
-
# safetensors_files = glob.glob(os.path.join(model_path, "*.safetensors"))
|
51 |
-
|
52 |
-
# # ... and load them one by one in the tensors dictionary
|
53 |
-
# tensors = {}
|
54 |
-
# for safetensors_file in safetensors_files:
|
55 |
-
# with safe_open(safetensors_file, framework="pt", device="cpu") as f:
|
56 |
-
# for key in f.keys():
|
57 |
-
# tensors[key] = f.get_tensor(key)
|
58 |
-
|
59 |
-
# # Load the model's config
|
60 |
-
# with open(os.path.join(model_path, "config.json"), "r") as f:
|
61 |
-
# model_config_file = json.load(f)
|
62 |
-
# config = ImageCraftConfig(**model_config_file)
|
63 |
-
|
64 |
-
# # Create the model using the configuration
|
65 |
-
# model = ImageCraft(config).to(device)
|
66 |
-
|
67 |
-
# # Load the state dict of the model
|
68 |
-
# model.load_state_dict(tensors, strict=False)
|
69 |
-
|
70 |
-
# # Tie weights
|
71 |
-
# model.tie_weights()
|
72 |
-
|
73 |
-
# return (model, tokenizer)
|
|
|
19 |
processor: ImageCraftProcessor,
|
20 |
prompt: str,
|
21 |
image: Image,
|
|
|
22 |
device: str = "cuda",
|
23 |
):
|
24 |
images = [image]
|
25 |
prompts = [prompt]
|
26 |
+
|
|
|
27 |
model_inputs = processor(text=prompts, images=images)
|
28 |
model_inputs = move_inputs_to_device(model_inputs, device)
|
29 |
return model_inputs
|
|
|
36 |
config = ImageCraftConfig(**model_config_file)
|
37 |
|
38 |
return config
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|