Update app.py
app.py CHANGED
@@ -12,13 +12,10 @@ device='cpu'
 encoder_checkpoint = "nlpconnect/vit-gpt2-image-captioning"
 decoder_checkpoint = "nlpconnect/vit-gpt2-image-captioning"
 model_checkpoint = "nlpconnect/vit-gpt2-image-captioning"
-
+
 feature_extractor = ViTFeatureExtractor.from_pretrained(encoder_checkpoint)
-print("------------------------- 2 -------------------------\n")
 tokenizer = AutoTokenizer.from_pretrained(decoder_checkpoint)
-print("------------------------- 3 -------------------------\n")
 model = VisionEncoderDecoderModel.from_pretrained(model_checkpoint).to(device)
-print("------------------------- 4 -------------------------\n")
 
 
 def predict(image, max_length=64, num_beams=4):
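The hunk above references names that are imported higher in app.py, outside this diff. A plausible reconstruction of that preamble, for reading convenience only (the exact form in the file may differ):

import gradio as gr
from transformers import AutoTokenizer, ViTFeatureExtractor, VisionEncoderDecoderModel

device = 'cpu'  # matches the context line in the hunk header

Note that recent transformers releases deprecate ViTFeatureExtractor in favor of ViTImageProcessor; both can load this checkpoint.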
@@ -29,7 +26,6 @@ def predict(image, max_length=64, num_beams=4):
     caption_text = clean_text(tokenizer.decode(caption_ids))
     return caption_text
 
-print("------------------------- 5 -------------------------\n")
 input = gr.inputs.Image(label="Upload any Image", type = 'pil', optional=True)
 output = gr.outputs.Textbox(type="text",label="Captions")
 examples = ["example1.jpg"]
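The unchanged middle of predict() (old lines 25-28) is elided by the diff. A minimal sketch of how a ViT-GPT2 captioning function of this shape is typically wired, with a hypothetical clean_text stand-in for the app's own helper, which the diff does not show:

def clean_text(text):
    # hypothetical helper: drop GPT-2's end-of-text marker and trim whitespace
    return text.replace('<|endoftext|>', '').strip()

def predict(image, max_length=64, num_beams=4):
    # preprocess the PIL image into the pixel tensor the ViT encoder expects
    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values.to(device)
    # beam-search decode a caption with the GPT-2 decoder
    caption_ids = model.generate(pixel_values, max_length=max_length, num_beams=num_beams)[0]
    caption_text = clean_text(tokenizer.decode(caption_ids))
    return caption_text

Separately, gr.inputs and gr.outputs were deprecated in Gradio 3.x and removed in later releases; on a current Gradio the equivalent components would be:

input = gr.Image(label="Upload any Image", type='pil')  # 'optional' no longer exists
output = gr.Textbox(label="Captions")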
@@ -62,7 +58,7 @@ with gr.Blocks() as demo:
 LoRA offers a groundbreaking approach by freezing the weights of pre-trained models and introducing trainable layers known as <b>rank-decomposition matrices in each transformer block</b>. This ingenious technique significantly reduces the number of trainable parameters and minimizes GPU memory requirements, as gradients no longer need to be computed for the majority of model weights.
 <br>
 <br>
-You can find more info here: <a href="https://www.linkedin.com/pulse/fine-tuning-image-to-text-algorithms-with-lora-daniel-puente-viejo" target="_blank"
+You can find more info here: <a href="https://www.linkedin.com/pulse/fine-tuning-image-to-text-algorithms-with-lora-daniel-puente-viejo" target="_blank";>Linkedin article</a>
 </h2>
 
 </div>
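The demo text in this hunk describes LoRA's core mechanic: freeze the pretrained weights and train small rank-decomposition matrices instead. A minimal PyTorch sketch of that idea, illustrative only and not any particular library's implementation:

import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    """Wrap a frozen pretrained nn.Linear with a trainable low-rank update."""
    def __init__(self, base: nn.Linear, r: int = 8, alpha: int = 16):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad_(False)  # freeze the pretrained weights
        self.A = nn.Parameter(torch.randn(r, base.in_features) * 0.01)
        self.B = nn.Parameter(torch.zeros(base.out_features, r))  # zero init: no change at start
        self.scale = alpha / r

    def forward(self, x):
        # frozen path plus scaled low-rank update: W x + (alpha/r) * B A x
        return self.base(x) + self.scale * (x @ self.A.T @ self.B.T)

Wrapping, say, nn.Linear(768, 768) this way leaves the 768x768 weight frozen and trains only the 2*r*768 low-rank parameters, which is where the savings in trainable parameters and gradient memory that the text mentions come from.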