D0k-tor committed on
Commit
2d2629a
·
1 Parent(s): 9a282a8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +2 -6
app.py CHANGED
@@ -12,13 +12,10 @@ device='cpu'
12
  encoder_checkpoint = "nlpconnect/vit-gpt2-image-captioning"
13
  decoder_checkpoint = "nlpconnect/vit-gpt2-image-captioning"
14
  model_checkpoint = "nlpconnect/vit-gpt2-image-captioning"
15
- print("------------------------- 1 -------------------------\n")
16
  feature_extractor = ViTFeatureExtractor.from_pretrained(encoder_checkpoint)
17
- print("------------------------- 2 -------------------------\n")
18
  tokenizer = AutoTokenizer.from_pretrained(decoder_checkpoint)
19
- print("------------------------- 3 -------------------------\n")
20
  model = VisionEncoderDecoderModel.from_pretrained(model_checkpoint).to(device)
21
- print("------------------------- 4 -------------------------\n")
22
 
23
 
24
  def predict(image, max_length=64, num_beams=4):
@@ -29,7 +26,6 @@ def predict(image, max_length=64, num_beams=4):
29
  caption_text = clean_text(tokenizer.decode(caption_ids))
30
  return caption_text
31
 
32
- print("------------------------- 5 -------------------------\n")
33
  input = gr.inputs.Image(label="Upload any Image", type = 'pil', optional=True)
34
  output = gr.outputs.Textbox(type="text",label="Captions")
35
  examples = ["example1.jpg"]
@@ -62,7 +58,7 @@ with gr.Blocks() as demo:
62
  LoRA offers a groundbreaking approach by freezing the weights of pre-trained models and introducing trainable layers known as <b>rank-decomposition matrices in each transformer block</b>. This ingenious technique significantly reduces the number of trainable parameters and minimizes GPU memory requirements, as gradients no longer need to be computed for the majority of model weights.
63
  <br>
64
  <br>
65
- You can find more info here: <a href="https://www.linkedin.com/pulse/fine-tuning-image-to-text-algorithms-with-lora-daniel-puente-viejo" target="_blank" style="text-decoration: underline;>Linkedin article</a>
66
  </h2>
67
 
68
  </div>
 
12
  encoder_checkpoint = "nlpconnect/vit-gpt2-image-captioning"
13
  decoder_checkpoint = "nlpconnect/vit-gpt2-image-captioning"
14
  model_checkpoint = "nlpconnect/vit-gpt2-image-captioning"
15
+
16
  feature_extractor = ViTFeatureExtractor.from_pretrained(encoder_checkpoint)
 
17
  tokenizer = AutoTokenizer.from_pretrained(decoder_checkpoint)
 
18
  model = VisionEncoderDecoderModel.from_pretrained(model_checkpoint).to(device)
 
19
 
20
 
21
  def predict(image, max_length=64, num_beams=4):
 
26
  caption_text = clean_text(tokenizer.decode(caption_ids))
27
  return caption_text
28
 
 
29
  input = gr.inputs.Image(label="Upload any Image", type = 'pil', optional=True)
30
  output = gr.outputs.Textbox(type="text",label="Captions")
31
  examples = ["example1.jpg"]
 
58
  LoRA offers a groundbreaking approach by freezing the weights of pre-trained models and introducing trainable layers known as <b>rank-decomposition matrices in each transformer block</b>. This ingenious technique significantly reduces the number of trainable parameters and minimizes GPU memory requirements, as gradients no longer need to be computed for the majority of model weights.
59
  <br>
60
  <br>
61
+ You can find more info here: <a href="https://www.linkedin.com/pulse/fine-tuning-image-to-text-algorithms-with-lora-daniel-puente-viejo" target="_blank">Linkedin article</a>
62
  </h2>
63
 
64
  </div>