D0k-tor committed
Commit dcdb448 · 1 Parent(s): 0730b6e

Update app.py

Files changed (1)
  1. app.py +18 -50
app.py CHANGED
@@ -1,54 +1,8 @@
-# import gradio as gr
-# import streamlit as st
-# import torch
-# import re
-# from transformers import AutoTokenizer, ViTFeatureExtractor, VisionEncoderDecoderModel
-
-# device='cpu'
-# encoder_checkpoint = "ydshieh/vit-gpt2-coco-en"
-# decoder_checkpoint = "ydshieh/vit-gpt2-coco-en"
-# model_checkpoint = "ydshieh/vit-gpt2-coco-eng"
-# feature_extractor = ViTFeatureExtractor.from_pretrained(encoder_checkpoint)
-# tokenizer = AutoTokenizer.from_pretrained(decoder_checkpoint)
-# model = VisionEncoderDecoderModel.from_pretrained(model_checkpoint).to(device)
-
-# def predict(image,max_length=64, num_beams=4):
-# input_image = Image.open(image)
-# model.eval()
-# pixel_values = feature_extractor(images=[input_image], return_tensors="pt").pixel_values
-# with torch.no_grad():
-# output_ids = model.generate(pixel_values, max_length=16, num_beams=4, return_dict_in_generate=True).sequences
-# preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-# preds = [pred.strip() for pred in preds]
-# return preds[0]
-
-# # image = image.convert('RGB')
-# # image = feature_extractor(image, return_tensors="pt").pixel_values.to(device)
-# # clean_text = lambda x: x.replace('<|endoftext|>','').split('\n')[0]
-# # caption_ids = model.generate(image, max_length = max_length)[0]
-# # caption_text = clean_text(tokenizer.decode(caption_ids))
-# # return caption_text
-
-# # st.title("Image to Text using Lora")
-
-# inputs = gr.inputs.Image(label="Upload any Image", type = 'pil', optional=True)
-# output = gr.outputs.Textbox(type="text",label="Captions")
-# description = "NTT Data Bilbao team"
-# title = "Image to Text using Lora"
-
-# interface = gr.Interface(
-# fn=predict,
-# description=description,
-# inputs = inputs,
-# theme="grass",
-# outputs=output,
-# title=title,
-# )
-# interface.launch(debug=True)
-
 import torch
 import re
 import gradio as gr
+import streamlit as st
+# st.title("Image Caption Generator")
 from transformers import AutoTokenizer, ViTFeatureExtractor, VisionEncoderDecoderModel
 import os
 import tensorflow as tf
@@ -75,13 +29,27 @@ def predict(image,max_length=64, num_beams=4):
 caption_text = clean_text(tokenizer.decode(caption_ids))
 return caption_text
 
+with gr.Blocks() as demo:
 
+gr.HTML(
+"""
+<div style="text-align: center; max-width: 1200px; margin: 20px auto;">
+<h1 style="font-weight: 900; font-size: 3rem; margin: 0rem">
+Image-to-Text with [Lora](https://huggingface.co/blog/lora) and Vit
+</h1>
+<h2 style="text-align: left; font-weight: 450; font-size: 1rem; margin-top: 0.5rem; margin-bottom: 0.5rem">
+We propose <b>TextDiffuser</b>, a flexible and controllable framework to generate images with visually appealing text that is coherent with backgrounds.
+Main features include: (a) <b><font color="#A52A2A">Text-to-Image</font></b>: The user provides a prompt and encloses the keywords with single quotes (e.g., a text image of ‘hello’). The model first determines the layout of the keywords and then draws the image based on the layout and prompt. (b) <b><font color="#A52A2A">Text-to-Image with Templates</font></b>: The user provides a prompt and a template image containing text, which can be a printed, handwritten, or scene text image. These template images can be used to determine the layout of the characters. (c) <b><font color="#A52A2A">Text Inpainting</font></b>: The user provides an image and specifies the region to be modified along with the desired text content. The model is able to modify the original text or add text to areas without text.
+</h2>
+</div>
+""")
+
 print("------------------------- 5 -------------------------\n")
 input = gr.inputs.Image(label="Upload any Image", type = 'pil', optional=True)
 output = gr.outputs.Textbox(type="text",label="Captions")
 examples = ["example1.jpg"]
 print("------------------------- 6 -------------------------\n")
-title = "Image Captioning "
+# title = "Image to - Text"
 description = "NTT Data"
 interface = gr.Interface(
 
@@ -91,6 +59,6 @@ interface = gr.Interface(
 theme="grass",
 outputs=output,
 examples = examples,
-title=title,
+# title=title,
 )
 interface.launch(debug=True)
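
For reference, the pipeline that both the removed commented-out block and the surviving predict function drive is a ViT encoder plus GPT-2 decoder captioner. A minimal standalone sketch, assuming the ydshieh/vit-gpt2-coco-en checkpoint named in the old comments (the "...-coco-eng" variant there looks like a typo) and a local example1.jpg:

import torch
from PIL import Image
from transformers import AutoTokenizer, ViTFeatureExtractor, VisionEncoderDecoderModel

# Checkpoint taken from the commented-out code above; an assumption, not part of this commit.
checkpoint = "ydshieh/vit-gpt2-coco-en"
feature_extractor = ViTFeatureExtractor.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = VisionEncoderDecoderModel.from_pretrained(checkpoint).eval()

def predict(image, max_length=64, num_beams=4):
    # Convert the PIL image to pixel values, then beam-search a caption.
    pixel_values = feature_extractor(images=[image.convert("RGB")], return_tensors="pt").pixel_values
    with torch.no_grad():
        output_ids = model.generate(pixel_values, max_length=max_length, num_beams=num_beams)
    return tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()

print(predict(Image.open("example1.jpg")))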
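
Note that the added with gr.Blocks() as demo: block only renders the HTML header and is never launched; the script still builds and launches the separate interface, so the header will not appear in the running app. One possible way to combine the two, sketched here as an assumption (not what the commit does) and reusing the existing predict function:

import gradio as gr

with gr.Blocks() as demo:
    # Header rendered as raw HTML; Markdown-style links such as [Lora](...) are not
    # converted inside gr.HTML, so a plain <a> tag is used here instead.
    gr.HTML('<h1 style="text-align: center">Image-to-Text with <a href="https://huggingface.co/blog/lora">LoRA</a> and ViT</h1>')
    image_in = gr.Image(label="Upload any Image", type="pil")
    caption_out = gr.Textbox(label="Captions")
    gr.Button("Generate caption").click(fn=predict, inputs=image_in, outputs=caption_out)

demo.launch(debug=True)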
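
Separately, gr.inputs.Image, gr.outputs.Textbox, the optional=True argument, and theme="grass" are Gradio 2.x-style APIs that have been removed in recent Gradio releases. On a current Gradio the same Interface would look roughly like the following sketch (again an assumption, not part of the commit):

import gradio as gr

interface = gr.Interface(
    fn=predict,  # existing captioning function from app.py
    inputs=gr.Image(label="Upload any Image", type="pil"),
    outputs=gr.Textbox(label="Captions"),
    examples=["example1.jpg"],
    description="NTT Data",
)
interface.launch(debug=True)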