Spaces:
Runtime error
Update app.py
app.py CHANGED
@@ -1,54 +1,8 @@
-# import gradio as gr
-# import streamlit as st
-# import torch
-# import re
-# from transformers import AutoTokenizer, ViTFeatureExtractor, VisionEncoderDecoderModel
-
-# device='cpu'
-# encoder_checkpoint = "ydshieh/vit-gpt2-coco-en"
-# decoder_checkpoint = "ydshieh/vit-gpt2-coco-en"
-# model_checkpoint = "ydshieh/vit-gpt2-coco-eng"
-# feature_extractor = ViTFeatureExtractor.from_pretrained(encoder_checkpoint)
-# tokenizer = AutoTokenizer.from_pretrained(decoder_checkpoint)
-# model = VisionEncoderDecoderModel.from_pretrained(model_checkpoint).to(device)
-
-# def predict(image,max_length=64, num_beams=4):
-#     input_image = Image.open(image)
-#     model.eval()
-#     pixel_values = feature_extractor(images=[input_image], return_tensors="pt").pixel_values
-#     with torch.no_grad():
-#         output_ids = model.generate(pixel_values, max_length=16, num_beams=4, return_dict_in_generate=True).sequences
-#     preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-#     preds = [pred.strip() for pred in preds]
-#     return preds[0]
-
-# #     image = image.convert('RGB')
-# #     image = feature_extractor(image, return_tensors="pt").pixel_values.to(device)
-# #     clean_text = lambda x: x.replace('<|endoftext|>','').split('\n')[0]
-# #     caption_ids = model.generate(image, max_length = max_length)[0]
-# #     caption_text = clean_text(tokenizer.decode(caption_ids))
-# #     return caption_text
-
-# # st.title("Image to Text using Lora")
-
-# inputs = gr.inputs.Image(label="Upload any Image", type = 'pil', optional=True)
-# output = gr.outputs.Textbox(type="text",label="Captions")
-# description = "NTT Data Bilbao team"
-# title = "Image to Text using Lora"
-
-# interface = gr.Interface(
-#     fn=predict,
-#     description=description,
-#     inputs = inputs,
-#     theme="grass",
-#     outputs=output,
-#     title=title,
-# )
-# interface.launch(debug=True)
-
 import torch
 import re
 import gradio as gr
+import streamlit as st
+# st.title("Image Caption Generator")
 from transformers import AutoTokenizer, ViTFeatureExtractor, VisionEncoderDecoderModel
 import os
 import tensorflow as tf
@@ -75,13 +29,27 @@ def predict(image,max_length=64, num_beams=4):
     caption_text = clean_text(tokenizer.decode(caption_ids))
     return caption_text
 
+with gr.Blocks() as demo:
 
+    gr.HTML(
+        """
+        <div style="text-align: center; max-width: 1200px; margin: 20px auto;">
+        <h1 style="font-weight: 900; font-size: 3rem; margin: 0rem">
+            Image-to-Text with [Lora](https://huggingface.co/blog/lora) and Vit
+        </h1>
+        <h2 style="text-align: left; font-weight: 450; font-size: 1rem; margin-top: 0.5rem; margin-bottom: 0.5rem">
+        We propose <b>TextDiffuser</b>, a flexible and controllable framework to generate images with visually appealing text that is coherent with backgrounds.
+        Main features include: (a) <b><font color="#A52A2A">Text-to-Image</font></b>: The user provides a prompt and encloses the keywords with single quotes (e.g., a text image of ‘hello’). The model first determines the layout of the keywords and then draws the image based on the layout and prompt. (b) <b><font color="#A52A2A">Text-to-Image with Templates</font></b>: The user provides a prompt and a template image containing text, which can be a printed, handwritten, or scene text image. These template images can be used to determine the layout of the characters. (c) <b><font color="#A52A2A">Text Inpainting</font></b>: The user provides an image and specifies the region to be modified along with the desired text content. The model is able to modify the original text or add text to areas without text.
+        </h2>
+        </div>
+        """)
+
 print("------------------------- 5 -------------------------\n")
 input = gr.inputs.Image(label="Upload any Image", type = 'pil', optional=True)
 output = gr.outputs.Textbox(type="text",label="Captions")
 examples = ["example1.jpg"]
 print("------------------------- 6 -------------------------\n")
-title = "Image to - Text"
+# title = "Image to - Text"
 description = "NTT Data"
 interface = gr.Interface(
 
@@ -91,6 +59,6 @@ interface = gr.Interface(
     theme="grass",
     outputs=output,
     examples = examples,
-    title=title,
+    # title=title,
 )
 interface.launch(debug=True)
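
The Space's "Runtime error" banner is consistent with API drift in the code above, though the log is not shown here so this is an assumption: gr.inputs.Image and gr.outputs.Textbox were deprecated in Gradio 3.x and removed in 4.x, along with the optional= argument, the Textbox type="text" argument, and string themes such as "grass". A minimal sketch of the same interface against the current top-level components (assuming the predict function sketched above and that example1.jpg exists in the Space repo):

import gradio as gr

interface = gr.Interface(
    fn=predict,  # the captioning function sketched above
    inputs=gr.Image(label="Upload any Image", type="pil"),  # gr.inputs.Image -> gr.Image; optional= is gone
    outputs=gr.Textbox(label="Captions"),                   # gr.outputs.Textbox -> gr.Textbox
    examples=["example1.jpg"],
    title="Image to Text using Lora",
    description="NTT Data",
)
interface.launch(debug=True)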