D0k-tor committed
Commit dcdb448 · 1 Parent(s): 0730b6e

Update app.py

Files changed (1)
  1. app.py +18 -50
app.py CHANGED
@@ -1,54 +1,8 @@
-# import gradio as gr
-# import streamlit as st
-# import torch
-# import re
-# from transformers import AutoTokenizer, ViTFeatureExtractor, VisionEncoderDecoderModel
-
-# device='cpu'
-# encoder_checkpoint = "ydshieh/vit-gpt2-coco-en"
-# decoder_checkpoint = "ydshieh/vit-gpt2-coco-en"
-# model_checkpoint = "ydshieh/vit-gpt2-coco-eng"
-# feature_extractor = ViTFeatureExtractor.from_pretrained(encoder_checkpoint)
-# tokenizer = AutoTokenizer.from_pretrained(decoder_checkpoint)
-# model = VisionEncoderDecoderModel.from_pretrained(model_checkpoint).to(device)
-
-# def predict(image,max_length=64, num_beams=4):
-# input_image = Image.open(image)
-# model.eval()
-# pixel_values = feature_extractor(images=[input_image], return_tensors="pt").pixel_values
-# with torch.no_grad():
-# output_ids = model.generate(pixel_values, max_length=16, num_beams=4, return_dict_in_generate=True).sequences
-# preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-# preds = [pred.strip() for pred in preds]
-# return preds[0]
-
-# # image = image.convert('RGB')
-# # image = feature_extractor(image, return_tensors="pt").pixel_values.to(device)
-# # clean_text = lambda x: x.replace('<|endoftext|>','').split('\n')[0]
-# # caption_ids = model.generate(image, max_length = max_length)[0]
-# # caption_text = clean_text(tokenizer.decode(caption_ids))
-# # return caption_text
-
-# # st.title("Image to Text using Lora")
-
-# inputs = gr.inputs.Image(label="Upload any Image", type = 'pil', optional=True)
-# output = gr.outputs.Textbox(type="text",label="Captions")
-# description = "NTT Data Bilbao team"
-# title = "Image to Text using Lora"
-
-# interface = gr.Interface(
-# fn=predict,
-# description=description,
-# inputs = inputs,
-# theme="grass",
-# outputs=output,
-# title=title,
-# )
-# interface.launch(debug=True)
-
 import torch
 import re
 import gradio as gr
+import streamlit as st
+# st.title("Image Caption Generator")
 from transformers import AutoTokenizer, ViTFeatureExtractor, VisionEncoderDecoderModel
 import os
 import tensorflow as tf
@@ -75,13 +29,27 @@ def predict(image,max_length=64, num_beams=4):
 caption_text = clean_text(tokenizer.decode(caption_ids))
 return caption_text
 
+with gr.Blocks() as demo:
 
+gr.HTML(
+"""
+<div style="text-align: center; max-width: 1200px; margin: 20px auto;">
+<h1 style="font-weight: 900; font-size: 3rem; margin: 0rem">
+Image-to-Text with [Lora](https://huggingface.co/blog/lora) and Vit
+</h1>
+<h2 style="text-align: left; font-weight: 450; font-size: 1rem; margin-top: 0.5rem; margin-bottom: 0.5rem">
+We propose <b>TextDiffuser</b>, a flexible and controllable framework to generate images with visually appealing text that is coherent with backgrounds.
+Main features include: (a) <b><font color="#A52A2A">Text-to-Image</font></b>: The user provides a prompt and encloses the keywords with single quotes (e.g., a text image of ‘hello’). The model first determines the layout of the keywords and then draws the image based on the layout and prompt. (b) <b><font color="#A52A2A">Text-to-Image with Templates</font></b>: The user provides a prompt and a template image containing text, which can be a printed, handwritten, or scene text image. These template images can be used to determine the layout of the characters. (c) <b><font color="#A52A2A">Text Inpainting</font></b>: The user provides an image and specifies the region to be modified along with the desired text content. The model is able to modify the original text or add text to areas without text.
+</h2>
+</div>
+""")
+
 print("------------------------- 5 -------------------------\n")
 input = gr.inputs.Image(label="Upload any Image", type = 'pil', optional=True)
 output = gr.outputs.Textbox(type="text",label="Captions")
 examples = ["example1.jpg"]
 print("------------------------- 6 -------------------------\n")
-title = "Image Captioning "
+# title = "Image to - Text"
 description = "NTT Data"
 interface = gr.Interface(
 
@@ -91,6 +59,6 @@ interface = gr.Interface(
 theme="grass",
 outputs=output,
 examples = examples,
-title=title,
+# title=title,
 )
 interface.launch(debug=True)
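
For reference, the pipeline that both the removed commented-out block and the surviving predict function drive is a ViT encoder plus GPT-2 decoder captioner. A minimal standalone sketch, assuming the ydshieh/vit-gpt2-coco-en checkpoint named in the old comments (the "...-coco-eng" variant there looks like a typo) and a local example1.jpg:

import torch
from PIL import Image
from transformers import AutoTokenizer, ViTFeatureExtractor, VisionEncoderDecoderModel

# Checkpoint taken from the commented-out code above; an assumption, not part of this commit.
checkpoint = "ydshieh/vit-gpt2-coco-en"
feature_extractor = ViTFeatureExtractor.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = VisionEncoderDecoderModel.from_pretrained(checkpoint).eval()

def predict(image, max_length=64, num_beams=4):
    # Convert the PIL image to pixel values, then beam-search a caption.
    pixel_values = feature_extractor(images=[image.convert("RGB")], return_tensors="pt").pixel_values
    with torch.no_grad():
        output_ids = model.generate(pixel_values, max_length=max_length, num_beams=num_beams)
    return tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()

print(predict(Image.open("example1.jpg")))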
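
Note that the added with gr.Blocks() as demo: block only renders the HTML header and is never launched; the script still builds and launches the separate interface, so the header will not appear in the running app. One possible way to combine the two, sketched here as an assumption (not what the commit does) and reusing the existing predict function:

import gradio as gr

with gr.Blocks() as demo:
    # Header rendered as raw HTML; Markdown-style links such as [Lora](...) are not
    # converted inside gr.HTML, so a plain <a> tag is used here instead.
    gr.HTML('<h1 style="text-align: center">Image-to-Text with <a href="https://huggingface.co/blog/lora">LoRA</a> and ViT</h1>')
    image_in = gr.Image(label="Upload any Image", type="pil")
    caption_out = gr.Textbox(label="Captions")
    gr.Button("Generate caption").click(fn=predict, inputs=image_in, outputs=caption_out)

demo.launch(debug=True)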
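
Separately, gr.inputs.Image, gr.outputs.Textbox, the optional=True argument, and theme="grass" are Gradio 2.x-style APIs that have been removed in recent Gradio releases. On a current Gradio the same Interface would look roughly like the following sketch (again an assumption, not part of the commit):

import gradio as gr

interface = gr.Interface(
    fn=predict,  # existing captioning function from app.py
    inputs=gr.Image(label="Upload any Image", type="pil"),
    outputs=gr.Textbox(label="Captions"),
    examples=["example1.jpg"],
    description="NTT Data",
)
interface.launch(debug=True)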