Shriharshan committed on
Commit
5df03ae
·
1 Parent(s): 665b05a

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +255 -0
app.py ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Image captioning with ViT+GPT2"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": 1,
13
+ "metadata": {},
14
+ "outputs": [
15
+ {
16
+ "name": "stderr",
17
+ "output_type": "stream",
18
+ "text": [
19
+ "f:\\Image caption genrerator\\image_caption\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
20
+ " from .autonotebook import tqdm as notebook_tqdm\n"
21
+ ]
22
+ }
23
+ ],
24
+ "source": [
25
+ "from PIL import Image\n",
26
+ "from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, PreTrainedTokenizerFast\n",
27
+ "import requests"
28
+ ]
29
+ },
30
+ {
31
+ "cell_type": "code",
32
+ "execution_count": 2,
33
+ "metadata": {},
34
+ "outputs": [],
35
+ "source": [
36
+ "model = VisionEncoderDecoderModel.from_pretrained(\"nlpconnect/vit-gpt2-image-captioning\")"
37
+ ]
38
+ },
39
+ {
40
+ "cell_type": "code",
41
+ "execution_count": 3,
42
+ "metadata": {},
43
+ "outputs": [
44
+ {
45
+ "name": "stderr",
46
+ "output_type": "stream",
47
+ "text": [
48
+ "f:\\Image caption genrerator\\image_caption\\lib\\site-packages\\transformers\\models\\vit\\feature_extraction_vit.py:28: FutureWarning: The class ViTFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use ViTImageProcessor instead.\n",
49
+ " warnings.warn(\n"
50
+ ]
51
+ }
52
+ ],
53
+ "source": [
54
+ "vit_feature_extactor = ViTFeatureExtractor.from_pretrained(\"google/vit-base-patch16-224-in21k\")"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": 4,
60
+ "metadata": {},
61
+ "outputs": [
62
+ {
63
+ "name": "stderr",
64
+ "output_type": "stream",
65
+ "text": [
66
+ "The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. \n",
67
+ "The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. \n",
68
+ "The class this function is called from is 'PreTrainedTokenizerFast'.\n"
69
+ ]
70
+ }
71
+ ],
72
+ "source": [
73
+ "tokenizer = PreTrainedTokenizerFast.from_pretrained(\"distilgpt2\")"
74
+ ]
75
+ },
76
+ {
77
+ "cell_type": "code",
78
+ "execution_count": 5,
79
+ "metadata": {},
80
+ "outputs": [],
81
+ "source": [
82
+ "#url = 'https://d2gp644kobdlm6.cloudfront.net/wp-content/uploads/2016/06/bigstock-Shocked-and-surprised-boy-on-t-113798588-300x212.jpg'"
83
+ ]
84
+ },
85
+ {
86
+ "cell_type": "code",
87
+ "execution_count": 6,
88
+ "metadata": {},
89
+ "outputs": [],
90
+ "source": [
91
+ "# with Image.open(requests.get(url, stream=True).raw) as img:\n",
92
+ "# pixel_values = vit_feature_extactor(images=img, return_tensors=\"pt\").pixel_values"
93
+ ]
94
+ },
95
+ {
96
+ "cell_type": "code",
97
+ "execution_count": 7,
98
+ "metadata": {},
99
+ "outputs": [
100
+ {
101
+ "name": "stderr",
102
+ "output_type": "stream",
103
+ "text": [
104
+ "f:\\Image caption genrerator\\image_caption\\lib\\site-packages\\transformers\\generation\\utils.py:1346: UserWarning: Using `max_length`'s default (20) to control the generation length. This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we recommend using `max_new_tokens` to control the maximum length of the generation.\n",
105
+ " warnings.warn(\n"
106
+ ]
107
+ }
108
+ ],
109
+ "source": [
110
+ "# encoder_outputs = model.generate(pixel_values.to('cpu'),num_beams = 5)"
111
+ ]
112
+ },
113
+ {
114
+ "cell_type": "code",
115
+ "execution_count": 8,
116
+ "metadata": {},
117
+ "outputs": [],
118
+ "source": [
119
+ "# generated_senetences = tokenizer.batch_decode(encoder_outputs, skip_special_tokens=True,)"
120
+ ]
121
+ },
122
+ {
123
+ "cell_type": "code",
124
+ "execution_count": 9,
125
+ "metadata": {},
126
+ "outputs": [
127
+ {
128
+ "data": {
129
+ "text/plain": [
130
+ "['a young boy sitting in front of a laptop computer ']"
131
+ ]
132
+ },
133
+ "execution_count": 9,
134
+ "metadata": {},
135
+ "output_type": "execute_result"
136
+ }
137
+ ],
138
+ "source": [
139
+ "# generated_senetences"
140
+ ]
141
+ },
142
+ {
143
+ "cell_type": "code",
144
+ "execution_count": 11,
145
+ "metadata": {},
146
+ "outputs": [
147
+ {
148
+ "data": {
149
+ "text/plain": [
150
+ "'a young boy sitting in front of a laptop computer '"
151
+ ]
152
+ },
153
+ "execution_count": 11,
154
+ "metadata": {},
155
+ "output_type": "execute_result"
156
+ }
157
+ ],
158
+ "source": [
159
+ "# generated_senetences[0].split(\".\")[0]"
160
+ ]
161
+ },
162
+ {
163
+ "cell_type": "code",
164
+ "execution_count": 13,
165
+ "metadata": {},
166
+ "outputs": [],
167
+ "source": [
168
+ "def vit2distilgpt2(img):\n",
169
+ " pixel_values = vit_feature_extactor(images=img, return_tensors=\"pt\").pixel_values\n",
170
+ " encoder_outputs = generated_ids = model.generate(pixel_values.to('cpu'),num_beams=5)\n",
171
+ " generated_senetences = tokenizer.batch_decode(encoder_outputs, skip_special_tokens=True)\n",
172
+ "\n",
173
+ " return(generated_senetences[0].split('.')[0])"
174
+ ]
175
+ },
176
+ {
177
+ "cell_type": "code",
178
+ "execution_count": 14,
179
+ "metadata": {},
180
+ "outputs": [],
181
+ "source": [
182
+ "import gradio as gr"
183
+ ]
184
+ },
185
+ {
186
+ "cell_type": "code",
187
+ "execution_count": 2,
188
+ "metadata": {},
189
+ "outputs": [
190
+ {
191
+ "ename": "NameError",
192
+ "evalue": "name 'gr' is not defined",
193
+ "output_type": "error",
194
+ "traceback": [
195
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
196
+ "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
197
+ "Cell \u001b[1;32mIn[2], line 2\u001b[0m\n\u001b[0;32m 1\u001b[0m inputs \u001b[39m=\u001b[39m [\n\u001b[1;32m----> 2\u001b[0m gr\u001b[39m.\u001b[39minputs\u001b[39m.\u001b[39mImage(\u001b[39mtype\u001b[39m\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mpil\u001b[39m\u001b[39m\"\u001b[39m,label\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mOriginal Images\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m 3\u001b[0m ]\n\u001b[0;32m 5\u001b[0m outputs \u001b[39m=\u001b[39m [\n\u001b[0;32m 6\u001b[0m gr\u001b[39m.\u001b[39moutputs\u001b[39m.\u001b[39mTextbox(label \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mCaption\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m 7\u001b[0m ]\n\u001b[0;32m 9\u001b[0m title \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mImage Captioning using ViT + GPT2\u001b[39m\u001b[39m\"\u001b[39m\n",
198
+ "\u001b[1;31mNameError\u001b[0m: name 'gr' is not defined"
199
+ ]
200
+ }
201
+ ],
202
+ "source": [
203
+ "inputs = [\n",
204
+ " gr.inputs.Image(type=\"pil\",label=\"Original Images\")\n",
205
+ "]\n",
206
+ "\n",
207
+ "outputs = [\n",
208
+ " gr.outputs.Textbox(label = \"Caption\")\n",
209
+ "]\n",
210
+ "\n",
211
+ "title = \"Image Captioning using ViT + GPT2\"\n",
212
+ "description = \"ViT and GPT2 are used to generate Image Caption for the uploaded image.COCO DataSet is used for Training\"\n",
213
+ "examples = [\n",
214
+ " [\".Image1.png\"],\n",
215
+ " [\".Image2.png\"],\n",
216
+ " [\".Image3.png\"]\n",
217
+ "]\n",
218
+ "\n",
219
+ "\n",
220
+ "\n",
221
+ "\n",
222
+ "gr.Interface(\n",
223
+ " vit2distilgpt2,\n",
224
+ " inputs,\n",
225
+ " outputs,\n",
226
+ " title=title,\n",
227
+ " description=description,\n",
228
+ " examples=examples,\n",
229
+ " theme=\"huggingface\",\n",
230
+ ").launch(debug=True, enable_queue=True, share=True)"
231
+ ]
232
+ }
233
+ ],
234
+ "metadata": {
235
+ "kernelspec": {
236
+ "display_name": "Python 3 (ipykernel)",
237
+ "language": "python",
238
+ "name": "python3"
239
+ },
240
+ "language_info": {
241
+ "codemirror_mode": {
242
+ "name": "ipython",
243
+ "version": 3
244
+ },
245
+ "file_extension": ".py",
246
+ "mimetype": "text/x-python",
247
+ "name": "python",
248
+ "nbconvert_exporter": "python",
249
+ "pygments_lexer": "ipython3",
250
+ "version": "3.10.9"
251
+ }
252
+ },
253
+ "nbformat": 4,
254
+ "nbformat_minor": 2
255
+ }