Srujan111 committed on
Commit 02a464a · 1 Parent(s): a396062

Delete app.ipynb

Files changed (1)
  1. app.ipynb +0 -204
app.ipynb DELETED
@@ -1,204 +0,0 @@
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 38,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "c:\\Users\\Srujan Jujare\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
- " from .autonotebook import tqdm as notebook_tqdm\n"
- ]
- }
- ],
- "source": [
- "from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer\n",
- "import torch\n",
- "from PIL import Image"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 39,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "VisionEncoderDecoderModel(\n",
- " (encoder): ViTModel(\n",
- " (embeddings): ViTEmbeddings(\n",
- " (patch_embeddings): ViTPatchEmbeddings(\n",
- " (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))\n",
- " )\n",
- " (dropout): Dropout(p=0.0, inplace=False)\n",
- " )\n",
- " (encoder): ViTEncoder(\n",
- " (layer): ModuleList(\n",
- " (0-11): 12 x ViTLayer(\n",
- " (attention): ViTAttention(\n",
- " (attention): ViTSelfAttention(\n",
- " (query): Linear(in_features=768, out_features=768, bias=True)\n",
- " (key): Linear(in_features=768, out_features=768, bias=True)\n",
- " (value): Linear(in_features=768, out_features=768, bias=True)\n",
- " (dropout): Dropout(p=0.0, inplace=False)\n",
- " )\n",
- " (output): ViTSelfOutput(\n",
- " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
- " (dropout): Dropout(p=0.0, inplace=False)\n",
- " )\n",
- " )\n",
- " (intermediate): ViTIntermediate(\n",
- " (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
- " (intermediate_act_fn): GELUActivation()\n",
- " )\n",
- " (output): ViTOutput(\n",
- " (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
- " (dropout): Dropout(p=0.0, inplace=False)\n",
- " )\n",
- " (layernorm_before): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
- " (layernorm_after): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
- " )\n",
- " )\n",
- " )\n",
- " (layernorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
- " (pooler): ViTPooler(\n",
- " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
- " (activation): Tanh()\n",
- " )\n",
- " )\n",
- " (decoder): GPT2LMHeadModel(\n",
- " (transformer): GPT2Model(\n",
- " (wte): Embedding(50257, 768)\n",
- " (wpe): Embedding(1024, 768)\n",
- " (drop): Dropout(p=0.1, inplace=False)\n",
- " (h): ModuleList(\n",
- " (0-11): 12 x GPT2Block(\n",
- " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
- " (attn): GPT2Attention(\n",
- " (c_attn): Conv1D()\n",
- " (c_proj): Conv1D()\n",
- " (attn_dropout): Dropout(p=0.1, inplace=False)\n",
- " (resid_dropout): Dropout(p=0.1, inplace=False)\n",
- " )\n",
- " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
- " (crossattention): GPT2Attention(\n",
- " (c_attn): Conv1D()\n",
- " (q_attn): Conv1D()\n",
- " (c_proj): Conv1D()\n",
- " (attn_dropout): Dropout(p=0.1, inplace=False)\n",
- " (resid_dropout): Dropout(p=0.1, inplace=False)\n",
- " )\n",
- " (ln_cross_attn): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
- " (mlp): GPT2MLP(\n",
- " (c_fc): Conv1D()\n",
- " (c_proj): Conv1D()\n",
- " (act): NewGELUActivation()\n",
- " (dropout): Dropout(p=0.1, inplace=False)\n",
- " )\n",
- " )\n",
- " )\n",
- " (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
- " )\n",
- " (lm_head): Linear(in_features=768, out_features=50257, bias=False)\n",
- " )\n",
- ")"
- ]
- },
- "execution_count": 39,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "model = VisionEncoderDecoderModel.from_pretrained(\"nlpconnect/vit-gpt2-image-captioning\")\n",
- "feature_extractor = ViTImageProcessor.from_pretrained(\"nlpconnect/vit-gpt2-image-captioning\")\n",
- "tokenizer = AutoTokenizer.from_pretrained(\"nlpconnect/vit-gpt2-image-captioning\")\n",
- "\n",
- "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
- "model.to(device)\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 40,
- "metadata": {},
- "outputs": [],
- "source": [
- "max_length = 16\n",
- "num_beams = 4\n",
- "gen_kwargs = {\"max_length\": max_length, \"num_beams\": num_beams}\n",
- "def predict_step(image_paths):\n",
- " images = []\n",
- " for image_path in image_paths:\n",
- " i_image = Image.open(image_path)\n",
- " if i_image.mode != \"RGB\":\n",
- " i_image = i_image.convert(mode=\"RGB\")\n",
- "\n",
- " images.append(i_image)\n",
- "\n",
- " pixel_values = feature_extractor(images=images, return_tensors=\"pt\").pixel_values\n",
- " pixel_values = pixel_values.to(device)\n",
- "\n",
- " output_ids = model.generate(pixel_values, **gen_kwargs)\n",
- "\n",
- " preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)\n",
- " preds = [pred.strip() for pred in preds]\n",
- " return preds"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 41,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.\n",
- "You may ignore this warning if your `pad_token_id` (50256) is identical to the `bos_token_id` (50256), `eos_token_id` (50256), or the `sep_token_id` (None), and your input is not padded.\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "['a clock on a dashboard of a car']"
- ]
- },
- "execution_count": 41,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "predict_step(['D:\\\\Validation\\\\Class 2\\\\i17.jpg'])"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.11.5"
- },
- "orig_nbformat": 4
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
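
For reference, the deleted notebook's logic condenses to the standalone script below. This is a reconstruction sketch, not part of the commit: the model ID, generation settings, and example path are taken verbatim from the notebook diff above, while the script structure and comments are added here.

```python
# Sketch of the deleted app.ipynb as a standalone script (reconstruction, not from the commit).
import torch
from PIL import Image
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer

# Model, image processor, and tokenizer as loaded in the notebook's second cell.
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Generation settings from the notebook's third cell.
gen_kwargs = {"max_length": 16, "num_beams": 4}

def predict_step(image_paths):
    """Caption a batch of images given their file paths."""
    # Force RGB: the ViT image processor expects 3-channel input.
    images = [Image.open(path).convert("RGB") for path in image_paths]
    pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)
    output_ids = model.generate(pixel_values, **gen_kwargs)
    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return [pred.strip() for pred in preds]

if __name__ == "__main__":
    # Example path from the notebook's last cell; adjust to a local image.
    print(predict_step(["D:\\Validation\\Class 2\\i17.jpg"]))
    # Notebook output: ['a clock on a dashboard of a car']
```

The `attention_mask` warning captured in the last cell's stderr is benign in this setup: as the warning itself states, it can be ignored because the model's `pad_token_id` (50256) equals its `eos_token_id` (50256) and the input is not padded.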