Spaces:
Sleeping
Sleeping
Upload 2 files
Browse files- app.py +49 -0
- requirements.txt +64 -0
app.py
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import io

import streamlit as st
import torch
from gtts import gTTS
from PIL import Image
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, GPT2TokenizerFast
|
6 |
+
|
7 |
+
# Hugging Face Hub checkpoint shared by the model, tokenizer and image processor.
MODEL_ID = "nlpconnect/vit-gpt2-image-captioning"

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'


@st.cache_resource
def _load_captioning_components():
    """Load the captioning model, tokenizer and image processor once.

    Streamlit re-executes this script on every widget interaction; caching
    the loaded objects as a shared resource avoids re-reading the checkpoint
    on each rerun.

    Returns:
        A ``(model, tokenizer, image_processor)`` tuple, with the model
        already moved to ``device``.
    """
    loaded_model = VisionEncoderDecoderModel.from_pretrained(MODEL_ID).to(device)
    loaded_tokenizer = GPT2TokenizerFast.from_pretrained(MODEL_ID)
    loaded_processor = ViTImageProcessor.from_pretrained(MODEL_ID)
    return loaded_model, loaded_tokenizer, loaded_processor


model, tokenizer, image_processor = _load_captioning_components()
|
16 |
+
|
17 |
+
|
18 |
+
def get_caption(model,image_processor,tokenizer,image_path):
    """Generate a text caption for a single image.

    Args:
        model: Captioning model exposing ``generate``; expected to live on
            the module-level ``device``.
        image_processor: Callable that turns a PIL image into model inputs
            (called with ``return_tensors='pt'``).
        tokenizer: Tokenizer used to decode the generated token ids.
        image_path: Anything ``PIL.Image.open`` accepts, i.e. a filesystem
            path or a binary file-like object such as an uploaded file.

    Returns:
        The decoded caption for the image, with surrounding whitespace
        removed.
    """
    image = Image.open(image_path)

    # Uploaded PNGs may be RGBA, palette or grayscale. The processor is
    # presumably configured for three-channel input (see the checkpoint's
    # preprocessor config), so normalize the mode up front.
    if image.mode != 'RGB':
        image = image.convert('RGB')

    # Preprocess the image and move the tensors next to the model.
    img = image_processor(image,return_tensors='pt').to(device)

    # Generate the caption token ids.
    output = model.generate(**img)

    # Decode the first (only) sequence; strip stray whitespace so the text
    # renders and reads cleanly.
    caption = tokenizer.batch_decode(output,skip_special_tokens=True)[0].strip()

    return caption
|
31 |
+
|
32 |
+
|
33 |
+
|
34 |
+
|
35 |
+
# --- Streamlit page: upload an image, show its caption, read it aloud ---
st.title('Vision Transformers (ViT) in Image Captioning Using Pretrained ViT Models')

uploaded_image = st.file_uploader('Upload an Image',type=['png','jpg','jpeg'])

if uploaded_image is not None:
    st.image(uploaded_image)
    caption = get_caption(model,image_processor,tokenizer,uploaded_image)
    st.header(caption)

    # Synthesize the speech into memory rather than a fixed 'caption.mp3'
    # on disk: a shared file would be overwritten by concurrent sessions and
    # requires a writable working directory.
    read_caption = gTTS(caption,lang='en',slow=True)
    audio_buffer = io.BytesIO()
    read_caption.write_to_fp(audio_buffer)
    st.audio(audio_buffer.getvalue(),format='audio/mpeg',autoplay=True)
else:
    st.error('No Image Uploaded !')
|
requirements.txt
ADDED
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
altair==5.3.0
|
2 |
+
attrs==23.2.0
|
3 |
+
blinker==1.8.2
|
4 |
+
cachetools==5.4.0
|
5 |
+
certifi==2024.7.4
|
6 |
+
charset-normalizer==3.3.2
|
7 |
+
click==8.1.7
|
8 |
+
colorama==0.4.6
|
9 |
+
comtypes==1.4.5
|
10 |
+
filelock==3.15.4
|
11 |
+
fsspec==2024.6.1
|
12 |
+
gitdb==4.0.11
|
13 |
+
GitPython==3.1.43
|
14 |
+
gTTS==2.5.1
|
15 |
+
huggingface-hub==0.23.5
|
16 |
+
idna==3.7
|
17 |
+
intel-openmp==2021.4.0
|
18 |
+
Jinja2==3.1.4
|
19 |
+
jsonschema==4.23.0
|
20 |
+
jsonschema-specifications==2023.12.1
|
21 |
+
markdown-it-py==3.0.0
|
22 |
+
MarkupSafe==2.1.5
|
23 |
+
mdurl==0.1.2
|
24 |
+
mkl==2021.4.0
|
25 |
+
mpmath==1.3.0
|
26 |
+
networkx==3.3
|
27 |
+
numpy==1.26.4
|
28 |
+
packaging==24.1
|
29 |
+
pandas==2.2.2
|
30 |
+
pillow==10.4.0
|
31 |
+
protobuf==5.27.2
|
32 |
+
pyarrow==17.0.0
|
33 |
+
pydeck==0.9.1
|
34 |
+
Pygments==2.18.0
|
35 |
+
pypiwin32==223
|
36 |
+
python-dateutil==2.9.0.post0
|
37 |
+
pytz==2024.1
|
38 |
+
pywin32==306
|
39 |
+
PyYAML==6.0.1
|
40 |
+
referencing==0.35.1
|
41 |
+
regex==2024.5.15
|
42 |
+
requests==2.32.3
|
43 |
+
rich==13.7.1
|
44 |
+
rpds-py==0.19.0
|
45 |
+
safetensors==0.4.3
|
46 |
+
six==1.16.0
|
47 |
+
smmap==5.0.1
|
48 |
+
streamlit==1.36.0
|
49 |
+
sympy==1.13.0
|
50 |
+
tbb==2021.13.0
|
51 |
+
tenacity==8.5.0
|
52 |
+
tokenizers==0.19.1
|
53 |
+
toml==0.10.2
|
54 |
+
toolz==0.12.1
|
55 |
+
torch==2.2.2+cpu
|
56 |
+
torchaudio==2.2.2+cpu
|
57 |
+
torchvision==0.17.2+cpu
|
58 |
+
tornado==6.4.1
|
59 |
+
tqdm==4.66.4
|
60 |
+
transformers==4.42.4
|
61 |
+
typing_extensions==4.12.2
|
62 |
+
tzdata==2024.1
|
63 |
+
urllib3==2.2.2
|
64 |
+
watchdog==4.0.1
|