Raj086 committed on
Commit
fe52fb1
·
verified ·
1 Parent(s): 48c0cd4

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +49 -0
  2. requirements.txt +64 -0
app.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from PIL import Image
3
+ import torch
4
+ from transformers import VisionEncoderDecoderModel, ViTImageProcessor, GPT2TokenizerFast
5
+ from gtts import gTTS
6
+
7
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Single source of truth for the checkpoint shared by model, tokenizer and processor.
_CHECKPOINT = 'nlpconnect/vit-gpt2-image-captioning'


@st.cache_resource
def _load_captioning_components(checkpoint, target_device):
    """Load the captioning model, tokenizer and image processor.

    Streamlit re-executes this script from the top on every user interaction;
    wrapping the loads in ``st.cache_resource`` lets reruns reuse the objects
    created by the first call instead of calling ``from_pretrained`` again.

    Args:
        checkpoint: Hub id (or local path) passed to each ``from_pretrained``.
        target_device: torch device string the model is moved to.

    Returns:
        A ``(model, tokenizer, image_processor)`` tuple.
    """
    loaded_model = VisionEncoderDecoderModel.from_pretrained(checkpoint).to(target_device)
    loaded_tokenizer = GPT2TokenizerFast.from_pretrained(checkpoint)
    loaded_processor = ViTImageProcessor.from_pretrained(checkpoint)
    return loaded_model, loaded_tokenizer, loaded_processor


# Module-level names kept as-is so the rest of the script is unaffected.
model, tokenizer, image_processor = _load_captioning_components(_CHECKPOINT, device)
16
+
17
+
18
def get_caption(model, image_processor, tokenizer, image_path):
    """Generate a caption for a single image.

    Args:
        model: Captioning model; its ``generate`` method is called with the
            processed image inputs.
        image_processor: Callable that turns a PIL image into model inputs
            (invoked with ``return_tensors='pt'``).
        tokenizer: Tokenizer whose ``batch_decode`` converts the generated
            ids back into text.
        image_path: Anything ``PIL.Image.open`` accepts (the caller in this
            file passes a Streamlit uploaded-file object).

    Returns:
        The decoded text of the first generated sequence, with special
        tokens stripped.
    """
    with Image.open(image_path) as image:
        # Uploaded PNGs may be RGBA, palette or grayscale. Normalise to
        # 3-channel RGB before handing the image to the processor.
        # NOTE(review): the checkpoint's model card performs the same conversion.
        rgb_image = image.convert('RGB')

    # Process the image and move the resulting tensors to the module-level device.
    inputs = image_processor(rgb_image, return_tensors='pt').to(device)

    # Generate the caption token ids.
    output = model.generate(**inputs)

    # Decode the first (only) sequence in the batch.
    caption = tokenizer.batch_decode(output, skip_special_tokens=True)[0]

    return caption
31
+
32
+
33
+
34
+
35
import io  # imported here so this script section is self-contained (in-memory MP3 buffer)

st.title('Vision Transformers (ViT) in Image Captioning Using Pretrained ViT Models')

uploaded_image = st.file_uploader('Upload an Image', type=['png', 'jpg', 'jpeg'])

if uploaded_image is not None:
    # Show the upload, caption it, then read the caption aloud.
    st.image(uploaded_image)
    caption = get_caption(model, image_processor, tokenizer, uploaded_image)
    st.header(caption)

    # gTTS rejects empty text, so only synthesize speech for a non-blank caption.
    if caption.strip():
        # Synthesize into memory instead of a fixed 'caption.mp3' on disk:
        # a shared filename would be overwritten by concurrent sessions and
        # requires a writable working directory.
        audio_buffer = io.BytesIO()
        gTTS(caption, lang='en', slow=True).write_to_fp(audio_buffer)
        st.audio(audio_buffer.getvalue(), format='audio/mpeg', autoplay=True)
else:
    st.error('No Image Uploaded !')
requirements.txt ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ altair==5.3.0
2
+ attrs==23.2.0
3
+ blinker==1.8.2
4
+ cachetools==5.4.0
5
+ certifi==2024.7.4
6
+ charset-normalizer==3.3.2
7
+ click==8.1.7
8
+ colorama==0.4.6
9
+ comtypes==1.4.5
10
+ filelock==3.15.4
11
+ fsspec==2024.6.1
12
+ gitdb==4.0.11
13
+ GitPython==3.1.43
14
+ gTTS==2.5.1
15
+ huggingface-hub==0.23.5
16
+ idna==3.7
17
+ intel-openmp==2021.4.0
18
+ Jinja2==3.1.4
19
+ jsonschema==4.23.0
20
+ jsonschema-specifications==2023.12.1
21
+ markdown-it-py==3.0.0
22
+ MarkupSafe==2.1.5
23
+ mdurl==0.1.2
24
+ mkl==2021.4.0
25
+ mpmath==1.3.0
26
+ networkx==3.3
27
+ numpy==1.26.4
28
+ packaging==24.1
29
+ pandas==2.2.2
30
+ pillow==10.4.0
31
+ protobuf==5.27.2
32
+ pyarrow==17.0.0
33
+ pydeck==0.9.1
34
+ Pygments==2.18.0
35
+ pypiwin32==223
36
+ python-dateutil==2.9.0.post0
37
+ pytz==2024.1
38
+ pywin32==306
39
+ PyYAML==6.0.1
40
+ referencing==0.35.1
41
+ regex==2024.5.15
42
+ requests==2.32.3
43
+ rich==13.7.1
44
+ rpds-py==0.19.0
45
+ safetensors==0.4.3
46
+ six==1.16.0
47
+ smmap==5.0.1
48
+ streamlit==1.36.0
49
+ sympy==1.13.0
50
+ tbb==2021.13.0
51
+ tenacity==8.5.0
52
+ tokenizers==0.19.1
53
+ toml==0.10.2
54
+ toolz==0.12.1
55
+ torch==2.2.2+cpu
56
+ torchaudio==2.2.2+cpu
57
+ torchvision==0.17.2+cpu
58
+ tornado==6.4.1
59
+ tqdm==4.66.4
60
+ transformers==4.42.4
61
+ typing_extensions==4.12.2
62
+ tzdata==2024.1
63
+ urllib3==2.2.2
64
+ watchdog==4.0.1