CR7CAD committed on
Commit
7c5a1e4
·
verified ·
1 Parent(s): 1fb1e8e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +77 -116
app.py CHANGED
@@ -1,134 +1,95 @@
1
  import streamlit as st
2
  from transformers import pipeline
3
  from PIL import Image
4
- import io
5
- from gtts import gTTS
6
- import time
7
  import os
8
- import traceback
9
 
10
# Page setup: the same title is used for the browser tab and the page heading.
APP_TITLE = "Image to Audio Story Generator"
st.set_page_config(page_title=APP_TITLE)

# Visible heading followed by the one-line introduction.
st.title(APP_TITLE)
st.write("Upload a picture and let's create a magical story!")
16
-
17
# Model initialisation; failures are reported to the caller, not raised.
@st.cache_resource
def load_models():
    """Create the captioning and story-generation pipelines.

    Returns:
        A 3-tuple ``(image_to_text, story_generator, error)``. On success the
        first two items are the pipelines and ``error`` is ``None``; if either
        pipeline fails to build, both pipelines are ``None`` and ``error`` is
        the exception message.
    """
    try:
        captioner = pipeline("image-to-text", model="microsoft/git-base-coco")
        storyteller = pipeline("text-generation", model="gpt2")
    except Exception as exc:
        return None, None, str(exc)
    return captioner, storyteller, None
26
-
27
# Build the pipelines behind a spinner, then report the outcome to the user.
with st.spinner("Loading models..."):
    loaded = load_models()
    image_to_text, story_generator, error = loaded
    if not error:
        st.success("Models loaded successfully!")
    else:
        st.error(f"Failed to load models: {error}")
34
 
35
# Caption generation for an uploaded image
def generate_caption(image):
    """Describe ``image`` using the module-level ``image_to_text`` pipeline.

    Args:
        image: The image to caption, passed straight to the pipeline.

    Returns:
        ``(caption, error)``. ``error`` is ``None`` on success. When the
        pipeline returns nothing or raises, the caption falls back to
        ``"An interesting image"`` and ``error`` carries the reason.
    """
    fallback = "An interesting image"
    try:
        outputs = image_to_text(image)
        if not outputs:
            return fallback, "No caption generated"
        return outputs[0]['generated_text'], None
    except Exception as exc:
        return fallback, str(exc)
45
 
46
# Function to generate story from caption (at most 100 words by default)
def _limit_words(text, max_words=100):
    """Cap ``text`` at ``max_words`` words and make it end with sentence punctuation."""
    words = text.split()
    if len(words) > max_words:
        text = " ".join(words[:max_words])
    text = text.rstrip()
    # Generation usually stops mid-sentence at the token limit; close the
    # sentence so the story reads (and is spoken) cleanly.
    if text and not text.endswith(('.', '!', '?')):
        text += '.'
    return text


def generate_story(caption, max_words=100):
    """Generate a short story that starts from ``caption``.

    Uses the module-level ``story_generator`` pipeline with sampling enabled,
    so the output differs from run to run.

    Args:
        caption: Image caption used to seed the prompt.
        max_words: Upper bound on the number of words in the returned story.

    Returns:
        ``(story, error)``. ``error`` is ``None`` on success. When the pipeline
        returns nothing or raises, ``story`` is a fixed fallback sentence and
        ``error`` carries the reason; exceptions are also shown with
        ``st.error`` together with their traceback.
    """
    try:
        prompt = f"Once upon a time, {caption} "

        # The prompt and raw pipeline output are no longer echoed into the
        # page: that was debug output and is not meant for end users.
        result = story_generator(
            prompt,
            max_length=100,  # counts tokens (prompt included), not words
            do_sample=True,
            temperature=0.9,
            top_p=0.95,
        )

        if not result:
            return "Story generation failed.", "No story generated"

        story = _limit_words(result[0]['generated_text'], max_words)
        return story, None
    except Exception as e:
        st.error(f"Error in story generation: {str(e)}")
        st.error(traceback.format_exc())
        return "Once upon a time... (Story generation failed)", str(e)
84
 
85
# Function to convert text to speech
def text_to_speech(text):
    """Convert ``text`` to an MP3 file with gTTS.

    Args:
        text: The text to read aloud.

    Returns:
        ``(audio_file, error)``. On success ``audio_file`` is the path of the
        generated MP3 and ``error`` is ``None``; on failure ``audio_file`` is
        ``None`` and ``error`` describes the problem.
    """
    # Local import so the block stays self-contained without touching the
    # file-level import list.
    import tempfile

    if not text or not text.strip():
        return None, "No text to convert to speech"

    try:
        tts = gTTS(text=text, lang='en', slow=False)
        # A unique temp file per call: a fixed "story_audio.mp3" in the working
        # directory would be overwritten when two sessions generate audio at
        # the same time.
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
            audio_file = tmp.name
        tts.save(audio_file)
        return audio_file, None
    except Exception as e:
        return None, str(e)
94
 
95
# Image upload widget
uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])

# The flow needs both an upload and successfully loaded pipelines.
models_ready = image_to_text is not None and story_generator is not None

if uploaded_file is not None and models_ready:
    # Display the uploaded image
    try:
        image = Image.open(uploaded_file)
        st.image(image, caption='Uploaded Image', use_container_width=True)

        # Nothing is generated until the user asks for it.
        if st.button("Generate Story"):
            with st.spinner("Generating your story..."):
                # Step 1: caption the image
                caption, caption_error = generate_caption(image)
                if caption_error:
                    st.warning(f"Caption generation issue: {caption_error}")
                st.write("Image caption:", caption)

                # Step 2: turn the caption into a story
                story, story_error = generate_story(caption)
                if story_error:
                    st.warning(f"Story generation issue: {story_error}")
                word_count = len(story.split())
                st.write(f"### Your Story ({word_count} words)")
                st.write(story)

                # Step 3: read the story aloud
                audio_file, audio_error = text_to_speech(story)
                if not audio_error:
                    st.write("### Listen to your story")
                    st.audio(audio_file)
                else:
                    st.warning(f"Audio generation issue: {audio_error}")
    except Exception as exc:
        st.error(f"Error processing image: {str(exc)}")
        st.error(traceback.format_exc())

# Page footer
st.markdown("---")
st.write("Created for ISOM5240 Assignment 1")
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  from transformers import pipeline
3
  from PIL import Image
 
 
 
4
  import os
 
5
 
6
# function part
# img2text
@st.cache_resource
def _load_image_to_text_model():
    """Build the Donut image-to-text pipeline once and reuse it.

    Streamlit re-executes the script on every widget interaction, so building
    the pipeline inside ``img2text`` would reload the model on every rerun.
    """
    return pipeline("image-to-text", model="naver-clova-ix/donut-base")


def img2text(image_path):
    """Extract text from the image at ``image_path``.

    Args:
        image_path: Location of the image; handed to ``Image.open``.

    Returns:
        The generated text of the first pipeline result, ``"No text detected"``
        when the pipeline produces nothing usable, or ``"Error: <message>"``
        when processing fails (the failure is also shown via ``st.error``).
    """
    try:
        image_to_text_model = _load_image_to_text_model()
        # The context manager releases the file handle PIL keeps open, so the
        # caller can delete the file afterwards.
        with Image.open(image_path) as image:
            result = image_to_text_model(image)
        text = result[0]["generated_text"] if result else ""
        # An empty generation is reported the same way as no result at all.
        return text.strip() or "No text detected"
    except Exception as e:
        st.error(f"Error processing image: {str(e)}")
        return f"Error: {str(e)}"
 
 
 
 
 
 
 
22
 
23
# text2story
def text2story(text):
    """Wrap the extracted text in a one-sentence story introduction.

    This is a placeholder: no story is generated yet, the extracted text is
    simply embedded in a fixed sentence.

    Args:
        text: Text extracted from the image. ``None`` is treated as empty.

    Returns:
        ``"Here's a story based on the text: <text>"`` with surrounding
        whitespace trimmed from ``text``, or a fixed fallback sentence when
        there is no text to build on.
    """
    cleaned = "" if text is None else str(text).strip()
    if not cleaned:
        # Avoid a dangling "based on the text: " sentence with nothing after it.
        return "No text was found in the image, so there is no story to tell."
    return f"Here's a story based on the text: {cleaned}"
 
 
 
 
29
 
30
# text2audio
@st.cache_resource
def _load_tts_model():
    """Build the text-to-speech pipeline once and reuse it across reruns."""
    # NOTE(review): this checkpoint is published for ESPnet; confirm that the
    # transformers "text-to-speech" pipeline can actually load it. Extra
    # dependencies may be needed depending on the model used.
    return pipeline("text-to-speech", model="espnet/kan-bayashi_ljspeech_vits")


def text2audio(story_text):
    """Synthesize speech for ``story_text``.

    Args:
        story_text: The text to read aloud.

    Returns:
        The raw output of the text-to-speech pipeline, or ``None`` when there
        is nothing to read or synthesis fails (the problem is shown via
        ``st.error``).
    """
    # Skip the model entirely when there is nothing to speak.
    if not str(story_text or "").strip():
        st.error("Error generating audio: no story text to read.")
        return None

    try:
        tts_model = _load_tts_model()

        # Generate audio from the story text
        audio_data = tts_model(story_text)

        return audio_data
    except Exception as e:
        st.error(f"Error generating audio: {str(e)}")
        return None
 
44
 
45
# main part
import tempfile  # needed only by the upload handling below

st.set_page_config(page_title="Your Image to Audio Story",
                   page_icon="🦜")
st.header("Turn Your Image to Audio Story")
st.subheader("Using Donut model for text extraction")

uploaded_file = st.file_uploader(
    "Select an Image...",
    type=['png', 'jpg', 'jpeg', 'gif', 'bmp', 'webp'],
)

if uploaded_file is not None:
    # Display the uploaded image
    st.image(uploaded_file, caption="Uploaded Image",
             use_container_width=True)

    # Save the upload to a private temporary file. Writing to the
    # client-supplied name in the working directory could overwrite files
    # that belong to the app.
    suffix = os.path.splitext(uploaded_file.name)[1]
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
        tmp_file.write(uploaded_file.getvalue())
        temp_path = tmp_file.name

    # Stage 1: Image to Text
    try:
        with st.spinner('Processing img2text...'):
            extracted_text = img2text(temp_path)
    finally:
        # Remove the temporary file even if the stage above raised.
        if os.path.exists(temp_path):
            os.remove(temp_path)
    st.subheader("Extracted Text:")
    st.write(extracted_text)

    # Stage 2: Text to Story
    with st.spinner('Generating a story...'):
        story = text2story(extracted_text)
    st.subheader("Generated Story:")
    st.write(story)

    # Stage 3: Story to Audio data
    with st.spinner('Generating audio data...'):
        audio_data = text2audio(story)

    # Play button
    if st.button("Play Audio"):
        if audio_data:
            st.audio(audio_data['audio'],
                     format="audio/wav",
                     start_time=0,
                     sample_rate=audio_data['sampling_rate'])
        else:
            st.warning("Audio generation failed. Playing a placeholder audio.")
            placeholder_audio = "kids_playing_audio.wav"
            # Check up front instead of relying on the exception type st.audio
            # raises for a missing path.
            # NOTE(review): it may not be FileNotFoundError — confirm.
            if os.path.isfile(placeholder_audio):
                st.audio(placeholder_audio)
            else:
                st.error("Placeholder audio file not found. Audio playback is unavailable.")