added summarization endpoint, improved prompt
Browse files- app.py +15 -24
- constants.py +1 -6
- utils.py +17 -30
app.py
CHANGED
@@ -3,7 +3,7 @@ import os
|
|
3 |
import tempfile
|
4 |
import uuid
|
5 |
import logging
|
6 |
-
from utils import
|
7 |
import constants
|
8 |
from groq import Groq
|
9 |
|
@@ -22,9 +22,10 @@ session_id = st.session_state.session_id
|
|
22 |
|
23 |
# Initialize state variables if not already set
|
24 |
state_variables = [
|
25 |
-
'transcript_visible', '
|
26 |
-
'audio', 'was_converted', 'transcript',
|
27 |
-
'generated_video', 'image_prompts', 'generated_images', 'video_generated'
|
|
|
28 |
]
|
29 |
|
30 |
for var in state_variables:
|
@@ -59,7 +60,6 @@ if audio_file:
|
|
59 |
st.session_state[f'uploaded_file_name_{session_id}'] = audio_file.name
|
60 |
st.session_state[f'audio_{session_id}'] = audio_file
|
61 |
st.session_state[f'transcript_{session_id}'] = None
|
62 |
-
st.session_state[f'translation_{session_id}'] = None
|
63 |
st.session_state[f'image_prompts_{session_id}'] = None
|
64 |
st.session_state[f'generated_images_{session_id}'] = None # Reset image generation state
|
65 |
st.session_state[f'generated_video_{session_id}'] = None # Reset generated video state
|
@@ -86,12 +86,6 @@ if audio_file:
|
|
86 |
logger.error(f"Error during transcription: {e}")
|
87 |
st.error("An error occurred during transcription.")
|
88 |
|
89 |
-
# Translation logic
|
90 |
-
if st.session_state[f'transcript_{session_id}'] and st.session_state[f'translation_{session_id}'] is None:
|
91 |
-
with st.spinner("Generating translation... Please wait."):
|
92 |
-
st.session_state[f'translation_{session_id}'] = get_translation(st.session_state[f'transcript_{session_id}'])
|
93 |
-
logger.info("Translation generated successfully.")
|
94 |
-
|
95 |
st.audio(st.session_state[f'audio_{session_id}'], format=f"audio/{audio_file.type}")
|
96 |
|
97 |
# Toggle transcript visibility
|
@@ -102,22 +96,19 @@ if audio_file:
|
|
102 |
st.write("### Transcription:")
|
103 |
st.write(st.session_state[f'transcript_{session_id}'])
|
104 |
|
105 |
-
#
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
st.write("### Translation:")
|
111 |
-
st.write(st.session_state[f'translation_{session_id}'])
|
112 |
|
113 |
# Image generation logic
|
114 |
-
if st.session_state[f'
|
115 |
with st.spinner("Generating image prompts... Please wait."):
|
116 |
-
|
117 |
-
st.
|
118 |
-
st.session_state[f'
|
119 |
-
|
120 |
-
st.session_state[f'image_prompts_{session_id}'] = get_image_prompts(segments_to_chunks(st.session_state[f'segments_{session_id}']))['image_prompts']
|
121 |
logger.info("Image prompts generated successfully.")
|
122 |
|
123 |
# Ensure that generated_images is always a list
|
|
|
3 |
import tempfile
|
4 |
import uuid
|
5 |
import logging
|
6 |
+
from utils import get_summarization, get_image_prompts, segments_to_chunks, generate_images, generate_video
|
7 |
import constants
|
8 |
from groq import Groq
|
9 |
|
|
|
22 |
|
23 |
# Initialize state variables if not already set
|
24 |
state_variables = [
|
25 |
+
'transcript_visible', 'uploaded_file_name',
|
26 |
+
'audio', 'was_converted', 'transcript',
|
27 |
+
'generated_video', 'image_prompts', 'generated_images', 'video_generated',
|
28 |
+
'summary' # Added summary state variable
|
29 |
]
|
30 |
|
31 |
for var in state_variables:
|
|
|
60 |
st.session_state[f'uploaded_file_name_{session_id}'] = audio_file.name
|
61 |
st.session_state[f'audio_{session_id}'] = audio_file
|
62 |
st.session_state[f'transcript_{session_id}'] = None
|
|
|
63 |
st.session_state[f'image_prompts_{session_id}'] = None
|
64 |
st.session_state[f'generated_images_{session_id}'] = None # Reset image generation state
|
65 |
st.session_state[f'generated_video_{session_id}'] = None # Reset generated video state
|
|
|
86 |
logger.error(f"Error during transcription: {e}")
|
87 |
st.error("An error occurred during transcription.")
|
88 |
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
st.audio(st.session_state[f'audio_{session_id}'], format=f"audio/{audio_file.type}")
|
90 |
|
91 |
# Toggle transcript visibility
|
|
|
96 |
st.write("### Transcription:")
|
97 |
st.write(st.session_state[f'transcript_{session_id}'])
|
98 |
|
99 |
+
# Summarization logic (not displayed on UI)
|
100 |
+
if st.session_state[f'transcript_{session_id}'] and st.session_state[f'summary_{session_id}'] is None:
|
101 |
+
with st.spinner("Generating summary... Please wait."):
|
102 |
+
st.session_state[f'summary_{session_id}'] = get_summarization(st.session_state[f'transcript_{session_id}'])
|
103 |
+
logger.info("Summary generated successfully.")
|
|
|
|
|
104 |
|
105 |
# Image generation logic
|
106 |
+
if st.session_state[f'transcript_{session_id}'] and st.session_state[f'image_prompts_{session_id}'] is None:
|
107 |
with st.spinner("Generating image prompts... Please wait."):
|
108 |
+
st.session_state[f'image_prompts_{session_id}'] = get_image_prompts(
|
109 |
+
segments_to_chunks(st.session_state[f'segments_{session_id}']),
|
110 |
+
st.session_state[f'summary_{session_id}']
|
111 |
+
)['image_prompts']
|
|
|
112 |
logger.info("Image prompts generated successfully.")
|
113 |
|
114 |
# Ensure that generated_images is always a list
|
constants.py
CHANGED
@@ -4,13 +4,8 @@ import os
|
|
4 |
load_dotenv()
|
5 |
|
6 |
HF_TOKEN = os.getenv("HF_TOKEN", None)
|
7 |
-
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
|
8 |
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
TRANSLATION_ENDPOINT="https://habib926653-text-translator-agent-api.hf.space/generate"
|
13 |
-
PROMPT_GENERATION_ENDPOINT="https://habib926653-text-translator-agent-api.hf.space/get-image-prompts"
|
14 |
IMAGE_GENERATION_SPACE_NAME="habib926653/stabilityai-stable-diffusion-3.5-large-turbo"
|
15 |
|
16 |
# Supported formats
|
|
|
4 |
load_dotenv()
|
5 |
|
6 |
HF_TOKEN = os.getenv("HF_TOKEN", None)
|
|
|
7 |
|
8 |
+
SUMMARIZATION_ENDPOINT="https://habib926653-text-translator-agent-api.hf.space/generate"
|
|
|
|
|
|
|
|
|
9 |
IMAGE_GENERATION_SPACE_NAME="habib926653/stabilityai-stable-diffusion-3.5-large-turbo"
|
10 |
|
11 |
# Supported formats
|
utils.py
CHANGED
@@ -13,23 +13,8 @@ import tempfile
|
|
13 |
import os
|
14 |
|
15 |
|
16 |
-
def
|
17 |
-
print(
|
18 |
-
"""A temporary fix to the output of predict which returns output of openai-whisper-large-v3-turbo as string
|
19 |
-
but it outputs: AutomaticSpeechRecognitionOutput(text=" sometimes life <- like this the class name still remains
|
20 |
-
in the response, ideally which should have started from "sometimes..." as in the given example """
|
21 |
-
# Use find() to get the position of the start and end of the text
|
22 |
-
start_pos = result.find('text="') + len('text="') # Start after 'text="'
|
23 |
-
end_pos = result.find('", chunks=None') # End before '", chunks=None'
|
24 |
-
|
25 |
-
# Extract the text using slicing
|
26 |
-
cleaned_result = result[start_pos:end_pos]
|
27 |
-
print("Returning Cleaned Result: ", cleaned_result)
|
28 |
-
return cleaned_result
|
29 |
-
|
30 |
-
|
31 |
-
def get_translation(text: str):
|
32 |
-
print('\n\nTranslating text: ', text, type(text))
|
33 |
# Input payload
|
34 |
data = {"text_input": text}
|
35 |
|
@@ -38,14 +23,14 @@ def get_translation(text: str):
|
|
38 |
|
39 |
try:
|
40 |
# Make a POST request
|
41 |
-
response = requests.post(constants.
|
42 |
# Process response
|
43 |
if response.status_code == 200:
|
44 |
response_data = response.json()
|
45 |
-
print("Returning
|
46 |
return response_data.get("output", "No output found.")
|
47 |
else:
|
48 |
-
print("Some Error Occured During
|
49 |
print(response)
|
50 |
print(f"Error: {response.status_code}, {response.text}")
|
51 |
return {"error_occured" : response.text}
|
@@ -61,7 +46,8 @@ def segments_to_chunks(segments):
|
|
61 |
return chunks
|
62 |
|
63 |
|
64 |
-
def get_image_prompts(text_input : List):
|
|
|
65 |
# Pydantic schema for the structured response (a list of image prompts)
|
66 |
class ImagePromptResponseSchema(BaseModel):
|
67 |
image_prompts: List[str] = Field(
|
@@ -71,18 +57,19 @@ def get_image_prompts(text_input : List):
|
|
71 |
extractor = StructuredOutputExtractor(response_schema=ImagePromptResponseSchema)
|
72 |
chunks_count = len(text_input)
|
73 |
chunks = "chunk: " + "\nchunk: ".join(text_input)
|
74 |
-
prompt = f"""
|
75 |
-
|
76 |
-
|
77 |
|
78 |
-
|
79 |
-
|
80 |
-
3.
|
81 |
-
|
|
|
82 |
|
83 |
|
84 |
### Example
|
85 |
-
|
86 |
**Chunks**:
|
87 |
1. A guy went to the jungle.
|
88 |
2. He saw a lion.
|
@@ -96,7 +83,7 @@ SYSTEM PROMPT:
|
|
96 |
|
97 |
NOTE: Never write a prompt that can generate NSFW images, or any other explicit content, use safe and appropriate prompts
|
98 |
|
99 |
-
TASK:
|
100 |
result = extractor.extract(prompt)
|
101 |
return result.model_dump() # returns dictionary version pydantic model
|
102 |
|
|
|
13 |
import os
|
14 |
|
15 |
|
16 |
+
def get_summarization(text: str):
|
17 |
+
print('\n\nSummarizing text: ', text, type(text))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
# Input payload
|
19 |
data = {"text_input": text}
|
20 |
|
|
|
23 |
|
24 |
try:
|
25 |
# Make a POST request
|
26 |
+
response = requests.post(constants.SUMMARIZATION_ENDPOINT, json=data, headers=headers)
|
27 |
# Process response
|
28 |
if response.status_code == 200:
|
29 |
response_data = response.json()
|
30 |
+
print("Returning Summarization")
|
31 |
return response_data.get("output", "No output found.")
|
32 |
else:
|
33 |
+
print("Some Error Occured During Summarization Request")
|
34 |
print(response)
|
35 |
print(f"Error: {response.status_code}, {response.text}")
|
36 |
return {"error_occured" : response.text}
|
|
|
46 |
return chunks
|
47 |
|
48 |
|
49 |
+
def get_image_prompts(text_input : List, summary):
|
50 |
+
print(f"summary: {summary}")
|
51 |
# Pydantic schema for the structured response (a list of image prompts)
|
52 |
class ImagePromptResponseSchema(BaseModel):
|
53 |
image_prompts: List[str] = Field(
|
|
|
57 |
extractor = StructuredOutputExtractor(response_schema=ImagePromptResponseSchema)
|
58 |
chunks_count = len(text_input)
|
59 |
chunks = "chunk: " + "\nchunk: ".join(text_input)
|
60 |
+
prompt = f"""
|
61 |
+
|
62 |
+
ROLE: You are a Highly Experienced Image Prompt Sythesizer
|
63 |
|
64 |
+
SYSTEM PROMPT: Given the Overall Summary and All Chunks of the Text
|
65 |
+
1. Use Summary and Combine all chunks to understand the complete context
|
66 |
+
3. **Identify the theme** and setting of the complete text
|
67 |
+
4. For each chunk, **generate a simple, context-aware image prompt** that fits the overall picture.
|
68 |
+
5. Keep Image Style as Hyper-Realistic (MUST BE FOLLOWED)
|
69 |
|
70 |
|
71 |
### Example
|
72 |
+
summary: this text is a story of guy who went to jungle and a lion
|
73 |
**Chunks**:
|
74 |
1. A guy went to the jungle.
|
75 |
2. He saw a lion.
|
|
|
83 |
|
84 |
NOTE: Never write a prompt that can generate NSFW images, or any other explicit content, use safe and appropriate prompts
|
85 |
|
86 |
+
TASK: Here is the summary: {summary}\n\n and \n\n Total of {chunks_count} chunks, Generate an Image Prompt Each per chunk\n\n {chunks}"""
|
87 |
result = extractor.extract(prompt)
|
88 |
return result.model_dump() # returns dictionary version pydantic model
|
89 |
|