audio-to-video-generator

Running

App Files Community

wower99 committed on Jan 26

Commit

4e4c3a4

1 Parent(s): 745e3b9

added summarization endpoint, improved prompt

Browse files

Files changed (3) hide show

app.py +15 -24
constants.py +1 -6
utils.py +17 -30

app.py CHANGED Viewed

@@ -3,7 +3,7 @@ import os
 import tempfile
 import uuid
 import logging
-from utils import get_translation, get_image_prompts, segments_to_chunks, generate_images, generate_video
 import constants
 from groq import Groq
@@ -22,9 +22,10 @@ session_id = st.session_state.session_id
 # Initialize state variables if not already set
 state_variables = [
-    'transcript_visible', 'translation_visible', 'uploaded_file_name',
-    'audio', 'was_converted', 'transcript', 'translation',
-    'generated_video', 'image_prompts', 'generated_images', 'video_generated'
 ]
 for var in state_variables:
@@ -59,7 +60,6 @@ if audio_file:
         st.session_state[f'uploaded_file_name_{session_id}'] = audio_file.name
         st.session_state[f'audio_{session_id}'] = audio_file
         st.session_state[f'transcript_{session_id}'] = None
-        st.session_state[f'translation_{session_id}'] = None
         st.session_state[f'image_prompts_{session_id}'] = None
         st.session_state[f'generated_images_{session_id}'] = None  # Reset image generation state
         st.session_state[f'generated_video_{session_id}'] = None  # Reset generated video state
@@ -86,12 +86,6 @@ if audio_file:
         logger.error(f"Error during transcription: {e}")
         st.error("An error occurred during transcription.")
-    # Translation logic
-    if st.session_state[f'transcript_{session_id}'] and st.session_state[f'translation_{session_id}'] is None:
-        with st.spinner("Generating translation... Please wait."):
-            st.session_state[f'translation_{session_id}'] = get_translation(st.session_state[f'transcript_{session_id}'])
-            logger.info("Translation generated successfully.")
     st.audio(st.session_state[f'audio_{session_id}'], format=f"audio/{audio_file.type}")
     # Toggle transcript visibility
@@ -102,22 +96,19 @@ if audio_file:
         st.write("### Transcription:")
         st.write(st.session_state[f'transcript_{session_id}'])
-    # Toggle translation visibility
-    toggle_translation = st.checkbox("Show Translation", value=st.session_state[f'translation_visible_{session_id}'], key="toggle_translation")
-    st.session_state[f'translation_visible_{session_id}'] = toggle_translation
-    if st.session_state[f'translation_visible_{session_id}']:
-        st.write("### Translation:")
-        st.write(st.session_state[f'translation_{session_id}'])
     # Image generation logic
-    if st.session_state[f'translation_{session_id}'] and st.session_state[f'image_prompts_{session_id}'] is None:
         with st.spinner("Generating image prompts... Please wait."):
-            if 'Already in English' in st.session_state[f'translation_{session_id}']:
-                st.info("Audio is Already in English. Using Transcription to generate Image Prompts")
-                st.session_state[f'image_prompts_{session_id}'] = get_image_prompts(segments_to_chunks(st.session_state[f'segments_{session_id}']))['image_prompts']
-            else:
-                st.session_state[f'image_prompts_{session_id}'] = get_image_prompts(segments_to_chunks(st.session_state[f'segments_{session_id}']))['image_prompts']
             logger.info("Image prompts generated successfully.")
     # Ensure that generated_images is always a list

 import tempfile
 import uuid
 import logging
+from utils import get_summarization, get_image_prompts, segments_to_chunks, generate_images, generate_video
 import constants
 from groq import Groq
 # Initialize state variables if not already set
 state_variables = [
+    'transcript_visible', 'uploaded_file_name',
+    'audio', 'was_converted', 'transcript',
+    'generated_video', 'image_prompts', 'generated_images', 'video_generated',
+    'summary'  # Added summary state variable
 ]
 for var in state_variables:
         st.session_state[f'uploaded_file_name_{session_id}'] = audio_file.name
         st.session_state[f'audio_{session_id}'] = audio_file
         st.session_state[f'transcript_{session_id}'] = None
         st.session_state[f'image_prompts_{session_id}'] = None
         st.session_state[f'generated_images_{session_id}'] = None  # Reset image generation state
         st.session_state[f'generated_video_{session_id}'] = None  # Reset generated video state
         logger.error(f"Error during transcription: {e}")
         st.error("An error occurred during transcription.")
     st.audio(st.session_state[f'audio_{session_id}'], format=f"audio/{audio_file.type}")
     # Toggle transcript visibility
         st.write("### Transcription:")
         st.write(st.session_state[f'transcript_{session_id}'])
+    # Summarization logic (not displayed on UI)
+    if st.session_state[f'transcript_{session_id}'] and st.session_state[f'summary_{session_id}'] is None:
+        with st.spinner("Generating summary... Please wait."):
+            st.session_state[f'summary_{session_id}'] = get_summarization(st.session_state[f'transcript_{session_id}'])
+            logger.info("Summary generated successfully.")
     # Image generation logic
+    if st.session_state[f'transcript_{session_id}'] and st.session_state[f'image_prompts_{session_id}'] is None:
         with st.spinner("Generating image prompts... Please wait."):
+            st.session_state[f'image_prompts_{session_id}'] = get_image_prompts(
+                segments_to_chunks(st.session_state[f'segments_{session_id}']),
+                st.session_state[f'summary_{session_id}']
+            )['image_prompts']
             logger.info("Image prompts generated successfully.")
     # Ensure that generated_images is always a list

constants.py CHANGED Viewed

@@ -4,13 +4,8 @@ import os
 load_dotenv()
 HF_TOKEN = os.getenv("HF_TOKEN", None)
-GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
-AUDIO_CONVERTER_ENDPOINT="https://audio-converter-api-587c.onrender.com/convert/mp3"
-TRANSLATION_ENDPOINT="https://habib926653-text-translator-agent-api.hf.space/generate"
-PROMPT_GENERATION_ENDPOINT="https://habib926653-text-translator-agent-api.hf.space/get-image-prompts"
 IMAGE_GENERATION_SPACE_NAME="habib926653/stabilityai-stable-diffusion-3.5-large-turbo"
 # Supported formats

 load_dotenv()
 HF_TOKEN = os.getenv("HF_TOKEN", None)
+SUMMARIZATION_ENDPOINT="https://habib926653-text-translator-agent-api.hf.space/generate"
 IMAGE_GENERATION_SPACE_NAME="habib926653/stabilityai-stable-diffusion-3.5-large-turbo"
 # Supported formats

utils.py CHANGED Viewed

@@ -13,23 +13,8 @@ import tempfile
 import os
-def clean_response(result):
-    print("\n\nStarted Cleaning Response")
-    """A temporary fix to the output of predict which returns output of openai-whisper-large-v3-turbo as string
-    but it outputs: AutomaticSpeechRecognitionOutput(text=" sometimes life   <- like this the class name still remains
-    in the response, ideally which should have started from "sometimes..." as in the given example  """
-    # Use find() to get the position of the start and end of the text
-    start_pos = result.find('text="') + len('text="')  # Start after 'text="'
-    end_pos = result.find('", chunks=None')  # End before '", chunks=None'
-    # Extract the text using slicing
-    cleaned_result = result[start_pos:end_pos]
-    print("Returning Cleaned Result: ", cleaned_result)
-    return cleaned_result
-def get_translation(text: str):
-    print('\n\nTranslating text: ', text, type(text))
     # Input payload
     data = {"text_input": text}
@@ -38,14 +23,14 @@ def get_translation(text: str):
     try:
         # Make a GET request
-        response = requests.post(constants.TRANSLATION_ENDPOINT, json=data, headers=headers)
         # Process response
         if response.status_code == 200:
             response_data = response.json()
-            print("Returning Translation")
             return response_data.get("output", "No output found.")
         else:
-            print("Some Error Occured During Translation Request")
             print(response)
             print(f"Error: {response.status_code}, {response.text}")
             return {"error_occured" : response.text}
@@ -61,7 +46,8 @@ def segments_to_chunks(segments):
     return chunks
-def get_image_prompts(text_input : List):
         # Example Pydantic model (e.g., Movie)
     class ImagePromptResponseSchema(BaseModel):
         image_prompts: List[str] = Field(
@@ -71,18 +57,19 @@ def get_image_prompts(text_input : List):
     extractor = StructuredOutputExtractor(response_schema=ImagePromptResponseSchema)
     chunks_count = len(text_input)
     chunks = "chunk: " + "\nchunk: ".join(text_input)
-    prompt = f"""ROLE: You are a Highly Experienced Image Prompt Sythesizer
-SYSTEM PROMPT:
-1. **Combine all chunks** to understand the complete context.
-2. **Identify the theme** and setting of the combined context.
-3. For each chunk, **generate a simple, context-aware image prompt** that fits the overall picture.
-   - Keep it clear and vivid, adding small details to enhance the visual.
 ### Example
 **Chunks**:
 1. A guy went to the jungle.
 2. He saw a lion.
@@ -96,7 +83,7 @@ SYSTEM PROMPT:
 NOTE: Never write a prompt that can generate NSFW images, or any other explicit content, use safe and appropriate prompts
-TASK:  Generate {chunks_count} image prompts, Each per chunk\n\n {chunks}"""
     result = extractor.extract(prompt)
     return result.model_dump()   # returns dictionary version pydantic model

 import os
+def get_summarization(text: str):
+    print('\n\nSummarizing text: ', text, type(text))
     # Input payload
     data = {"text_input": text}
     try:
         # Make a GET request
+        response = requests.post(constants.SUMMARIZATION_ENDPOINT, json=data, headers=headers)
         # Process response
         if response.status_code == 200:
             response_data = response.json()
+            print("Returning Summarization")
             return response_data.get("output", "No output found.")
         else:
+            print("Some Error Occured During Summarization Request")
             print(response)
             print(f"Error: {response.status_code}, {response.text}")
             return {"error_occured" : response.text}
     return chunks
+def get_image_prompts(text_input : List, summary):
+    print(f"summary: {summary}")
         # Example Pydantic model (e.g., Movie)
     class ImagePromptResponseSchema(BaseModel):
         image_prompts: List[str] = Field(
     extractor = StructuredOutputExtractor(response_schema=ImagePromptResponseSchema)
     chunks_count = len(text_input)
     chunks = "chunk: " + "\nchunk: ".join(text_input)
+    prompt = f"""
+ROLE: You are a Highly Experienced Image Prompt Sythesizer
+SYSTEM PROMPT:  Given the Overall Summary and All Chunks of the Text
+1. Use Summary and Combine all chunks to understand the complete context
+3. **Identify the theme** and setting of the complete text
+4. For each chunk, **generate a simple, context-aware image prompt** that fits the overall picture.
+5. Keep Image Style as Hyper-Realistic (MUST BE FOLLOWED)
 ### Example
+summary: this text is a story of guy who went to jungle and a lion
 **Chunks**:
 1. A guy went to the jungle.
 2. He saw a lion.
 NOTE: Never write a prompt that can generate NSFW images, or any other explicit content, use safe and appropriate prompts
+TASK:  Here is the summary: {summary}\n\n and \n\n Total of {chunks_count} chunks, Generate an Image Prompt Each per chunk\n\n {chunks}"""
     result = extractor.extract(prompt)
     return result.model_dump()   # returns dictionary version pydantic model