wower99 committed on
Commit
4e4c3a4
·
1 Parent(s): 745e3b9

added summarization endpoint, improved prompt

Browse files
Files changed (3) hide show
  1. app.py +15 -24
  2. constants.py +1 -6
  3. utils.py +17 -30
app.py CHANGED
@@ -3,7 +3,7 @@ import os
3
  import tempfile
4
  import uuid
5
  import logging
6
- from utils import get_translation, get_image_prompts, segments_to_chunks, generate_images, generate_video
7
  import constants
8
  from groq import Groq
9
 
@@ -22,9 +22,10 @@ session_id = st.session_state.session_id
22
 
23
  # Initialize state variables if not already set
24
  state_variables = [
25
- 'transcript_visible', 'translation_visible', 'uploaded_file_name',
26
- 'audio', 'was_converted', 'transcript', 'translation',
27
- 'generated_video', 'image_prompts', 'generated_images', 'video_generated'
 
28
  ]
29
 
30
  for var in state_variables:
@@ -59,7 +60,6 @@ if audio_file:
59
  st.session_state[f'uploaded_file_name_{session_id}'] = audio_file.name
60
  st.session_state[f'audio_{session_id}'] = audio_file
61
  st.session_state[f'transcript_{session_id}'] = None
62
- st.session_state[f'translation_{session_id}'] = None
63
  st.session_state[f'image_prompts_{session_id}'] = None
64
  st.session_state[f'generated_images_{session_id}'] = None # Reset image generation state
65
  st.session_state[f'generated_video_{session_id}'] = None # Reset generated video state
@@ -86,12 +86,6 @@ if audio_file:
86
  logger.error(f"Error during transcription: {e}")
87
  st.error("An error occurred during transcription.")
88
 
89
- # Translation logic
90
- if st.session_state[f'transcript_{session_id}'] and st.session_state[f'translation_{session_id}'] is None:
91
- with st.spinner("Generating translation... Please wait."):
92
- st.session_state[f'translation_{session_id}'] = get_translation(st.session_state[f'transcript_{session_id}'])
93
- logger.info("Translation generated successfully.")
94
-
95
  st.audio(st.session_state[f'audio_{session_id}'], format=f"audio/{audio_file.type}")
96
 
97
  # Toggle transcript visibility
@@ -102,22 +96,19 @@ if audio_file:
102
  st.write("### Transcription:")
103
  st.write(st.session_state[f'transcript_{session_id}'])
104
 
105
- # Toggle translation visibility
106
- toggle_translation = st.checkbox("Show Translation", value=st.session_state[f'translation_visible_{session_id}'], key="toggle_translation")
107
- st.session_state[f'translation_visible_{session_id}'] = toggle_translation
108
-
109
- if st.session_state[f'translation_visible_{session_id}']:
110
- st.write("### Translation:")
111
- st.write(st.session_state[f'translation_{session_id}'])
112
 
113
  # Image generation logic
114
- if st.session_state[f'translation_{session_id}'] and st.session_state[f'image_prompts_{session_id}'] is None:
115
  with st.spinner("Generating image prompts... Please wait."):
116
- if 'Already in English' in st.session_state[f'translation_{session_id}']:
117
- st.info("Audio is Already in English. Using Transcription to generate Image Prompts")
118
- st.session_state[f'image_prompts_{session_id}'] = get_image_prompts(segments_to_chunks(st.session_state[f'segments_{session_id}']))['image_prompts']
119
- else:
120
- st.session_state[f'image_prompts_{session_id}'] = get_image_prompts(segments_to_chunks(st.session_state[f'segments_{session_id}']))['image_prompts']
121
  logger.info("Image prompts generated successfully.")
122
 
123
  # Ensure that generated_images is always a list
 
3
  import tempfile
4
  import uuid
5
  import logging
6
+ from utils import get_summarization, get_image_prompts, segments_to_chunks, generate_images, generate_video
7
  import constants
8
  from groq import Groq
9
 
 
22
 
23
  # Initialize state variables if not already set
24
  state_variables = [
25
+ 'transcript_visible', 'uploaded_file_name',
26
+ 'audio', 'was_converted', 'transcript',
27
+ 'generated_video', 'image_prompts', 'generated_images', 'video_generated',
28
+ 'summary' # Added summary state variable
29
  ]
30
 
31
  for var in state_variables:
 
60
  st.session_state[f'uploaded_file_name_{session_id}'] = audio_file.name
61
  st.session_state[f'audio_{session_id}'] = audio_file
62
  st.session_state[f'transcript_{session_id}'] = None
 
63
  st.session_state[f'image_prompts_{session_id}'] = None
64
  st.session_state[f'generated_images_{session_id}'] = None # Reset image generation state
65
  st.session_state[f'generated_video_{session_id}'] = None # Reset generated video state
 
86
  logger.error(f"Error during transcription: {e}")
87
  st.error("An error occurred during transcription.")
88
 
 
 
 
 
 
 
89
  st.audio(st.session_state[f'audio_{session_id}'], format=f"audio/{audio_file.type}")
90
 
91
  # Toggle transcript visibility
 
96
  st.write("### Transcription:")
97
  st.write(st.session_state[f'transcript_{session_id}'])
98
 
99
+ # Summarization logic (not displayed on UI)
100
+ if st.session_state[f'transcript_{session_id}'] and st.session_state[f'summary_{session_id}'] is None:
101
+ with st.spinner("Generating summary... Please wait."):
102
+ st.session_state[f'summary_{session_id}'] = get_summarization(st.session_state[f'transcript_{session_id}'])
103
+ logger.info("Summary generated successfully.")
 
 
104
 
105
  # Image generation logic
106
+ if st.session_state[f'transcript_{session_id}'] and st.session_state[f'image_prompts_{session_id}'] is None:
107
  with st.spinner("Generating image prompts... Please wait."):
108
+ st.session_state[f'image_prompts_{session_id}'] = get_image_prompts(
109
+ segments_to_chunks(st.session_state[f'segments_{session_id}']),
110
+ st.session_state[f'summary_{session_id}']
111
+ )['image_prompts']
 
112
  logger.info("Image prompts generated successfully.")
113
 
114
  # Ensure that generated_images is always a list
constants.py CHANGED
@@ -4,13 +4,8 @@ import os
4
  load_dotenv()
5
 
6
  HF_TOKEN = os.getenv("HF_TOKEN", None)
7
- GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
8
 
9
- AUDIO_CONVERTER_ENDPOINT="https://audio-converter-api-587c.onrender.com/convert/mp3"
10
-
11
-
12
- TRANSLATION_ENDPOINT="https://habib926653-text-translator-agent-api.hf.space/generate"
13
- PROMPT_GENERATION_ENDPOINT="https://habib926653-text-translator-agent-api.hf.space/get-image-prompts"
14
  IMAGE_GENERATION_SPACE_NAME="habib926653/stabilityai-stable-diffusion-3.5-large-turbo"
15
 
16
  # Supported formats
 
4
  load_dotenv()
5
 
6
  HF_TOKEN = os.getenv("HF_TOKEN", None)
 
7
 
8
+ SUMMARIZATION_ENDPOINT="https://habib926653-text-translator-agent-api.hf.space/generate"
 
 
 
 
9
  IMAGE_GENERATION_SPACE_NAME="habib926653/stabilityai-stable-diffusion-3.5-large-turbo"
10
 
11
  # Supported formats
utils.py CHANGED
@@ -13,23 +13,8 @@ import tempfile
13
  import os
14
 
15
 
16
- def clean_response(result):
17
- print("\n\nStarted Cleaning Response")
18
- """A temporary fix to the output of predict which returns output of openai-whisper-large-v3-turbo as string
19
- but it outputs: AutomaticSpeechRecognitionOutput(text=" sometimes life <- like this the class name still remains
20
- in the response, ideally which should have started from "sometimes..." as in the given example """
21
- # Use find() to get the position of the start and end of the text
22
- start_pos = result.find('text="') + len('text="') # Start after 'text="'
23
- end_pos = result.find('", chunks=None') # End before '", chunks=None'
24
-
25
- # Extract the text using slicing
26
- cleaned_result = result[start_pos:end_pos]
27
- print("Returning Cleaned Result: ", cleaned_result)
28
- return cleaned_result
29
-
30
-
31
- def get_translation(text: str):
32
- print('\n\nTranslating text: ', text, type(text))
33
  # Input payload
34
  data = {"text_input": text}
35
 
@@ -38,14 +23,14 @@ def get_translation(text: str):
38
 
39
  try:
40
  # Make a GET request
41
- response = requests.post(constants.TRANSLATION_ENDPOINT, json=data, headers=headers)
42
  # Process response
43
  if response.status_code == 200:
44
  response_data = response.json()
45
- print("Returning Translation")
46
  return response_data.get("output", "No output found.")
47
  else:
48
- print("Some Error Occured During Translation Request")
49
  print(response)
50
  print(f"Error: {response.status_code}, {response.text}")
51
  return {"error_occured" : response.text}
@@ -61,7 +46,8 @@ def segments_to_chunks(segments):
61
  return chunks
62
 
63
 
64
- def get_image_prompts(text_input : List):
 
65
  # Example Pydantic model (e.g., Movie)
66
  class ImagePromptResponseSchema(BaseModel):
67
  image_prompts: List[str] = Field(
@@ -71,18 +57,19 @@ def get_image_prompts(text_input : List):
71
  extractor = StructuredOutputExtractor(response_schema=ImagePromptResponseSchema)
72
  chunks_count = len(text_input)
73
  chunks = "chunk: " + "\nchunk: ".join(text_input)
74
- prompt = f"""ROLE: You are a Highly Experienced Image Prompt Sythesizer
75
-
76
- SYSTEM PROMPT:
77
 
78
- 1. **Combine all chunks** to understand the complete context.
79
- 2. **Identify the theme** and setting of the combined context.
80
- 3. For each chunk, **generate a simple, context-aware image prompt** that fits the overall picture.
81
- - Keep it clear and vivid, adding small details to enhance the visual.
 
82
 
83
 
84
  ### Example
85
-
86
  **Chunks**:
87
  1. A guy went to the jungle.
88
  2. He saw a lion.
@@ -96,7 +83,7 @@ SYSTEM PROMPT:
96
 
97
  NOTE: Never write a prompt that can generate NSFW images, or any other explicit content, use safe and appropriate prompts
98
 
99
- TASK: Generate {chunks_count} image prompts, Each per chunk\n\n {chunks}"""
100
  result = extractor.extract(prompt)
101
  return result.model_dump() # returns dictionary version pydantic model
102
 
 
13
  import os
14
 
15
 
16
+ def get_summarization(text: str):
17
+ print('\n\nSummarizing text: ', text, type(text))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  # Input payload
19
  data = {"text_input": text}
20
 
 
23
 
24
  try:
25
  # Make a GET request
26
+ response = requests.post(constants.SUMMARIZATION_ENDPOINT, json=data, headers=headers)
27
  # Process response
28
  if response.status_code == 200:
29
  response_data = response.json()
30
+ print("Returning Summarization")
31
  return response_data.get("output", "No output found.")
32
  else:
33
+ print("Some Error Occured During Summarization Request")
34
  print(response)
35
  print(f"Error: {response.status_code}, {response.text}")
36
  return {"error_occured" : response.text}
 
46
  return chunks
47
 
48
 
49
+ def get_image_prompts(text_input : List, summary):
50
+ print(f"summary: {summary}")
51
  # Example Pydantic model (e.g., Movie)
52
  class ImagePromptResponseSchema(BaseModel):
53
  image_prompts: List[str] = Field(
 
57
  extractor = StructuredOutputExtractor(response_schema=ImagePromptResponseSchema)
58
  chunks_count = len(text_input)
59
  chunks = "chunk: " + "\nchunk: ".join(text_input)
60
+ prompt = f"""
61
+
62
+ ROLE: You are a Highly Experienced Image Prompt Sythesizer
63
 
64
+ SYSTEM PROMPT: Given the Overall Summary and All Chunks of the Text
65
+ 1. Use Summary and Combine all chunks to understand the complete context
66
+ 3. **Identify the theme** and setting of the complete text
67
+ 4. For each chunk, **generate a simple, context-aware image prompt** that fits the overall picture.
68
+ 5. Keep Image Style as Hyper-Realistic (MUST BE FOLLOWED)
69
 
70
 
71
  ### Example
72
+ summary: this text is a story of guy who went to jungle and a lion
73
  **Chunks**:
74
  1. A guy went to the jungle.
75
  2. He saw a lion.
 
83
 
84
  NOTE: Never write a prompt that can generate NSFW images, or any other explicit content, use safe and appropriate prompts
85
 
86
+ TASK: Here is the summary: {summary}\n\n and \n\n Total of {chunks_count} chunks, Generate an Image Prompt Each per chunk\n\n {chunks}"""
87
  result = extractor.extract(prompt)
88
  return result.model_dump() # returns dictionary version pydantic model
89