Spaces:
Running
Running
Upload 9 files
Browse files- utility/audio_generator.py +32 -0
- utility/background_video_generator.py +71 -0
- utility/conf.py +14 -0
- utility/render_engine.py +187 -0
- utility/script_generator.py +60 -0
- utility/text_audio_cleaner.py +44 -0
- utility/timed_captions_generator.py +71 -0
- utility/utils.py +34 -0
- utility/video_search_query_generator.py +117 -0
utility/audio_generator.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import edge_tts
|
2 |
+
import os
|
3 |
+
import logging
|
4 |
+
|
5 |
+
logger = logging.getLogger(__name__)
|
6 |
+
|
7 |
+
async def generate_audio(text, outputFilename):
    """Generate audio from text using edge_tts.

    Args:
        text (str): Text to convert to speech
        outputFilename (str): Path to save the audio file

    Raises:
        Exception: If audio generation fails
    """
    try:
        # Ensure output directory exists; skip when the path has no
        # directory component (os.makedirs("") would raise).
        output_dir = os.path.dirname(outputFilename)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        logger.info(f"Generating audio for text length: {len(text)}")
        # Voice is fixed to a US-English male neural voice.
        communicate = edge_tts.Communicate(text, "en-US-GuyNeural")
        await communicate.save(outputFilename)

        if not os.path.exists(outputFilename):
            raise Exception(f"Failed to create audio file at {outputFilename}")

        logger.info(f"Successfully generated audio at {outputFilename}")
    except Exception as e:
        logger.error(f"Error generating audio: {str(e)}")
        # Chain the original exception so callers can see the root cause.
        raise Exception(f"Audio generation failed: {str(e)}") from e
|
utility/background_video_generator.py
ADDED
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import requests
|
3 |
+
from utility.utils import log_response,LOG_TYPE_PEXEL
|
4 |
+
|
5 |
+
PEXELS_API_KEY = os.environ.get('PEXELS_KEY')
|
6 |
+
|
7 |
+
def search_videos(query_string, orientation_landscape=True):
    """Query the Pexels video search API.

    Args:
        query_string (str): Search keywords.
        orientation_landscape (bool): True for landscape, False for portrait.

    Returns:
        dict: Parsed JSON payload from the Pexels API.

    Raises:
        requests.HTTPError: If the API responds with an error status.
    """
    url = "https://api.pexels.com/videos/search"
    headers = {
        "Authorization": PEXELS_API_KEY,
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    params = {
        "query": query_string,
        "orientation": "landscape" if orientation_landscape else "portrait",
        "per_page": 15
    }

    response = requests.get(url, headers=headers, params=params)
    # Fail fast with a clear HTTP error instead of trying to JSON-parse an
    # error page.
    response.raise_for_status()
    json_data = response.json()
    # Reuse the already-parsed payload instead of calling response.json()
    # a second time.
    log_response(LOG_TYPE_PEXEL, query_string, json_data)

    return json_data
25 |
+
|
26 |
+
|
27 |
+
def getBestVideo(query_string, orientation_landscape=True, used_vids=None):
    """Pick the best unused Full-HD video link from a Pexels search.

    Args:
        query_string (str): Search keywords.
        orientation_landscape (bool): True for 1920x1080, False for 1080x1920.
        used_vids (list | None): Link prefixes (before '.hd') already used;
            defaults to an empty list.

    Returns:
        str | None: A direct video-file URL, or None when nothing matches.
    """
    # Avoid the mutable-default-argument pitfall: a shared list would leak
    # "used" links across unrelated calls.
    if used_vids is None:
        used_vids = []

    vids = search_videos(query_string, orientation_landscape)
    videos = vids['videos']  # Extract the videos list from JSON

    # Keep only videos with at least Full-HD resolution and an exact 16:9
    # (or 9:16) aspect ratio for the requested orientation.
    if orientation_landscape:
        filtered_videos = [video for video in videos if video['width'] >= 1920 and video['height'] >= 1080 and video['width']/video['height'] == 16/9]
    else:
        filtered_videos = [video for video in videos if video['width'] >= 1080 and video['height'] >= 1920 and video['height']/video['width'] == 16/9]

    # Prefer durations closest to 15 seconds.
    sorted_videos = sorted(filtered_videos, key=lambda x: abs(15 - int(x['duration'])))

    # Return the first unused file link at exactly the target resolution.
    for video in sorted_videos:
        for video_file in video['video_files']:
            if orientation_landscape:
                if video_file['width'] == 1920 and video_file['height'] == 1080:
                    if video_file['link'].split('.hd')[0] not in used_vids:
                        return video_file['link']
            else:
                if video_file['width'] == 1080 and video_file['height'] == 1920:
                    if video_file['link'].split('.hd')[0] not in used_vids:
                        return video_file['link']
    print("NO LINKS found for this round of search with query :", query_string)
    return None
53 |
+
|
54 |
+
|
55 |
+
def generate_video_url(timed_video_searches, video_server):
    """Resolve each timed search into a background video URL.

    Args:
        timed_video_searches (list): [[(t1, t2), [query, ...]], ...].
        video_server (str): Either "pexel" or "stable_diffusion".

    Returns:
        list: [[[t1, t2], url], ...]; url is "" when no video was found.

    Raises:
        NotImplementedError: For the "stable_diffusion" server, whose helper
            is not available in this module.
    """
    timed_video_urls = []
    if video_server == "pexel":
        used_links = []
        for (t1, t2), search_terms in timed_video_searches:
            url = ""
            # Try each query in order until one yields an unused video.
            for query in search_terms:
                url = getBestVideo(query, orientation_landscape=True, used_vids=used_links)
                if url:
                    used_links.append(url.split('.hd')[0])
                    break
            timed_video_urls.append([[t1, t2], url])
    elif video_server == "stable_diffusion":
        # get_images_for_video is neither defined nor imported in this
        # module, so this branch previously died with a NameError; raise a
        # clear error instead.
        raise NotImplementedError(
            "stable_diffusion support requires get_images_for_video, "
            "which is not available in this module"
        )

    return timed_video_urls
utility/conf.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
import logging
import shutil
from pathlib import Path

logger = logging.getLogger(__name__)

# ImageMagick configuration.
# Resolution order: explicit IMAGEMAGICK_BINARY env var, then a binary found
# on PATH, then the historical hard-coded Windows install location. The
# hard-coded path alone made this module unimportable on any non-Windows
# host (including the Linux machines Spaces deploy to).
_DEFAULT_IMAGEMAGICK = r"C:\Program Files\ImageMagick-7.1.1-Q16-HDRI\magick.exe"
IMAGEMAGICK_BINARY = (
    os.environ.get("IMAGEMAGICK_BINARY")
    or shutil.which("magick")
    or shutil.which("convert")
    or _DEFAULT_IMAGEMAGICK
)

# Validate ImageMagick path at import time so misconfiguration fails early.
if not Path(IMAGEMAGICK_BINARY).exists():
    error_msg = f"ImageMagick not found at {IMAGEMAGICK_BINARY}. Please install ImageMagick and update the path."
    logger.error(error_msg)
    raise FileNotFoundError(error_msg)
utility/render_engine.py
ADDED
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import time
|
2 |
+
import os
|
3 |
+
import tempfile
|
4 |
+
import zipfile
|
5 |
+
import platform
|
6 |
+
import subprocess
|
7 |
+
import logging
|
8 |
+
from pathlib import Path
|
9 |
+
from moviepy.editor import (AudioFileClip, CompositeVideoClip, CompositeAudioClip, ImageClip,
|
10 |
+
TextClip, VideoFileClip)
|
11 |
+
from moviepy.audio.fx.audio_loop import audio_loop
|
12 |
+
from moviepy.audio.fx.audio_normalize import audio_normalize
|
13 |
+
import requests
|
14 |
+
|
15 |
+
logger = logging.getLogger(__name__)
|
16 |
+
|
17 |
+
|
18 |
+
def download_file(url, filename):
    """Download *url* to *filename* on disk.

    Args:
        url (str): Source URL.
        filename (str): Destination file path.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    # Stream in chunks so large videos do not have to fit in memory at once,
    # and fail fast instead of silently saving an HTML error page.
    with requests.get(url, headers=headers, stream=True) as response:
        response.raise_for_status()
        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1 << 20):
                f.write(chunk)
25 |
+
|
26 |
+
def search_program(program_name):
    """Locate *program_name* on the PATH via the OS lookup command.

    Returns:
        str | None: Absolute path to the program, or None when not found.
    """
    lookup = "where" if platform.system() == "Windows" else "which"
    try:
        output = subprocess.check_output([lookup, program_name])
    except subprocess.CalledProcessError:
        return None
    return output.decode().strip()
32 |
+
|
33 |
+
def get_program_path(program_name):
    """Return the resolved path of *program_name*, or None when absent."""
    return search_program(program_name)
36 |
+
|
37 |
+
def get_output_media(audio_file_path, timed_captions, background_video_data, video_server):
    """Generate final video with audio and captions.

    Args:
        audio_file_path (str): Path to the narration audio file
        timed_captions (list): List of ((t1, t2), text) caption segments
        background_video_data (list): List of ((t1, t2), video_url) segments
        video_server (str): Video source identifier (currently unused here)

    Returns:
        str: Path to output video file

    Raises:
        Exception: If video rendering fails
    """
    OUTPUT_FILE_NAME = "rendered_video.mp4"
    # Imported lazily so importing this module does not trigger conf.py's
    # import-time ImageMagick validation.
    from utility.conf import IMAGEMAGICK_BINARY
    from moviepy.config import change_settings

    try:
        # Validate input files
        if not Path(audio_file_path).exists():
            raise FileNotFoundError(f"Audio file not found at {audio_file_path}")

        try:
            change_settings({"IMAGEMAGICK_BINARY": IMAGEMAGICK_BINARY})
            logger.info(f"Using ImageMagick from: {IMAGEMAGICK_BINARY}")
        except Exception as e:
            logger.error(f"Error configuring ImageMagick: {str(e)}")
            raise Exception(f"ImageMagick configuration failed: {str(e)}")
    except Exception as e:
        logger.error(f"Error in initial setup: {str(e)}")
        raise Exception(f"Initial setup failed: {str(e)}")

    visual_clips = []
    # Track every downloaded temp file so cleanup can remove the actual
    # downloads. (The previous cleanup loop created brand-new
    # NamedTemporaryFile objects and deleted those fresh empty files,
    # leaking every real download.)
    downloaded_files = []
    for (t1, t2), video_url in background_video_data:
        try:
            # Download the video file to a unique temp path
            video_filename = tempfile.NamedTemporaryFile(delete=False).name
            downloaded_files.append(video_filename)
            logger.info(f"Downloading video from {video_url}")
            download_file(video_url, video_filename)

            if not Path(video_filename).exists():
                raise FileNotFoundError(f"Failed to download video from {video_url}")

            # Create VideoFileClip from the downloaded file
            video_clip = VideoFileClip(video_filename)
            if video_clip is None:
                raise ValueError(f"Failed to create video clip from {video_filename}")

            video_clip = video_clip.set_start(t1)
            video_clip = video_clip.set_end(t2)
            visual_clips.append(video_clip)
            logger.info(f"Added video clip from {video_url} ({t1}-{t2}s)")

        except Exception as e:
            logger.error(f"Error processing video {video_url}: {str(e)}")
            raise Exception(f"Failed to process video {video_url}: {str(e)}")

    audio_clips = []
    try:
        # Verify audio file exists and is valid
        if not os.path.exists(audio_file_path):
            raise FileNotFoundError(f"Audio file not found: {audio_file_path}")

        audio_file_clip = AudioFileClip(audio_file_path)
        if audio_file_clip is None:
            raise ValueError(f"Failed to create audio clip from {audio_file_path}")

        # Normalize audio volume
        audio_file_clip = audio_normalize(audio_file_clip)

        # Verify audio duration
        if audio_file_clip.duration <= 0:
            raise ValueError("Audio file has zero or negative duration")

        audio_clips.append(audio_file_clip)
        logger.info(f"Added audio clip from {audio_file_path} (duration: {audio_file_clip.duration:.2f}s)")

    except Exception as e:
        logger.error(f"Error processing audio: {str(e)}")
        raise Exception(f"Failed to process audio: {str(e)}")

    for (t1, t2), text in timed_captions:
        try:
            # Caption style: bold white text with a black outline.
            text_clip = TextClip(
                txt=text,
                fontsize=70,
                font="Arial-Bold",
                color="white",
                stroke_width=2,
                stroke_color="black",
                method="label"
            )
            # Set the text to appear at the bottom-center
            text_clip = text_clip.set_start(t1).set_end(t2).set_position(('center','bottom'))
            visual_clips.append(text_clip)
            logger.info(f"Added text clip: {text} ({t1}-{t2}s)")
        except Exception as e:
            logger.error(f"Error creating text clip: {str(e)}")
            raise Exception(f"Failed to create text clip: {str(e)}")

    try:
        if not visual_clips:
            raise ValueError("No visual clips available for rendering")

        video = CompositeVideoClip(visual_clips)

        if audio_clips:
            audio = CompositeAudioClip(audio_clips)
            # Extend the last visual clip when the narration outlasts the video
            if video.duration < audio.duration:
                last_clip = visual_clips[-1]
                extended_clip = last_clip.set_end(audio.duration)
                visual_clips[-1] = extended_clip
                video = CompositeVideoClip(visual_clips)

            video = video.set_duration(audio.duration)
            video = video.set_audio(audio)
            logger.info(f"Audio synchronized with video (duration: {video.duration:.2f}s)")

        logger.info(f"Rendering final video to {OUTPUT_FILE_NAME}")
        video.write_videofile(OUTPUT_FILE_NAME, codec='libx264', audio_codec='aac', fps=25, preset='veryfast')

        # Clean up the files that were actually downloaded
        for video_filename in downloaded_files:
            if Path(video_filename).exists():
                os.remove(video_filename)
                logger.info(f"Cleaned up temporary file: {video_filename}")

        if not Path(OUTPUT_FILE_NAME).exists():
            raise FileNotFoundError(f"Failed to create output video at {OUTPUT_FILE_NAME}")

        logger.info(f"Successfully rendered video at {OUTPUT_FILE_NAME}")
        return OUTPUT_FILE_NAME

    except Exception as e:
        logger.error(f"Error rendering video: {str(e)}")
        raise Exception(f"Video rendering failed: {str(e)}")
utility/script_generator.py
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from openai import OpenAI
|
3 |
+
import json
|
4 |
+
|
5 |
+
|
6 |
+
from groq import Groq
|
7 |
+
model = "mixtral-8x7b-32768"
|
8 |
+
client = Groq(
|
9 |
+
api_key=os.environ.get("GROQ_API_KEY"),
|
10 |
+
)
|
11 |
+
|
12 |
+
def generate_script(topic):
    """Generate a facts-style YouTube Shorts script for *topic* via Groq.

    Args:
        topic (str): The requested facts topic, e.g. "Weird facts".

    Returns:
        str: The generated script text, or a fallback message when the
        model's response cannot be parsed as JSON.
    """
    prompt = (
        """You are a seasoned content writer for a YouTube Shorts channel, specializing in facts videos.
Your facts shorts are concise, each lasting less than 50 seconds (approximately 140 words).
They are incredibly engaging and original. When a user requests a specific type of facts short, you will create it.

For instance, if the user asks for:
Weird facts
You would produce content like this:

Weird facts you don't know:
- Bananas are berries, but strawberries aren't.
- A single cloud can weigh over a million pounds.
- There's a species of jellyfish that is biologically immortal.
- Honey never spoils; archaeologists have found pots of honey in ancient Egyptian tombs that are over 3,000 years old and still edible.
- The shortest war in history was between Britain and Zanzibar on August 27, 1896. Zanzibar surrendered after 38 minutes.
- Octopuses have three hearts and blue blood.

You are now tasked with creating the best short script based on the user's requested type of 'facts'.

Keep it brief, highly interesting, and unique.

Stictly output the script in a JSON format like below, and only provide a parsable JSON object with the key 'script'.

# Output
{"script": "Here is the script ..."}
"""
    )

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": topic}
        ]
    )
    content = response.choices[0].message.content
    try:
        # Basic cleanup of common JSON formatting issues
        content = content.strip()
        # Strip markdown code fences the model sometimes wraps around JSON
        # (same handling as utility/video_search_query_generator.py).
        if content.startswith("```"):
            content = content.replace("```json", "").replace("```", "").strip()
        # Parse JSON directly
        response_dict = json.loads(content)
        script = response_dict["script"]
    except Exception as e:
        print(f"Error parsing script: {e}")
        print("Raw content:", content)
        script = "Failed to generate script. Please try again."

    return script
utility/text_audio_cleaner.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import os
|
3 |
+
from pydub import AudioSegment
|
4 |
+
|
5 |
+
def clean_text(text):
    """Strip punctuation and surrounding whitespace from *text*."""
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    return without_punctuation.strip()
12 |
+
|
13 |
+
def process_audio(audio_path, output_path):
    """Normalize the audio at *audio_path* and export it as WAV.

    Returns:
        str | None: *output_path* on success, None when processing fails.
    """
    try:
        segment = AudioSegment.from_file(audio_path)
        # Simple loudness normalization before export; more sophisticated
        # processing could be slotted in here.
        segment.normalize().export(output_path, format="wav")
    except Exception as e:
        print(f"Error processing audio: {e}")
        return None
    return output_path
26 |
+
|
27 |
+
def convert_to_wav(input_file):
    """Convert any audio file to WAV format.

    Returns:
        str | None: Path of the new .wav file, or None on failure.
    """
    try:
        segment = AudioSegment.from_file(input_file)
        # Output path: same basename with a .wav extension.
        base, _ = os.path.splitext(input_file)
        wav_path = base + ".wav"
        segment.export(wav_path, format="wav")
        return wav_path
    except Exception as e:
        print(f"Error converting to WAV: {e}")
        return None
utility/timed_captions_generator.py
ADDED
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import whisper_timestamped as whisper
|
2 |
+
from whisper_timestamped import load_model, transcribe_timestamped
|
3 |
+
import re
|
4 |
+
|
5 |
+
def generate_timed_captions(audio_filename, model_size="base"):
    """Transcribe *audio_filename* with whisper-timestamped and return
    a list of ((start, end), caption) pairs."""
    whisper_model = load_model(model_size)
    analysis = transcribe_timestamped(whisper_model, audio_filename, verbose=False, fp16=False)
    return getCaptionsWithTime(analysis)
11 |
+
|
12 |
+
def splitWordsBySize(words, maxCaptionSize):
    """Group *words* into caption strings of at most *maxCaptionSize* chars.

    A caption is also cut early once it reaches half the maximum size (and
    words remain), which keeps caption lengths visually balanced.

    Args:
        words (list[str]): Words in display order.
        maxCaptionSize (int): Maximum characters per caption.

    Returns:
        list[str]: The caption strings.
    """
    halfCaptionSize = maxCaptionSize / 2
    captions = []
    i = 0
    n = len(words)
    # Index-based scan replaces the original repeated `words = words[1:]`
    # slicing, which copied the remaining list on every word (O(n^2)).
    while i < n:
        caption = words[i]
        i += 1
        while i < n and len(caption + ' ' + words[i]) <= maxCaptionSize:
            caption += ' ' + words[i]
            i += 1
            if len(caption) >= halfCaptionSize and i < n:
                break
        captions.append(caption)
    return captions
26 |
+
|
27 |
+
def getTimestampMapping(whisper_analysis):
    """Map character ranges of the transcript to word end-times.

    Keys are (start_index, end_index) character positions — each word's
    length plus one trailing separator — and values are the word's end
    timestamp from the whisper analysis.
    """
    mapping = {}
    cursor = 0
    for segment in whisper_analysis['segments']:
        for word in segment['words']:
            next_cursor = cursor + len(word['text']) + 1
            mapping[(cursor, next_cursor)] = word['end']
            cursor = next_cursor
    return mapping
37 |
+
|
38 |
+
def cleanWord(word):
    """Drop everything except word chars, whitespace, hyphen, underscore
    and quote characters from *word*."""
    # Character class kept exactly as in the original implementation.
    disallowed = r'[^\w\s\-_"\'\']'
    return re.sub(disallowed, '', word)
41 |
+
|
42 |
+
def interpolateTimeFromDict(word_position, d):
    """Return the timestamp whose (start, end) range contains
    *word_position*, or None when no range matches."""
    return next(
        (timestamp for (start, end), timestamp in d.items()
         if start <= word_position <= end),
        None,
    )
48 |
+
|
49 |
+
def getCaptionsWithTime(whisper_analysis, maxCaptionSize=15, considerPunctuation=False):
    """Convert a whisper-timestamped analysis into timed caption pairs.

    Args:
        whisper_analysis (dict): Output of transcribe_timestamped; must
            contain 'text' and 'segments' with per-word timings.
        maxCaptionSize (int): Maximum characters per caption chunk.
        considerPunctuation (bool): When True, split on sentence boundaries
            first so captions do not straddle sentences.

    Returns:
        list: ((start_time, end_time), caption) tuples.
    """
    wordLocationToTime = getTimestampMapping(whisper_analysis)
    position = 0       # running character offset into the transcript
    start_time = 0
    CaptionsPairs = []
    text = whisper_analysis['text']

    if considerPunctuation:
        # Split into sentences first, then chunk each sentence.
        sentences = re.split(r'(?<=[.!?]) +', text)
        words = [word for sentence in sentences for word in splitWordsBySize(sentence.split(), maxCaptionSize)]
    else:
        words = text.split()
        # NOTE(review): cleanWord can shorten chunks, so `position` may
        # drift from the character offsets recorded in wordLocationToTime —
        # confirm this is the intended approximation.
        words = [cleanWord(word) for word in splitWordsBySize(words, maxCaptionSize)]

    for word in words:
        # +1 accounts for the separator that getTimestampMapping also counts.
        position += len(word) + 1
        end_time = interpolateTimeFromDict(position, wordLocationToTime)
        # Skip chunks with no resolvable end time or empty text.
        if end_time and word:
            CaptionsPairs.append(((start_time, end_time), word))
            start_time = end_time

    return CaptionsPairs
utility/utils.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from datetime import datetime
|
3 |
+
import json
|
4 |
+
|
5 |
+
# Log types
LOG_TYPE_GPT = "GPT"
LOG_TYPE_PEXEL = "PEXEL"

# log directory paths
DIRECTORY_LOG_GPT = ".logs/gpt_logs"
DIRECTORY_LOG_PEXEL = ".logs/pexel_logs"

# Map each log type to its (directory, filename suffix) so both branches
# share a single code path.
_LOG_DESTINATIONS = {
    LOG_TYPE_GPT: (DIRECTORY_LOG_GPT, "gpt3"),
    LOG_TYPE_PEXEL: (DIRECTORY_LOG_PEXEL, "pexel"),
}

# method to log response from pexel and openai
def log_response(log_type, query, response):
    """Write a timestamped JSON log entry for an API response.

    Args:
        log_type (str): One of LOG_TYPE_GPT or LOG_TYPE_PEXEL; unknown
            types are silently ignored (matching the original behavior).
        query: The query sent to the API.
        response: The (JSON-serializable) API response payload.
    """
    destination = _LOG_DESTINATIONS.get(log_type)
    if destination is None:
        return

    directory, suffix = destination
    # exist_ok avoids a race between the existence check and creation.
    os.makedirs(directory, exist_ok=True)

    log_entry = {
        "query": query,
        "response": response,
        "timestamp": datetime.now().isoformat()
    }
    filename = '{}_{}.txt'.format(datetime.now().strftime("%Y%m%d_%H%M%S"), suffix)
    filepath = os.path.join(directory, filename)
    with open(filepath, "w") as outfile:
        outfile.write(json.dumps(log_entry) + '\n')
utility/video_search_query_generator.py
ADDED
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from openai import OpenAI
|
2 |
+
import os
|
3 |
+
import json
|
4 |
+
import re
|
5 |
+
from datetime import datetime
|
6 |
+
from utility.utils import log_response,LOG_TYPE_GPT
|
7 |
+
|
8 |
+
from groq import Groq
|
9 |
+
model = "llama3-70b-8192"
|
10 |
+
client = Groq(
|
11 |
+
api_key=os.environ.get("GROQ_API_KEY"),
|
12 |
+
)
|
13 |
+
|
14 |
+
log_directory = ".logs/gpt_logs"
|
15 |
+
|
16 |
+
prompt = """# Instructions
|
17 |
+
|
18 |
+
Given the following video script and timed captions, extract three visually concrete and specific keywords for each time segment that can be used to search for background videos. The keywords should be short and capture the main essence of the sentence. They can be synonyms or related terms. If a caption is vague or general, consider the next timed caption for more context. If a keyword is a single word, try to return a two-word keyword that is visually concrete. If a time frame contains two or more important pieces of information, divide it into shorter time frames with one keyword each. Ensure that the time periods are strictly consecutive and cover the entire length of the video. Each keyword should cover between 2-4 seconds. The output should be in JSON format, like this: [[[t1, t2], ["keyword1", "keyword2", "keyword3"]], [[t2, t3], ["keyword4", "keyword5", "keyword6"]], ...]. Please handle all edge cases, such as overlapping time segments, vague or general captions, and single-word keywords.
|
19 |
+
|
20 |
+
For example, if the caption is 'The cheetah is the fastest land animal, capable of running at speeds up to 75 mph', the keywords should include 'cheetah running', 'fastest animal', and '75 mph'. Similarly, for 'The Great Wall of China is one of the most iconic landmarks in the world', the keywords should be 'Great Wall of China', 'iconic landmark', and 'China landmark'.
|
21 |
+
|
22 |
+
Important Guidelines:
|
23 |
+
|
24 |
+
Use only English in your text queries.
|
25 |
+
Each search string must depict something visual.
|
26 |
+
The depictions have to be extremely visually concrete, like rainy street, or cat sleeping.
|
27 |
+
'emotional moment' <= BAD, because it doesn't depict something visually.
|
28 |
+
'crying child' <= GOOD, because it depicts something visual.
|
29 |
+
The list must always contain the most relevant and appropriate query searches.
|
30 |
+
['Car', 'Car driving', 'Car racing', 'Car parked'] <= BAD, because it's 4 strings.
|
31 |
+
['Fast car'] <= GOOD, because it's 1 string.
|
32 |
+
['Un chien', 'une voiture rapide', 'une maison rouge'] <= BAD, because the text query is NOT in English.
|
33 |
+
|
34 |
+
Note: Your response should be the response only and no extra text or data.
|
35 |
+
"""
|
36 |
+
|
37 |
+
def fix_json(json_str):
    """Repair common JSON issues produced by the LLM before parsing.

    Replaces typographic (curly) quotes with straight ASCII quotes and
    patches one known mis-escaped literal. (The quote characters in the
    original source were garbled to mojibake, making the replacements
    no-ops; restored per the original comments' stated intent.)
    """
    # Replace typographical apostrophes with straight quotes
    json_str = json_str.replace("\u2019", "'").replace("\u2018", "'")
    # Replace curly double quotes with straight double quotes
    json_str = json_str.replace("\u201c", '"').replace("\u201d", '"')
    # Add escaping for quotes within the strings (known bad pattern)
    json_str = json_str.replace('"you didn"t"', '"you didn\'t"')
    return json_str
45 |
+
|
46 |
+
def getVideoSearchQueriesTimed(script, captions_timed):
    """Ask the LLM for timed video search keywords covering the captions.

    Args:
        script (str): Full video script.
        captions_timed (list): ((start, end), text) caption pairs.

    Returns:
        list | None: [[[t1, t2], [kw, ...]], ...] covering the video, or
        None when captions are empty or the LLM output is unusable.
    """
    # Guard: no captions means no time range to cover (previously an
    # IndexError on captions_timed[-1]).
    if not captions_timed:
        return None
    end = captions_timed[-1][0][1]
    try:
        out = [[[0, 0], ""]]
        attempts = 0
        # Retry until the segments reach the end of the video, but bound
        # the loop so a persistently-short LLM answer cannot spin forever.
        while out[-1][0][1] != end and attempts < 5:
            attempts += 1
            content = call_OpenAI(script, captions_timed).replace("'", '"')
            try:
                out = json.loads(content)
            except Exception as e:
                print("content: \n", content, "\n\n")
                print(e)
                # Strip markdown fences and repair quotes, then retry once.
                content = fix_json(content.replace("```json", "").replace("```", ""))
                out = json.loads(content)
        return out
    except Exception as e:
        print("error in response", e)

    return None
65 |
+
|
66 |
+
def call_OpenAI(script, captions_timed):
    """Send the script and timed captions to the LLM and return its reply.

    The reply is whitespace-collapsed and logged before being returned.

    Args:
        script (str): Full video script.
        captions_timed (list): ((start, end), text) caption pairs.

    Returns:
        str: The model's response text, collapsed to single spaces.
    """
    user_content = """Script: {}
Timed Captions:{}
""".format(script, "".join(map(str, captions_timed)))
    print("Content", user_content)

    response = client.chat.completions.create(
        model=model,
        temperature=1,
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": user_content}
        ]
    )

    text = response.choices[0].message.content.strip()
    # Raw string fixes the invalid '\s' escape (DeprecationWarning on
    # modern Python).
    text = re.sub(r'\s+', ' ', text)
    print("Text", text)
    log_response(LOG_TYPE_GPT, script, text)
    return text
86 |
+
|
87 |
+
def merge_empty_intervals(segments):
    """Merge runs of URL-less intervals into their preceding segment.

    Args:
        segments (list | None): [[ [t1, t2], url_or_None ], ...] entries,
            expected in chronological order.

    Returns:
        list: Segments where consecutive None-URL intervals have been
        absorbed into the previous segment when contiguous with it.
    """
    if segments is None:
        return []

    merged = []
    i = 0
    while i < len(segments):

        interval, url = segments[i]
        if url is None:
            # Find consecutive None intervals
            j = i + 1
            while j < len(segments) and segments[j][1] is None:
                j += 1

            # Merge consecutive None intervals with the previous valid URL
            if i > 0:
                prev_interval, prev_url = merged[-1]
                if prev_url is not None and prev_interval[1] == interval[0]:
                    # Contiguous: extend the previous segment to cover the
                    # whole None run.
                    merged[-1] = [[prev_interval[0], segments[j-1][0][1]], prev_url]
                else:
                    # NOTE(review): this branch reuses prev_url even though
                    # the intervals are NOT contiguous (or prev_url is
                    # None), and drops segments i+1..j-1 — confirm whether
                    # [interval, None] was intended here instead.
                    merged.append([interval, prev_url])
            else:
                # Leading None run: keep the first interval with no URL.
                merged.append([interval, None])

            i = j
        else:
            merged.append([interval, url])
            i += 1

    return merged