Spaces:
Sleeping
Sleeping
adjusted_segments
Browse files- app.py +44 -21
- requirements.txt +2 -1
app.py
CHANGED
@@ -8,6 +8,9 @@ from openai import OpenAI
|
|
8 |
from groq import Groq
|
9 |
import uuid
|
10 |
from gtts import gTTS
|
|
|
|
|
|
|
11 |
|
12 |
from youtube_transcript_api import YouTubeTranscriptApi
|
13 |
from youtube_transcript_api._errors import NoTranscriptFound
|
@@ -377,32 +380,52 @@ def generate_transcription(video_id):
|
|
377 |
}],
|
378 |
'outtmpl': outtmpl,
|
379 |
}
|
380 |
-
|
381 |
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
382 |
ydl.download([youtube_url])
|
383 |
|
384 |
audio_path = f"{OUTPUT_PATH}/{video_id}.{codec_name}"
|
385 |
-
|
386 |
-
|
387 |
-
|
388 |
-
|
389 |
-
|
390 |
-
|
391 |
-
|
392 |
-
|
393 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
394 |
|
395 |
-
|
396 |
-
|
397 |
-
|
398 |
-
|
399 |
-
|
400 |
-
|
401 |
-
|
402 |
-
}
|
403 |
-
for item in segments
|
404 |
-
]
|
405 |
-
return transcription
|
406 |
|
407 |
def process_transcript_and_screenshots(video_id):
|
408 |
print("====process_transcript_and_screenshots====")
|
|
|
8 |
from groq import Groq
|
9 |
import uuid
|
10 |
from gtts import gTTS
|
11 |
+
import math
|
12 |
+
from pydub import AudioSegment
|
13 |
+
|
14 |
|
15 |
from youtube_transcript_api import YouTubeTranscriptApi
|
16 |
from youtube_transcript_api._errors import NoTranscriptFound
|
|
|
380 |
}],
|
381 |
'outtmpl': outtmpl,
|
382 |
}
|
383 |
+
|
384 |
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
385 |
ydl.download([youtube_url])
|
386 |
|
387 |
audio_path = f"{OUTPUT_PATH}/{video_id}.{codec_name}"
|
388 |
+
full_audio = AudioSegment.from_mp3(audio_path)
|
389 |
+
|
390 |
+
max_part_duration = 10 * 60 * 1000 # 10 minutes
|
391 |
+
full_duration = len(full_audio) # in milliseconds
|
392 |
+
parts = math.ceil(full_duration / max_part_duration)
|
393 |
+
print(f"parts: {parts}")
|
394 |
+
transcription = []
|
395 |
+
|
396 |
+
for i in range(parts):
|
397 |
+
print(f"== i: {i}==")
|
398 |
+
start_time = i * max_part_duration
|
399 |
+
end_time = min((i + 1) * max_part_duration, full_duration)
|
400 |
+
print(f"time: {start_time/1000} - {end_time/1000}")
|
401 |
+
chunk = full_audio[start_time:end_time]
|
402 |
+
chunk_path = f"{OUTPUT_PATH}/{video_id}_part_{i}.{codec_name}"
|
403 |
+
chunk.export(chunk_path, format=codec_name)
|
404 |
+
|
405 |
+
with open(chunk_path, "rb") as chunk_file:
|
406 |
+
response = OPEN_AI_CLIENT.audio.transcriptions.create(
|
407 |
+
model="whisper-1",
|
408 |
+
file=chunk_file,
|
409 |
+
response_format="verbose_json",
|
410 |
+
timestamp_granularities=["segment"],
|
411 |
+
prompt="Transcribe the following audio file. if chinese, please using 'language: zh-TW' ",
|
412 |
+
)
|
413 |
+
|
414 |
+
# Adjusting the timestamps for the chunk based on its position in the full audio
|
415 |
+
adjusted_segments = [{
|
416 |
+
'text': segment['text'],
|
417 |
+
'start': math.ceil(segment['start'] + start_time / 1000.0), # Converting milliseconds to seconds
|
418 |
+
'end': math.ceil(segment['end'] + start_time / 1000.0),
|
419 |
+
'duration': math.ceil(segment['end'] - segment['start'])
|
420 |
+
} for segment in response.segments]
|
421 |
|
422 |
+
transcription.extend(adjusted_segments)
|
423 |
+
|
424 |
+
# Remove temporary chunk files after processing
|
425 |
+
os.remove(chunk_path)
|
426 |
+
|
427 |
+
return transcription
|
428 |
+
|
|
|
|
|
|
|
|
|
429 |
|
430 |
def process_transcript_and_screenshots(video_id):
|
431 |
print("====process_transcript_and_screenshots====")
|
requirements.txt
CHANGED
@@ -16,4 +16,5 @@ groq
|
|
16 |
yt_dlp
|
17 |
uuid
|
18 |
gtts
|
19 |
-
boto3
|
|
|
|
16 |
yt_dlp
|
17 |
uuid
|
18 |
gtts
|
19 |
+
boto3
|
20 |
+
pydub
|