youngtsai committed on
Commit
1630bbe
·
1 Parent(s): 48d17f9

adjusted_segments

Browse files
Files changed (2) hide show
  1. app.py +44 -21
  2. requirements.txt +2 -1
app.py CHANGED
@@ -8,6 +8,9 @@ from openai import OpenAI
8
  from groq import Groq
9
  import uuid
10
  from gtts import gTTS
 
 
 
11
 
12
  from youtube_transcript_api import YouTubeTranscriptApi
13
  from youtube_transcript_api._errors import NoTranscriptFound
@@ -377,32 +380,52 @@ def generate_transcription(video_id):
377
  }],
378
  'outtmpl': outtmpl,
379
  }
380
- print("===download video mp3===")
381
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
382
  ydl.download([youtube_url])
383
 
384
  audio_path = f"{OUTPUT_PATH}/{video_id}.{codec_name}"
385
- print("===transcription by open ai===")
386
- with open(audio_path, "rb") as audio_file:
387
- srt_content = OPEN_AI_CLIENT.audio.transcriptions.create(
388
- model="whisper-1",
389
- file=audio_file,
390
- response_format="verbose_json",
391
- timestamp_granularities=["segment"],
392
- prompt="Transcribe the following audio file. if chinese, please using 'language: zh-TW' ",
393
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
394
 
395
- # get segments
396
- segments = srt_content.segments
397
- transcription = [
398
- {
399
- "text": item["text"],
400
- "start": int(item["start"]),
401
- "duration": int(item["end"] - item["start"])
402
- }
403
- for item in segments
404
- ]
405
- return transcription
406
 
407
  def process_transcript_and_screenshots(video_id):
408
  print("====process_transcript_and_screenshots====")
 
8
  from groq import Groq
9
  import uuid
10
  from gtts import gTTS
11
+ import math
12
+ from pydub import AudioSegment
13
+
14
 
15
  from youtube_transcript_api import YouTubeTranscriptApi
16
  from youtube_transcript_api._errors import NoTranscriptFound
 
380
  }],
381
  'outtmpl': outtmpl,
382
  }
383
+
384
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
385
  ydl.download([youtube_url])
386
 
387
  audio_path = f"{OUTPUT_PATH}/{video_id}.{codec_name}"
388
+ full_audio = AudioSegment.from_mp3(audio_path)
389
+
390
+ max_part_duration = 10 * 60 * 1000 # 10 minutes
391
+ full_duration = len(full_audio) # in milliseconds
392
+ parts = math.ceil(full_duration / max_part_duration)
393
+ print(f"parts: {parts}")
394
+ transcription = []
395
+
396
+ for i in range(parts):
397
+ print(f"== i: {i}==")
398
+ start_time = i * max_part_duration
399
+ end_time = min((i + 1) * max_part_duration, full_duration)
400
+ print(f"time: {start_time/1000} - {end_time/1000}")
401
+ chunk = full_audio[start_time:end_time]
402
+ chunk_path = f"{OUTPUT_PATH}/{video_id}_part_{i}.{codec_name}"
403
+ chunk.export(chunk_path, format=codec_name)
404
+
405
+ with open(chunk_path, "rb") as chunk_file:
406
+ response = OPEN_AI_CLIENT.audio.transcriptions.create(
407
+ model="whisper-1",
408
+ file=chunk_file,
409
+ response_format="verbose_json",
410
+ timestamp_granularities=["segment"],
411
+ prompt="Transcribe the following audio file. if chinese, please using 'language: zh-TW' ",
412
+ )
413
+
414
+ # Adjusting the timestamps for the chunk based on its position in the full audio
415
+ adjusted_segments = [{
416
+ 'text': segment['text'],
417
+ 'start': math.ceil(segment['start'] + start_time / 1000.0), # Converting milliseconds to seconds
418
+ 'end': math.ceil(segment['end'] + start_time / 1000.0),
419
+ 'duration': math.ceil(segment['end'] - segment['start'])
420
+ } for segment in response.segments]
421
 
422
+ transcription.extend(adjusted_segments)
423
+
424
+ # Remove temporary chunk files after processing
425
+ os.remove(chunk_path)
426
+
427
+ return transcription
428
+
 
 
 
 
429
 
430
  def process_transcript_and_screenshots(video_id):
431
  print("====process_transcript_and_screenshots====")
requirements.txt CHANGED
@@ -16,4 +16,5 @@ groq
16
  yt_dlp
17
  uuid
18
  gtts
19
- boto3
 
 
16
  yt_dlp
17
  uuid
18
  gtts
19
+ boto3
20
+ pydub