# install packages
# !pip install --upgrade -q ipython-autotime
# %load_ext autotime

# Download youtube video
# !pip install -q pytube

# youtube video download function
import os
from pytube import YouTube


def progress_function(stream, chunk, bytes_remaining):
    # report download progress as a percentage
    total_size = stream.filesize
    bytes_downloaded = total_size - bytes_remaining
    percentage_of_completion = bytes_downloaded / total_size * 100
    print(f"Downloaded {percentage_of_completion:.1f}%")


def youtube_download(video_url):
    yt = YouTube(video_url, on_progress_callback=progress_function)
    # get video title
    video_title = yt.title
    print(f"Downloading video: {video_title}")
    stream = yt.streams.get_highest_resolution()
    # get video default name
    default_filename = stream.default_filename
    stream.download()
    return default_filename


# use insanely-fast-whisper
# !pip install --upgrade -q transformers optimum accelerate pyannote.audio
import re
import json
import torch
from transformers import pipeline
from pyannote.audio import Pipeline


# convert the transcript JSON (Whisper chunks) to plain text
def seconds_to_hms(seconds):
    # Simple conversion of seconds to HH:MM:SS format
    hours, remainder = divmod(seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{int(hours):02}:{int(minutes):02}:{int(seconds):02}"


def transcript_json2txt(segmented_transcript, file_path):
    # with open(file_path, 'r') as file:
    #     formatted_dialogue = json.load(file)
    # Generating the dialogue text
    formatted_dialogue = segmented_transcript
    dialogue_text = ""
    for dialogue in formatted_dialogue:
        # Converting start time to HH:MM:SS format
        start_time = seconds_to_hms(dialogue['timestamp'][0])
        # Formatting speaker name (empty if no diarization was run)
        speaker = dialogue.get('speaker', "").replace("SPEAKER_", "speaker")
        # Removing any leading/trailing whitespace from the text
        text = dialogue.get('text', "").strip()
        # Constructing each dialogue entry
        dialogue_text += f"{start_time}, {speaker}: {text}\n\n"

    # Checking the first part of the generated dialogue text
    print("preview txt...")
    print('---------------------------------\n')
    print(dialogue_text[:500])  # Displaying the first 500 characters for review

    # Save the dialogue text to a file
    output_txt_file_path = file_path.replace('.json', '.txt')
    with open(output_txt_file_path, 'w', encoding="utf8") as file:
        file.write(dialogue_text)
    print(
        f"Voila!✨ Your file has been transcribed, go check it out over here 👉 {output_txt_file_path}"
    )
    return dialogue_text


# transcript function
model_name = "openai/whisper-large-v3"
flash = False  # Set to True to use Flash Attention 2

print('---------------------------------')
print('load pipe...')
print('---------------------------------')

# Initialize the ASR pipeline
pipe = pipeline(
    "automatic-speech-recognition",
    model=model_name,
    torch_dtype=torch.float16,
    # low_cpu_mem_usage=True,
    device='cuda:0',
    model_kwargs={"use_flash_attention_2": flash},
)


def transcript(file_path, pipe=pipe):
    pattern = r'\.mp4|\.wav|\.mp3'
    transcript_path = re.sub(pattern, '.json', file_path)

    # transcription settings (several are kept for reference and not used below)
    device_id = "0"  # or "mps" for Macs with Apple Silicon
    device = "cuda"  # or "mps" for Macs with Apple Silicon
    task = "transcribe"  # or "translate"
    language = 'Chinese'  # unused; the language kwarg is commented out below so Whisper auto-detects
    batch_size = 24
    timestamp = "chunk"  # or "word"
    diarization_model = "pyannote/speaker-diarization-3.1"

    # Transcribe the audio
    print('Transcribing...')
    print('---------------------------------\n')
    outputs = pipe(
        file_path,
        chunk_length_s=30,
        batch_size=batch_size,
        # generate_kwargs={"task": task, "language": language},
        generate_kwargs={"task": task},
        return_timestamps=True,
    )
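    # `outputs` is a dict with the full transcript under "text" and a list of
    # chunks under "chunks"; with return_timestamps=True each chunk has the form
    # {"timestamp": (start_seconds, end_seconds), "text": "..."}, which is the
    # structure the JSON/TXT/SRT writers below rely on.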
    # Save or display the output
    print('Saving transcript...')
    print('---------------------------------\n')
    with open(transcript_path, "w", encoding="utf8") as fp:
        json.dump(outputs, fp, ensure_ascii=False)
    print(
        f"Voila!✨ Your file has been transcribed, go check it out over here 👉 {transcript_path}"
    )

    # save to transcript txt file
    transcript_txt = transcript_json2txt(outputs['chunks'], transcript_path)
    transcript_txt_path = transcript_path.replace('.json', '.txt')

    # save to srt file
    # Function to convert time in seconds to SRT time format
    def convert_to_srt_time(timestamp):
        hours = int(timestamp // 3600)
        minutes = int((timestamp % 3600) // 60)
        seconds = int(timestamp % 60)
        milliseconds = int((timestamp % 1) * 1000)
        return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"

    # Creating the SRT content
    srt_content = ""
    for index, entry in enumerate(outputs['chunks']):
        try:
            start_time = convert_to_srt_time(entry['timestamp'][0])
            # the last chunk can have a missing end timestamp; fall back to start + 1s
            end_time = convert_to_srt_time(
                entry['timestamp'][1] if entry['timestamp'][1] is not None else entry['timestamp'][0] + 1
            )
            srt_content += f"{index + 1}\n{start_time} --> {end_time}\n{entry['text']}\n\n"
        except Exception as e:
            print(e)
            print(entry)

    # Saving the SRT content to a file
    srt_file_path = transcript_path.replace('.json', '.srt')
    # srt_file_path = '/kaggle/working/6-revolution_transcript.srt'
    with open(srt_file_path, 'w', encoding="utf8") as file:
        file.write(srt_content)
    print(
        f"Voila!✨ Your file has been transcribed, go check it out over here 👉 {srt_file_path}"
    )

    return transcript_txt, srt_file_path


# youtube transcript function
def transcript_youtube(url):
    # download youtube video
    default_filename = youtube_download(url)
    file_path = os.path.join(os.getcwd(), default_filename)
    transcript_txt, srt_file_path = transcript(file_path)
    return transcript_txt[:500], file_path, srt_file_path


# test youtube transcript
# url = "https://www.youtube.com/watch?v=2UP7pfGVm0Y&t=252s&ab_channel=TheTEFLOrg"
# transcript_youtube(url)

# gradio interface
# !pip install --upgrade -q gradio
import gradio as gr

title = "Fast audio transcript"
description = "Input your audio or record your audio"


# simple placeholder callbacks for the audio / file interfaces
def audio_func(audio_file):
    return f"This is the audio file path: {audio_file}"


def file_func(file_path):
    return f"This is the file path: {file_path}"


audio_input = gr.Audio(type='filepath')
file_input = gr.File(type="filepath")

youtube_interface = gr.Interface(
    fn=transcript_youtube,
    inputs=gr.Textbox(label="youtube video", info="Input a youtube video url"),
    outputs=[
        gr.Textbox(label="Transcript preview", lines=3),
        gr.File(label="Download Video"),
        gr.File(label="Srt file"),
    ],
    title="Fast YouTube Video Transcript",
    description="Transcribe any YouTube video in seconds!",
)

audio_interface = gr.Interface(
    fn=audio_func,
    inputs=audio_input,
    outputs=[gr.Textbox(label="Greeting", lines=3)],
    title=title,
    description=description,
)

file_interface = gr.Interface(
    fn=file_func,
    inputs=file_input,
    outputs=[gr.Textbox(label="Greeting", lines=3)],
    title=title,
    description=description,
)

# only the YouTube tab is exposed for now; the audio/file interfaces are placeholders
demo = gr.TabbedInterface([youtube_interface], ["Transcript youtube video"])
demo.queue(max_size=20)
demo.launch(share=True)
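

# The script imports pyannote.audio and keeps a diarization_model setting, but
# the diarization pipeline is never actually run, so the "speaker" field that
# transcript_json2txt looks for is always empty. Below is a minimal sketch of
# how speaker labels could be attached to the Whisper chunks before writing the
# TXT file. add_speakers and its hf_token argument are illustrative additions,
# not part of the original script; the sketch assumes a Hugging Face token that
# has been granted access to pyannote/speaker-diarization-3.1 and an input file
# in an audio format pyannote can load (e.g. WAV).
def add_speakers(chunks, audio_path, hf_token):
    # load the gated diarization pipeline from the Hub
    dia = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1", use_auth_token=hf_token
    )
    dia.to(torch.device("cuda"))
    annotation = dia(audio_path)
    for chunk in chunks:
        start, end = chunk['timestamp']
        midpoint = start if end is None else (start + end) / 2
        # label the chunk with the speaker whose turn covers its midpoint
        for turn, _, speaker in annotation.itertracks(yield_label=True):
            if turn.start <= midpoint <= turn.end:
                chunk['speaker'] = speaker  # e.g. "SPEAKER_00"
                break
    return chunks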