# Spaces status banner captured with this file: "Runtime error"
# install packages
# !pip install --upgrade -q ipython-autotime
# %load_ext autotime
# Download youtube video
# !pip install -q pytube
# youtube video download function
import os | |
from pytube import YouTube | |
def progress_function(stream, chunk, bytes_remaining):
    """Print how much of a download has completed, as a percentage.

    The (stream, chunk, bytes_remaining) signature matches the way this
    function is registered as ``on_progress_callback`` in ``youtube_download``.

    Args:
        stream: Object exposing the total download size as ``filesize``.
        chunk: The data chunk that was just received (unused).
        bytes_remaining: Number of bytes still to be downloaded.
    """
    total_size = stream.filesize
    if not total_size:
        # A zero or missing size would make the division below raise
        # ZeroDivisionError from inside the callback; skip the report instead.
        return
    bytes_downloaded = total_size - bytes_remaining
    percentage_of_completion = bytes_downloaded / total_size * 100
    print(f"Downloaded {percentage_of_completion}%")
def youtube_download(video_url):
    """Download a YouTube video and return the file name it was saved under.

    Args:
        video_url: URL of the video to download.

    Returns:
        The stream's default file name (a bare name, not a full path).

    Raises:
        ValueError: If no stream could be selected for the video.
    """
    yt = YouTube(video_url, on_progress_callback=progress_function)
    print(f"Downloading video: {yt.title}")

    stream = yt.streams.get_highest_resolution()
    if stream is None:
        # Without this check the failure surfaces as an opaque
        # "'NoneType' object has no attribute 'default_filename'".
        raise ValueError(f"No downloadable stream found for {video_url}")

    default_filename = stream.default_filename
    # download() is called without an output path; transcript_youtube
    # assumes the file ends up in the current working directory.
    stream.download()
    return default_filename
# use insanely-fast-whisper | |
# !pip install --upgrade -q transformers optimum accelerate pyannote.audio | |
import re | |
import json | |
import torch | |
from transformers import pipeline | |
from pyannote.audio import Pipeline | |
# convert transcript chunks to plain text
import json | |
def seconds_to_hms(seconds):
    """Format a duration given in seconds as ``HH:MM:SS``.

    Fractional seconds are dropped; each field is zero-padded to at least
    two digits.
    """
    whole_hours, leftover = divmod(seconds, 3600)
    whole_minutes, whole_seconds = divmod(leftover, 60)
    fields = (whole_hours, whole_minutes, whole_seconds)
    return ":".join(f"{int(field):02}" for field in fields)
def transcript_json2txt(segmented_transcript, file_path):
    """Render transcript chunks as plain text, save the text, and return it.

    Every chunk becomes one ``HH:MM:SS, speaker: text`` entry followed by a
    blank line. The first 500 characters are printed as a preview.

    Args:
        segmented_transcript: Iterable of dicts. Each must provide a
            ``'timestamp'`` sequence whose first item is the start time in
            seconds; ``'speaker'`` and ``'text'`` are optional and default
            to an empty string.
        file_path: Path of the transcript file. The text is written next to
            it, with the final extension replaced by ``.txt``.

    Returns:
        The complete dialogue text.
    """
    entries = []
    for dialogue in segmented_transcript:
        start_time = seconds_to_hms(dialogue['timestamp'][0])
        # "SPEAKER_00" style labels are shown as "speaker00".
        speaker = dialogue.get('speaker', "").replace("SPEAKER_", "speaker")
        text = dialogue.get('text', "").strip()
        entries.append(f"{start_time}, {speaker}: {text}\n\n")
    dialogue_text = "".join(entries)

    print("preview txt...")
    print('---------------------------------\n')
    print(dialogue_text[:500])

    # Swap only the final extension. A plain str.replace('.json', '.txt')
    # also rewrites '.json' inside directory names and, when the path has no
    # '.json' at all, returns the input path so the input gets overwritten.
    base_path, _ = os.path.splitext(file_path)
    output_txt_file_path = f"{base_path}.txt"
    with open(output_txt_file_path, 'w', encoding="utf8") as file:
        file.write(dialogue_text)

    # \u2728 (sparkles) and \U0001F449 (pointing hand) replace the
    # mis-decoded characters that were embedded in this message.
    print(
        f"Voila!\u2728 Your file has been transcribed go check it out over here \U0001F449 {output_txt_file_path}"
    )
    return dialogue_text
# transcript function | |
# Whisper checkpoint loaded into the speech-recognition pipeline.
model_name = "openai/whisper-large-v3"
flash = False  # Set to True to use Flash Attention 2

print('---------------------------------')
print('load pipe...')
print('---------------------------------')

# Pick the device at runtime: an unconditional 'cuda:0' makes the script fail
# at startup on hosts without a CUDA device. Half precision is kept for the
# GPU only; the CPU fallback uses float32.
_pipe_device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
_pipe_on_gpu = _pipe_device != 'cpu'
_pipe_dtype = torch.float16 if _pipe_on_gpu else torch.float32

# Initialize the pipeline once at import time; transcript() uses it as the
# default value of its ``pipe`` parameter.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model_name,
    torch_dtype=_pipe_dtype,
    # low_cpu_mem_usage=True,
    device=_pipe_device,
    # Flash Attention 2 is only requested when running on the GPU.
    model_kwargs={"use_flash_attention_2": flash and _pipe_on_gpu},
)
def _srt_timestamp(timestamp):
    """Format a time in seconds as an SRT ``HH:MM:SS,mmm`` timestamp."""
    # Round to whole milliseconds first: truncating (timestamp % 1) * 1000
    # loses a millisecond to float error (1.16 would give 159, not 160).
    total_ms = int(round(timestamp * 1000))
    total_seconds, milliseconds = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"


def _build_srt(chunks):
    """Build SRT text from transcript chunks, skipping unusable chunks."""
    cues = []
    for entry in chunks:
        try:
            start = entry['timestamp'][0]
            end = entry['timestamp'][1]
            if end is None:
                # No end time available: show the cue for one second.
                end = start + 1
            cue = (
                f"{len(cues) + 1}\n"
                f"{_srt_timestamp(start)} --> {_srt_timestamp(end)}\n"
                f"{entry['text']}\n\n"
            )
        except (KeyError, IndexError, TypeError, ValueError) as e:
            # Best effort: report the malformed chunk and keep going.
            print(e)
            print(entry)
            continue
        # Numbering by emitted cues keeps the sequence free of gaps when a
        # chunk was skipped above.
        cues.append(cue)
    return "".join(cues)


def transcript(file_path, pipe=pipe):
    """Transcribe a media file and write JSON, TXT and SRT transcripts.

    The three output files are written next to ``file_path``, sharing its
    name with the final extension replaced.

    Args:
        file_path: Path of the audio/video file to transcribe.
        pipe: Callable speech-recognition pipeline. Its result must contain
            a ``'chunks'`` list of dicts with ``'timestamp'`` (start, end)
            and ``'text'`` entries. Defaults to the module-level pipeline.

    Returns:
        A ``(transcript_txt, srt_file_path)`` tuple: the plain-text
        transcript and the path of the generated SRT file.
    """
    # Derive output paths from the final extension only. The previous regex
    # substitution left the path untouched for extensions other than
    # mp4/wav/mp3, so every output file overwrote the input media file.
    base_path, _ = os.path.splitext(file_path)
    transcript_path = f"{base_path}.json"
    srt_file_path = f"{base_path}.srt"

    task = "transcribe"  # or "translate"
    batch_size = 24

    print('Transcribing...')
    print('---------------------------------\n')
    outputs = pipe(
        file_path,
        chunk_length_s=30,
        batch_size=batch_size,
        # No "language" entry is passed in generate_kwargs.
        generate_kwargs={"task": task},
        return_timestamps=True,
    )

    print('Saving transcript...')
    print('---------------------------------\n')
    with open(transcript_path, "w", encoding="utf8") as fp:
        json.dump(outputs, fp, ensure_ascii=False)
    # \u2728 (sparkles) and \U0001F449 (pointing hand) replace the
    # mis-decoded characters that were embedded in these messages.
    print(
        f"Voila!\u2728 Your file has been transcribed go check it out over here \U0001F449 {transcript_path}"
    )

    # Plain-text transcript (written by transcript_json2txt).
    transcript_txt = transcript_json2txt(outputs['chunks'], transcript_path)

    # Subtitle file.
    srt_content = _build_srt(outputs['chunks'])
    with open(srt_file_path, 'w', encoding="utf8") as file:
        file.write(srt_content)
    print(
        f"Voila!\u2728 Your file has been transcribed go check it out over here \U0001F449 {srt_file_path}"
    )
    return transcript_txt, srt_file_path
# youtube transcript function | |
def transcript_youtube(url):
    """Download a YouTube video, transcribe it, and return the results.

    Returns:
        A tuple of (first 500 characters of the transcript text,
        path of the downloaded video, path of the SRT file).
    """
    # The downloaded file is looked up in the current working directory.
    video_path = os.path.join(os.getcwd(), youtube_download(url))
    full_text, srt_file_path = transcript(video_path)
    preview = full_text[:500]
    return preview, video_path, srt_file_path
# test youtube transcript | |
# url = "https://www.youtube.com/watch?v=2UP7pfGVm0Y&t=252s&ab_channel=TheTEFLOrg" | |
# transcript_youtube(url) | |
# gradio interface | |
# !pip install --upgrade -q gradio | |
import gradio as gr | |
# Heading and help text passed to gr.Interface as title/description.
title, description = (
    "Fastly audio transcript",
    "Input your audio or record your audio",
)
def audio_func(audio_file):
    """Return a message that echoes the received audio file path."""
    return "This is the audio file path: {}".format(audio_file)
def file_func(file_path):
    """Return a message that echoes the received file path."""
    return "This is the file path: {}".format(file_path)
# Input widgets for the audio and file interfaces, both created with
# type "filepath".
audio_input = gr.Audio(type='filepath')
file_input = gr.File(type="filepath")

# YouTube tab: one URL textbox in; transcript preview, downloaded video and
# SRT file out (the three values returned by transcript_youtube).
_youtube_url_box = gr.Textbox(label="youtube video", info="Input a youtube video url")
_youtube_outputs = [
    gr.Textbox(label="Transcript preview", lines=3),
    gr.File(label="Download Video"),
    gr.File(label="Srt file"),
]
youtube_interface = gr.Interface(
    fn=transcript_youtube,
    inputs=_youtube_url_box,
    outputs=_youtube_outputs,
    title="Fastly Youtube Video Transcrip",
    description="Transcript Any Youtube video in Seconds!!!",
)


def _echo_interface(handler, source):
    """Wrap *handler* in an Interface with a single "Greeting" text output."""
    return gr.Interface(
        fn=handler,
        inputs=source,
        outputs=[gr.Textbox(label="Greeting", lines=3)],
        title=title,
        description=description,
    )


# These two interfaces are constructed but not included in the tabbed demo
# below.
audio_interface = _echo_interface(audio_func, audio_input)
file_interface = _echo_interface(file_func, file_input)

# Only the YouTube interface is exposed as a tab.
demo = gr.TabbedInterface(
    [youtube_interface],
    ["Transcript youtube video"],
)
demo.queue(max_size=20)
demo.launch(share=True)