# install packages
# !pip install --upgrade -q ipython-autotime
# %load_ext autotime
# Download youtube video
# !pip install -q pytube
# youtube video download function
import os
from pytube import YouTube
def progress_function(stream, chunk, bytes_remaining):
    total_size = stream.filesize
    bytes_downloaded = total_size - bytes_remaining
    percentage_of_completion = bytes_downloaded / total_size * 100
    print(f"Downloaded {percentage_of_completion}%")
def youtube_download(video_url):
    yt = YouTube(video_url, on_progress_callback=progress_function)
    # get the video title
    video_title = yt.title
    print(f"Downloading video: {video_title}")
    stream = yt.streams.get_highest_resolution()
    # get the video's default filename
    default_filename = stream.default_filename
    stream.download()
    return default_filename
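# e.g. youtube_download("https://www.youtube.com/watch?v=...") downloads the video
# into the current working directory and returns its default filename.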
# use insanely-fast-whisper
# !pip install --upgrade -q transformers optimum accelerate pyannote.audio
import re
import json
import torch
from transformers import pipeline
from pyannote.audio import Pipeline
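# pyannote's Pipeline is imported with speaker diarization in mind (see the
# diarization_model variable below), but no diarization step is wired up in this script.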
# convert the JSON transcript to plain text
def seconds_to_hms(seconds):
    # Simple conversion of seconds to HH:MM:SS format
    hours, remainder = divmod(seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{int(hours):02}:{int(minutes):02}:{int(seconds):02}"
def transcript_json2txt(segmented_transcript, file_path):
    # with open(file_path, 'r') as file:
    #     formatted_dialogue = json.load(file)
    # Generate the dialogue text
    formatted_dialogue = segmented_transcript
    dialogue_text = ""
    for dialogue in formatted_dialogue:
        # Convert the start time to HH:MM:SS format
        start_time = seconds_to_hms(dialogue['timestamp'][0])
        speaker = dialogue.get('speaker', "").replace("SPEAKER_", "speaker")  # Format the speaker name
        text = dialogue.get('text', "").strip()  # Remove leading/trailing whitespace from the text
        # Construct each dialogue entry
        dialogue_text += f"{start_time}, {speaker}: {text}\n\n"
    # Check the first part of the generated dialogue text
    print("preview txt...")
    print('---------------------------------\n')
    print(dialogue_text[:500])  # Display the first 500 characters for review
    # Save the dialogue text to a file
    output_txt_file_path = file_path.replace('.json', '.txt')
    with open(output_txt_file_path, 'w', encoding="utf8") as file:
        file.write(dialogue_text)
    print(
        f"Voila!✨ Your file has been transcribed, go check it out over here 👉 {output_txt_file_path}"
    )
    return dialogue_text
# transcript function
model_name = "openai/whisper-large-v3"
flash = False # Set to True to use Flash Attention 2
print('---------------------------------')
print('load pipe...')
print('---------------------------------')
# Initialize the pipeline
pipe = pipeline(
    "automatic-speech-recognition",
    model=model_name,
    torch_dtype=torch.float16,
    # low_cpu_mem_usage=True,
    device='cuda:0',
    model_kwargs={"use_flash_attention_2": flash},
)
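# Note: setting flash = True assumes the flash-attn package is installed, and
# device='cuda:0' assumes an NVIDIA GPU is available.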
def transcript(file_path, pipe=pipe):
    pattern = r'\.mp4|\.wav|\.mp3'
    transcript_path = re.sub(pattern, '.json', file_path)
    device_id = "0"  # or "mps" for Macs with Apple Silicon
    device = "cuda"  # or "mps" for Macs with Apple Silicon
    task = "transcribe"  # or "translate"
    language = 'Chinese'  # unused below, so Whisper auto-detects the language
    batch_size = 24
    timestamp = "chunk"  # or "word"
    diarization_model = "pyannote/speaker-diarization-3.1"
    # Transcribe the audio
    print('Transcribing...')
    print('---------------------------------\n')
    outputs = pipe(
        file_path,
        chunk_length_s=30,
        batch_size=batch_size,
        # generate_kwargs={"task": task, "language": language},
        generate_kwargs={"task": task},
        return_timestamps=True
    )
    # Save or display the output
    print('Saving transcript...')
    print('---------------------------------\n')
    with open(transcript_path, "w", encoding="utf8") as fp:
        json.dump(outputs, fp, ensure_ascii=False)
    print(
        f"Voila!✨ Your file has been transcribed, go check it out over here 👉 {transcript_path}"
    )
    # save the transcript to a txt file
    transcript_txt = transcript_json2txt(outputs['chunks'], transcript_path)
    transcript_txt_path = transcript_path.replace('.json', '.txt')
    # save to an srt file
    # Function to convert time in seconds to SRT time format
    def convert_to_srt_time(timestamp):
        hours = int(timestamp // 3600)
        minutes = int((timestamp % 3600) // 60)
        seconds = int(timestamp % 60)
        milliseconds = int((timestamp % 1) * 1000)
        return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"
    # Create the SRT content
    srt_content = ""
    for index, entry in enumerate(outputs['chunks']):
        try:
            start_time = convert_to_srt_time(entry['timestamp'][0])
            end_time = convert_to_srt_time(entry['timestamp'][1] if entry['timestamp'][1] is not None else entry['timestamp'][0] + 1)
            srt_content += f"{index + 1}\n{start_time} --> {end_time}\n{entry['text']}\n\n"
        except Exception as e:
            print(e)
            print(entry)
    # Save the SRT content to a file
    srt_file_path = transcript_path.replace('.json', '.srt')
    # srt_file_path = '/kaggle/working/6-revolution_transcript.srt'
    with open(srt_file_path, 'w', encoding="utf8") as file:
        file.write(srt_content)
    print(
        f"Voila!✨ Your file has been transcribed, go check it out over here 👉 {srt_file_path}"
    )
    return transcript_txt, srt_file_path
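# Hypothetical local-file usage: transcript_txt, srt_path = transcript("audio.mp3")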
# youtube transcript function
def transcript_youtube(url):
    # download the youtube video
    default_filename = youtube_download(url)
    file_path = os.path.join(os.getcwd(), default_filename)
    transcript_txt, srt_file_path = transcript(file_path)
    return transcript_txt[:500], file_path, srt_file_path
# test youtube transcript
# url = "https://www.youtube.com/watch?v=2UP7pfGVm0Y&t=252s&ab_channel=TheTEFLOrg"
# transcript_youtube(url)
# gradio interface
# !pip install --upgrade -q gradio
import gradio as gr
title = "Fastly audio transcript"
description = "Input your audio or record your audio"
def audio_func(audio_file):
    return f"This is the audio file path: {audio_file}"
def file_func(file_path):
    return f"This is the file path: {file_path}"
audio_input = gr.Audio(type='filepath')
file_input = gr.File(type="filepath")
youtube_interface = gr.Interface(
    fn=transcript_youtube,
    inputs=gr.Textbox(label="YouTube video", info="Input a YouTube video URL"),
    outputs=[
        gr.Textbox(label="Transcript preview", lines=3),
        gr.File(label="Download Video"),
        gr.File(label="SRT file")
    ],
    title="Fast YouTube Video Transcript",
    description="Transcribe any YouTube video in seconds!"
)
audio_interface = gr.Interface(
    fn=audio_func,
    inputs=audio_input,
    outputs=[gr.Textbox(label="Greeting", lines=3)],
    title=title,
    description=description
)
file_interface = gr.Interface(
    fn=file_func,
    inputs=file_input,
    outputs=[gr.Textbox(label="Greeting", lines=3)],
    title=title,
    description=description
)
demo = gr.TabbedInterface([youtube_interface], ["Transcribe YouTube video"])
demo.queue(max_size=20)
demo.launch(share=True)