Spaces:

liyaoshi
/

Fast_Transcript_for_Everyone

Runtime error

App Files Files Community

Fast_Transcript_for_Everyone / app.py

liyaoshi

Update app.py

2edff59 verified about 1 year ago

raw

history blame contribute delete

7.36 kB


	# install packages

	# !pip install --upgrade -q ipython-autotime

	# %load_ext autotime

	# Download youtube video

	# !pip install -q pytube

	# youtube video download function

	import os
	from pytube import YouTube

	def progress_function(stream, chunk, bytes_remaining):
	    """Print how much of a download has completed, as a percentage.

	    Passed to ``YouTube`` as its ``on_progress_callback`` by
	    ``youtube_download``.

	    Args:
	        stream: Object exposing a ``filesize`` attribute (total size in bytes).
	        chunk: The data chunk just received; unused, present only because the
	            callback signature requires it.
	        bytes_remaining: Number of bytes that still have to be downloaded.
	    """
	    total_size = stream.filesize
	    if not total_size:
	        # A zero or missing size would make the division below raise and
	        # abort the download from inside the callback; report and move on.
	        print("Download progress unavailable: total size unknown")
	        return

	    bytes_downloaded = total_size - bytes_remaining
	    percentage_of_completion = bytes_downloaded / total_size * 100
	    print(f"Downloaded {percentage_of_completion}%")

	def youtube_download(video_url):
	    """Download a YouTube video into the current directory.

	    Progress is reported through ``progress_function`` while the download
	    runs, and the video title is printed before it starts.

	    Args:
	        video_url: URL of the YouTube video.

	    Returns:
	        The stream's default file name (not a full path).
	    """
	    video = YouTube(video_url, on_progress_callback=progress_function)
	    print(f"Downloading video: {video.title}")

	    # Pick the highest-resolution stream and remember the name it will be
	    # saved under before starting the download.
	    best_stream = video.streams.get_highest_resolution()
	    saved_name = best_stream.default_filename
	    best_stream.download()
	    return saved_name

	# use insanely-fast-whisper
	# !pip install --upgrade -q transformers optimum accelerate pyannote.audio

	import re
	import json
	import torch
	from transformers import pipeline
	from pyannote.audio import Pipeline

	# convert the JSON transcript chunks to plain text

	import json

	def seconds_to_hms(seconds):
	    """Format a duration given in seconds as ``HH:MM:SS``.

	    Fractions of a second are dropped; hours are not wrapped at 24.
	    """
	    whole_hours = seconds // 3600
	    leftover = seconds % 3600
	    whole_minutes = leftover // 60
	    whole_seconds = leftover % 60
	    return "{:02}:{:02}:{:02}".format(
	        int(whole_hours), int(whole_minutes), int(whole_seconds)
	    )

	def transcript_json2txt(segmented_transcript,file_path):
	    """Render transcript chunks as plain text and save them as a ``.txt`` file.

	    Each chunk becomes one ``HH:MM:SS, speaker: text`` entry followed by a
	    blank line. The first 500 characters are printed as a preview, and the
	    full text is written to ``file_path`` with its extension replaced by
	    ``.txt``.

	    Args:
	        segmented_transcript: Iterable of dicts. Each needs a ``'timestamp'``
	            sequence whose first item is the start time in seconds; the
	            ``'speaker'`` and ``'text'`` keys are optional strings.
	        file_path: Path of the transcript JSON file; only used to derive the
	            output path.

	    Returns:
	        The complete dialogue text that was written to disk.
	    """
	    entries = []
	    for dialogue in segmented_transcript:
	        start_time = seconds_to_hms(dialogue['timestamp'][0])
	        # "SPEAKER_00" style labels are shown as "speaker00".
	        speaker = dialogue.get('speaker', "").replace("SPEAKER_", "speaker")
	        text = dialogue.get('text', "").strip()
	        entries.append(f"{start_time}, {speaker}: {text}\n\n")
	    # Join once instead of growing a string with += on every chunk.
	    dialogue_text = "".join(entries)

	    print("preview txt...")
	    print('---------------------------------\n')
	    print(dialogue_text[:500])

	    # Swap only the final extension. str.replace('.json', '.txt') also
	    # rewrote '.json' anywhere else in the path and, for a path without
	    # '.json', returned the input path unchanged so that file was overwritten.
	    output_txt_file_path = os.path.splitext(file_path)[0] + '.txt'
	    with open(output_txt_file_path, 'w', encoding="utf8") as file:
	        file.write(dialogue_text)
	    print(
	        f"Voila!✨ Your file has been transcribed go check it out over here 👉 {output_txt_file_path}"
	    )
	    return dialogue_text

	# transcript function

	model_name = "openai/whisper-large-v3"
	flash = False # Set to True to use Flash Attention 2

	# Pick the device at startup instead of hard-coding 'cuda:0', which makes the
	# whole module fail to import on a host without a CUDA device.
	_use_cuda = torch.cuda.is_available()
	_pipe_device = 'cuda:0' if _use_cuda else 'cpu'
	# Half precision is meant for GPU inference; use full precision on CPU.
	_pipe_dtype = torch.float16 if _use_cuda else torch.float32

	print('---------------------------------')
	print('load pipe...')
	print('---------------------------------')
	# Initialize the speech-recognition pipeline once; transcript() uses it as
	# its default `pipe` argument.
	pipe = pipeline(
	    "automatic-speech-recognition",
	    model=model_name,
	    torch_dtype=_pipe_dtype,
	    device=_pipe_device,
	    # Flash Attention 2 is only requested when running on CUDA.
	    model_kwargs={"use_flash_attention_2": flash and _use_cuda},
	)

	def transcript(file_path,pipe = pipe):
	    """Transcribe a media file and write JSON, TXT and SRT transcripts beside it.

	    The outputs share the input's base name: ``<base>.json`` holds the raw
	    pipeline result, ``<base>.txt`` is produced by ``transcript_json2txt``,
	    and ``<base>.srt`` holds the subtitles built here.

	    Args:
	        file_path: Path of the audio/video file to transcribe.
	        pipe: Speech-recognition callable; defaults to the module-level
	            pipeline. Its result must be a JSON-serialisable mapping with a
	            ``'chunks'`` list whose items carry ``'timestamp'`` (start, end)
	            and ``'text'``.

	    Returns:
	        Tuple ``(transcript_txt, srt_file_path)``: the plain-text transcript
	        and the path of the written SRT file.
	    """
	    # Derive every output path from the input's base name. The former regex
	    # '\.mp4\|\.wav\|\.mp3' escaped the pipes, so it only matched the literal
	    # text ".mp4|.wav|.mp3" and never a real extension; the "JSON" path then
	    # equalled the input path and the media file itself was overwritten.
	    base_path = os.path.splitext(file_path)[0]
	    transcript_path = base_path + '.json'
	    srt_file_path = base_path + '.srt'

	    task = "transcribe" # or "translate"
	    batch_size = 24

	    # Transcribe the audio
	    print('Transcribing...')
	    print('---------------------------------\n')

	    # No language is passed in generate_kwargs, so language selection is left
	    # to the model.
	    outputs = pipe(
	        file_path,
	        chunk_length_s=30,
	        batch_size=batch_size,
	        generate_kwargs={"task": task},
	        return_timestamps=True
	    )

	    # Save the raw pipeline output
	    print('Saving transcript...')
	    print('---------------------------------\n')

	    with open(transcript_path, "w", encoding="utf8") as fp:
	        json.dump(outputs, fp, ensure_ascii=False)

	    print(
	        f"Voila!✨ Your file has been transcribed go check it out over here 👉 {transcript_path}"
	    )

	    # Save the plain-text transcript
	    transcript_txt = transcript_json2txt(outputs['chunks'],transcript_path)

	    def convert_to_srt_time(timestamp):
	        """Format seconds as the SRT time code ``HH:MM:SS,mmm``."""
	        # Round to whole milliseconds first: truncating (timestamp % 1) * 1000
	        # turns values such as 0.58 into 579 ms because of float error.
	        total_ms = int(round(timestamp * 1000))
	        hours, rest = divmod(total_ms, 3_600_000)
	        minutes, rest = divmod(rest, 60_000)
	        seconds, milliseconds = divmod(rest, 1000)
	        return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"

	    # Build the SRT content
	    srt_entries = []
	    for entry in outputs['chunks']:
	        try:
	            start = entry['timestamp'][0]
	            end = entry['timestamp'][1]
	            if end is None:
	                # No end time on this chunk: show the cue for one second.
	                end = start + 1
	            start_time = convert_to_srt_time(start)
	            end_time = convert_to_srt_time(end)
	            # Number cues by how many were written so a skipped chunk does
	            # not leave a gap in the SRT sequence.
	            cue = f"{len(srt_entries) + 1}\n{start_time} --> {end_time}\n{entry['text']}\n\n"
	        except (KeyError, IndexError, TypeError, ValueError) as e:
	            # Best effort: report the malformed chunk and keep going.
	            print(e)
	            print(entry)
	            continue
	        srt_entries.append(cue)
	    srt_content = "".join(srt_entries)

	    # Save the SRT content
	    with open(srt_file_path, 'w', encoding="utf8") as file:
	        file.write(srt_content)

	    print(
	        f"Voila!✨ Your file has been transcribed go check it out over here 👉 {srt_file_path}"
	    )
	    return transcript_txt,srt_file_path

	# youtube transcript function

	def transcript_youtube(url):
	    """Download a YouTube video and transcribe it.

	    Args:
	        url: URL of the YouTube video.

	    Returns:
	        Tuple of (first 500 characters of the transcript text, path of the
	        downloaded video, path of the SRT file).
	    """
	    video_name = youtube_download(url)
	    # The download lands in the current working directory.
	    video_path = os.path.join(os.getcwd(), video_name)
	    full_text, srt_path = transcript(video_path)
	    preview = full_text[:500]
	    return preview, video_path, srt_path

	# test youtube transcript

	# url = "https://www.youtube.com/watch?v=2UP7pfGVm0Y&t=252s&ab_channel=TheTEFLOrg"
	# transcript_youtube(url)

	# gradio interface

	# !pip install --upgrade -q gradio

	import gradio as gr

	# Heading and blurb used by the audio and file interfaces defined below.
	title, description = (
	    "Fastly audio transcript",
	    "Input your audio or record your audio",
	)


	def audio_func(audio_file):
	    """Return a message that echoes the given audio file path."""
	    message = "This is the audio file path: {}".format(audio_file)
	    return message
	def file_func(file_path):
	    """Return a message that echoes the given file path."""
	    message = "This is the file path: {}".format(file_path)
	    return message

	# Input components for the audio and file interfaces; both hand the
	# callback a file path.
	audio_input = gr.Audio(type='filepath')
	file_input = gr.File(type="filepath")

	# Main interface: takes a YouTube URL and shows the three values returned by
	# transcript_youtube (transcript preview, downloaded video, SRT file).
	youtube_interface = gr.Interface(
	fn = transcript_youtube,
	inputs = gr.Textbox(label="youtube video", info="Input a youtube video url"),
	outputs = [
	gr.Textbox(label="Transcript preview", lines=3),
	gr.File(label="Download Video"),
	gr.File(label="Srt file")
	],
	title = "Fastly Youtube Video Transcrip",
	description = "Transcript Any Youtube video in Seconds!!!"
	)


	# Interface that only echoes the path of the supplied audio (audio_func).
	# NOTE(review): built here but not included in the TabbedInterface below.
	audio_interface = gr.Interface(
	fn=audio_func,
	inputs=audio_input,
	outputs=[gr.Textbox(label="Greeting",lines=3)],
	title = title,
	description = description
	)

	# Interface that only echoes the path of the supplied file (file_func).
	# NOTE(review): built here but not included in the TabbedInterface below.
	file_interface = gr.Interface(
	fn=file_func,
	inputs=file_input,
	outputs=[gr.Textbox(label="Greeting",lines=3)],
	title = title,
	description = description
	)

	# Only the YouTube interface is exposed, as a single tab.
	demo = gr.TabbedInterface([youtube_interface], ["Transcript youtube video"])
	# Enable request queuing with max_size 20.
	demo.queue(max_size = 20)

	# Start the app when the module is executed.
	# NOTE(review): share = True asks Gradio for a public share link; confirm it
	# is wanted in the environment where this app is hosted.
	demo.launch(share = True)