Spaces:

liyaoshi
/

Fast_Transcript_for_Everyone

Runtime error

File size: 7,357 Bytes


# install packages

# !pip install --upgrade -q ipython-autotime

# %load_ext autotime

# Download youtube video

# !pip install -q pytube

# youtube video download function

import os
from pytube import YouTube

def progress_function(stream, chunk, bytes_remaining):
    """Print how much of the download has completed, as a percentage.

    Args:
        stream: Object exposing a ``filesize`` attribute (total size).
        chunk: Unused by this callback.
        bytes_remaining: Amount of data still to be downloaded.
    """
    size = stream.filesize
    done = size - bytes_remaining
    percent = done / size * 100
    print(f"Downloaded {percent}%")

def youtube_download(video_url):
    """Download a YouTube video and return the name of the saved file.

    ``download()`` is called without arguments, so the library's default
    output location is used.

    Args:
        video_url: URL of the video to fetch.

    Returns:
        The selected stream's default file name (a bare name, not a path).
    """
    video = YouTube(video_url, on_progress_callback=progress_function)

    # Announce which video is about to be fetched.
    print(f"Downloading video: {video.title}")

    # Select the highest-resolution stream and note its file name before
    # starting the download.
    best_stream = video.streams.get_highest_resolution()
    saved_name = best_stream.default_filename
    best_stream.download()
    return saved_name

# use insanely-fast-whisper
# !pip install --upgrade -q transformers optimum accelerate pyannote.audio

import re
import json
import torch
from transformers import pipeline
from pyannote.audio import Pipeline

# convert the JSON transcript segments to plain text

import json

def seconds_to_hms(seconds):
    """Format a duration in seconds as a zero-padded ``HH:MM:SS`` string.

    Fractional seconds are truncated; hours are not wrapped at 24.
    """
    hours = seconds // 3600
    leftover = seconds % 3600
    minutes = leftover // 60
    secs = leftover % 60
    return "{:02}:{:02}:{:02}".format(int(hours), int(minutes), int(secs))

def transcript_json2txt(segmented_transcript, file_path):
    """Render transcript segments as readable text and save it as a ``.txt`` file.

    Every segment becomes one ``HH:MM:SS, speaker: text`` entry followed by a
    blank line. A preview (first 500 characters) is printed to stdout.

    Args:
        segmented_transcript: Iterable of dicts. Each must carry a
            ``'timestamp'`` sequence whose first item is the start time in
            seconds; ``'speaker'`` and ``'text'`` are optional and default
            to an empty string.
        file_path: Path of the transcript file (normally ``*.json``). The
            text is written beside it, with only the final extension
            replaced by ``.txt``.

    Returns:
        The complete dialogue text that was written to disk.
    """
    entries = []
    for dialogue in segmented_transcript:
        # Start time of the segment in HH:MM:SS form.
        start_time = seconds_to_hms(dialogue['timestamp'][0])
        # "SPEAKER_00" style labels become "speaker00".
        speaker = dialogue.get('speaker', "").replace("SPEAKER_", "speaker")
        text = dialogue.get('text', "").strip()
        entries.append(f"{start_time}, {speaker}: {text}\n\n")

    # Join once instead of growing a string inside the loop.
    dialogue_text = "".join(entries)

    print("preview txt...")
    print('---------------------------------\n')
    print(dialogue_text[:500])  # Displaying the first 500 characters for review

    # Swap only the trailing extension. A plain str.replace('.json', '.txt')
    # also rewrites ".json" inside directory names and, when the path has no
    # ".json" at all, leaves the path unchanged so the input file itself
    # would be overwritten.
    output_txt_file_path = os.path.splitext(file_path)[0] + '.txt'
    with open(output_txt_file_path, 'w', encoding="utf8") as file:
        file.write(dialogue_text)
    print(
        f"Voila!✨ Your file has been transcribed go check it out over here 👉 {output_txt_file_path}"
    )
    return dialogue_text

# transcript function

model_name = "openai/whisper-large-v3"
flash = False  # Set to True to use Flash Attention 2
print('---------------------------------')
print('load pipe...')
print('---------------------------------')

# Use the GPU when one is present; otherwise fall back to the CPU so that
# importing this module does not fail on hosts without CUDA. Half precision
# is only requested on the GPU; the CPU path uses float32.
_cuda_available = torch.cuda.is_available()

# Initialize the pipeline
# NOTE(review): newer transformers releases prefer `attn_implementation`
# over `use_flash_attention_2` -- confirm against the installed version.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model_name,
    torch_dtype=torch.float16 if _cuda_available else torch.float32,
#     low_cpu_mem_usage=True,
    device='cuda:0' if _cuda_available else 'cpu',
    # Flash Attention is only meaningful on a CUDA device.
    model_kwargs={"use_flash_attention_2": flash and _cuda_available},
)

def _convert_to_srt_time(timestamp):
    """Format a time in seconds as an SRT ``HH:MM:SS,mmm`` timestamp."""
    # Round to whole milliseconds first. Truncating ``(timestamp % 1) * 1000``
    # loses a millisecond on values such as 2.32 (-> ",319") because of
    # binary float representation.
    total_ms = int(round(timestamp * 1000))
    total_seconds, milliseconds = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"


def transcript(file_path, pipe=pipe):
    """Transcribe a media file and write JSON, TXT and SRT transcripts beside it.

    The three output files share the input's path with only the final
    extension replaced, so the input file is never overwritten, whatever
    its extension is.

    Args:
        file_path: Path of the audio/video file handed to ``pipe``.
        pipe: Callable speech-recognition pipeline; defaults to the
            module-level ``pipe``. Its result must be JSON-serialisable and
            contain a ``'chunks'`` list of dicts with ``'timestamp'``
            (start, end) and ``'text'``.

    Returns:
        A ``(transcript_txt, srt_file_path)`` tuple: the plain-text
        transcript returned by ``transcript_json2txt`` and the path of the
        SRT file.
    """
    # Derive sibling paths from the stem instead of regex-replacing a fixed
    # list of extensions: with an unlisted extension the old pattern left
    # the path unchanged and the JSON dump overwrote the media file.
    base_path = os.path.splitext(file_path)[0]
    transcript_path = base_path + '.json'
    task = "transcribe"  # or "translate"
    batch_size = 24

    # Transcribe the audio
    print('Transcribing...')
    print('---------------------------------\n')

    outputs = pipe(
        file_path,
        chunk_length_s=30,
        batch_size=batch_size,
        # No language is forced here; add a "language" entry to pin one.
        generate_kwargs={"task": task},
        return_timestamps=True,
    )

    # Save the raw pipeline output
    print('Saving transcript...')
    print('---------------------------------\n')

    with open(transcript_path, "w", encoding="utf8") as fp:
        json.dump(outputs, fp, ensure_ascii=False)

    print(
        f"Voila!✨ Your file has been transcribed go check it out over here 👉 {transcript_path}"
    )

    # Plain-text transcript (the helper also writes the .txt file).
    transcript_txt = transcript_json2txt(outputs['chunks'], transcript_path)

    # Build the SRT cues. Cues are numbered by how many were actually
    # written, so a skipped chunk does not leave a gap in the sequence.
    srt_blocks = []
    for entry in outputs['chunks']:
        try:
            start = entry['timestamp'][0]
            end = entry['timestamp'][1]
            if end is None:
                # A missing end time gets a one-second cue.
                end = start + 1
            block = (
                f"{len(srt_blocks) + 1}\n"
                f"{_convert_to_srt_time(start)} --> {_convert_to_srt_time(end)}\n"
                f"{entry['text']}\n\n"
            )
        except Exception as e:
            # Deliberately best-effort: report a malformed chunk and skip it
            # rather than aborting the whole export.
            print(e)
            print(entry)
        else:
            srt_blocks.append(block)
    srt_content = "".join(srt_blocks)

    # Saving the SRT content to a file
    srt_file_path = base_path + '.srt'
    with open(srt_file_path, 'w', encoding="utf8") as file:
        file.write(srt_content)

    print(
        f"Voila!✨ Your file has been transcribed go check it out over here 👉 {srt_file_path}"
    )
    return transcript_txt, srt_file_path

# youtube transcript function

def transcript_youtube(url):
    """Download a YouTube video, transcribe it and return the results.

    Args:
        url: Address of the YouTube video.

    Returns:
        A tuple of (first 500 characters of the transcript text, path of
        the downloaded video, path of the SRT file).
    """
    # Fetch the video; the file name is resolved against the working directory.
    downloaded_name = youtube_download(url)
    video_path = os.path.join(os.getcwd(), downloaded_name)

    # Run the transcription and keep only a short preview of the text.
    full_text, srt_path = transcript(video_path)
    preview = full_text[:500]
    return preview, video_path, srt_path

# test youtube transcript

# url = "https://www.youtube.com/watch?v=2UP7pfGVm0Y&t=252s&ab_channel=TheTEFLOrg"
# transcript_youtube(url)

# gradio interface

# !pip install --upgrade -q gradio

import gradio as gr

# Heading and description shared by the audio and file interfaces below.
title = "Fastly audio transcript"
description = "Input your audio or record your audio"


def audio_func(audio_file):
    """Return a message echoing the audio file path that was received."""
    return "This is the audio file path: {}".format(audio_file)
def file_func(file_path):
    """Return a message echoing the file path that was received."""
    return "This is the file path: {}".format(file_path)

# Input components for the audio and file interfaces.
audio_input = gr.Audio(type='filepath')
file_input = gr.File(type="filepath")

# YouTube tab: takes a URL and shows a transcript preview plus the
# downloaded video and the generated SRT file, matching the three values
# returned by transcript_youtube.
youtube_interface = gr.Interface(
    fn=transcript_youtube,
    inputs=gr.Textbox(label="youtube video", info="Input a youtube video url"),
    outputs=[
        gr.Textbox(label="Transcript preview", lines=3),
        gr.File(label="Download Video"),
        gr.File(label="Srt file"),
    ],
    # Fixed the "Transcrip" typo in the user-facing title.
    title="Fastly Youtube Video Transcript",
    description="Transcript Any Youtube video in Seconds!!!",
)


# Audio interface: currently only echoes the received path (see audio_func).
audio_interface = gr.Interface(
    fn=audio_func,
    inputs=audio_input,
    outputs=[gr.Textbox(label="Greeting", lines=3)],
    title=title,
    description=description,
)

# File interface: currently only echoes the received path (see file_func).
file_interface = gr.Interface(
    fn=file_func,
    inputs=file_input,
    outputs=[gr.Textbox(label="Greeting", lines=3)],
    title=title,
    description=description,
)

# Only the YouTube interface is exposed as a tab; the audio and file
# interfaces above are built but not added here.
demo = gr.TabbedInterface([youtube_interface], ["Transcript youtube video"])
demo.queue(max_size=20)

demo.launch(share=True)