Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -1,19 +1,248 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
|
3 |
-
title = "
|
4 |
-
description = "Input your
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
|
7 |
-
def greet(name, intensity):
|
8 |
-
return "Hello, " + name + "!" * int(intensity)
|
9 |
|
10 |
-
|
11 |
-
fn=
|
12 |
-
inputs=
|
13 |
-
outputs=["
|
14 |
title = title,
|
15 |
description = description
|
16 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
demo.queue(max_size = 20)
|
18 |
|
19 |
-
demo.launch()
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""audio_transcript.ipynb
|
3 |
+
|
4 |
+
Automatically generated by Colab.
|
5 |
+
|
6 |
+
Original file is located at
|
7 |
+
https://colab.research.google.com/drive/1XRGgJvXg3QOPl2XecLjkwngRQRsv0g-6
|
8 |
+
|
9 |
+
# install packages
|
10 |
+
|
11 |
+
# !pip install --upgrade -q ipython-autotime
|
12 |
+
|
13 |
+
# %load_ext autotime
|
14 |
+
|
15 |
+
# Download youtube video
|
16 |
+
|
17 |
+
# !pip install -q pytube
|
18 |
+
|
19 |
+
# youtube video download function
|
20 |
+
|
21 |
+
import os
|
22 |
+
from pytube import YouTube
|
23 |
+
|
24 |
+
def progress_function(stream, chunk, bytes_remaining):
    """pytube on_progress callback: print how much of the stream is done."""
    downloaded = stream.filesize - bytes_remaining
    pct = downloaded / stream.filesize * 100
    print(f"Downloaded {pct}%")
|
29 |
+
|
30 |
+
def youtube_download(video_url):
    """Download the highest-resolution stream of a YouTube video into the
    current working directory and return pytube's default file name."""
    video = YouTube(video_url, on_progress_callback=progress_function)

    # Announce which video is being fetched before starting the transfer.
    print(f"Downloading video: {video.title}")

    best_stream = video.streams.get_highest_resolution()
    filename = best_stream.default_filename  # name pytube saves the file under
    best_stream.download()

    return filename
|
43 |
+
|
44 |
+
# use insanely-fast-whisper
|
45 |
+
# !pip install --upgrade -q transformers optimum accelerate pyannote.audio
|
46 |
+
|
47 |
+
import re
|
48 |
+
import json
|
49 |
+
import torch
|
50 |
+
from transformers import pipeline
|
51 |
+
from pyannote.audio import Pipeline
|
52 |
+
|
53 |
+
# transfer srt to plain text
|
54 |
+
|
55 |
+
import json
|
56 |
+
|
57 |
+
def seconds_to_hms(seconds):
    """Format a duration in seconds as a zero-padded HH:MM:SS string."""
    hours = seconds // 3600
    leftover = seconds % 3600
    minutes = leftover // 60
    secs = leftover % 60
    return "{:02d}:{:02d}:{:02d}".format(int(hours), int(minutes), int(secs))
|
62 |
+
|
63 |
+
def transcript_json2txt(segmented_transcript, file_path):
    """Render Whisper chunk dicts as plain-text dialogue and save it to disk.

    Parameters
    ----------
    segmented_transcript : list[dict]
        Chunks with a 'timestamp' (start, end) pair and optional
        'speaker' / 'text' keys (as produced by the ASR pipeline).
    file_path : str
        Path of the transcript .json file; the .txt is written alongside it.

    Returns
    -------
    str
        The full dialogue text that was written to the .txt file.
    """
    lines = []
    for dialogue in segmented_transcript:
        # Whisper occasionally emits a missing or None start timestamp; fall
        # back to 0 instead of crashing (the SRT writer guards None similarly).
        ts = dialogue.get('timestamp') or (0,)
        start = ts[0] if ts[0] is not None else 0
        start_time = seconds_to_hms(start)
        speaker = dialogue.get('speaker', "").replace("SPEAKER_", "speaker")
        text = dialogue.get('text', "").strip()
        lines.append(f"{start_time}, {speaker}: {text}\n\n")
    # join() instead of repeated += — linear instead of quadratic.
    dialogue_text = "".join(lines)

    # Show the first part of the generated dialogue for a quick sanity check.
    print("preview txt...")
    print('---------------------------------\n')
    print(dialogue_text[:500])  # first 500 characters only

    # Save the dialogue text next to the source .json.
    output_txt_file_path = file_path.replace('.json', '.txt')
    with open(output_txt_file_path, 'w', encoding="utf8") as file:
        file.write(dialogue_text)
    print(
        f"Voila!β¨ Your file has been transcribed go check it out over here π {output_txt_file_path}"
    )
    return dialogue_text
|
93 |
+
|
94 |
+
# transcript function
|
95 |
+
|
96 |
+
model_name = "openai/whisper-large-v3"
flash = False  # Set to True to use Flash Attention 2

# Hard-coding 'cuda:0' crashes at import time on CPU-only hosts (a likely
# cause of the Space's "Runtime error"); fall back to CPU — and to full
# precision, since float16 inference on CPU is poorly supported.
_device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
_dtype = torch.float16 if _device.startswith('cuda') else torch.float32

print('---------------------------------')
print('load pipe...')
print('---------------------------------')
# Initialize the ASR pipeline once at module load; transcript() reuses it.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model_name,
    torch_dtype=_dtype,
    # low_cpu_mem_usage=True,
    device=_device,
    model_kwargs={"use_flash_attention_2": flash},
)
|
110 |
+
|
111 |
+
def transcript(file_path, pipe=pipe):
    """Transcribe an audio/video file and write .json, .txt and .srt outputs.

    Parameters
    ----------
    file_path : str
        Path to a .mp4 / .wav / .mp3 file.
    pipe :
        ASR pipeline; defaults to the module-level instance.

    Returns
    -------
    tuple[str, str]
        (plain-text transcript, path of the generated .srt file).
    """
    # Raw string: '\.' in a plain literal is an invalid escape sequence.
    pattern = r'\.mp4|\.wav|\.mp3'
    transcript_path = re.sub(pattern, '.json', file_path)
    task = "transcribe"  # or "translate"
    batch_size = 24

    # Transcribe the audio.
    print('Transcribing...')
    print('---------------------------------\n')

    outputs = pipe(
        file_path,
        chunk_length_s=30,
        batch_size=batch_size,
        # generate_kwargs={"task": task, "language": language},
        generate_kwargs={"task": task},
        return_timestamps=True
    )

    # Save the raw pipeline output as JSON.
    print('Saving transcript...')
    print('---------------------------------\n')

    with open(transcript_path, "w", encoding="utf8") as fp:
        json.dump(outputs, fp, ensure_ascii=False)

    print(
        f"Voila!β¨ Your file has been transcribed go check it out over here π {transcript_path}"
    )

    # Save a plain-text version of the transcript.
    transcript_txt = transcript_json2txt(outputs['chunks'], transcript_path)

    def convert_to_srt_time(timestamp):
        # Convert seconds to the SRT "HH:MM:SS,mmm" time format.
        hours = int(timestamp // 3600)
        minutes = int((timestamp % 3600) // 60)
        seconds = int(timestamp % 60)
        milliseconds = int((timestamp % 1) * 1000)
        return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"

    # Build the SRT body; a chunk with a None end timestamp gets start + 1s.
    srt_content = ""
    for index, entry in enumerate(outputs['chunks']):
        try:
            start_time = convert_to_srt_time(entry['timestamp'][0])
            end_time = convert_to_srt_time(
                entry['timestamp'][1]
                if entry['timestamp'][1] is not None
                else entry['timestamp'][0] + 1
            )
            srt_content += f"{index + 1}\n{start_time} --> {end_time}\n{entry['text']}\n\n"
        except Exception as e:
            # Best-effort: report malformed chunks but keep going.
            print(e)
            print(entry)

    # Save the SRT content next to the .json transcript.
    srt_file_path = transcript_path.replace('.json', '.srt')
    with open(srt_file_path, 'w', encoding="utf8") as file:
        file.write(srt_content)

    print(
        f"Voila!β¨ Your file has been transcribed go check it out over here π {srt_file_path}"
    )
    return transcript_txt, srt_file_path
|
182 |
+
|
183 |
+
# youtube transcript function
|
184 |
+
|
185 |
+
def transcript_youtube(url):
    """Download a YouTube video, transcribe it, and return the results.

    Returns a (first 500 characters of the transcript, local video path,
    .srt file path) triple for the Gradio outputs.
    """
    # Fetch the video into the working directory.
    local_name = youtube_download(url)
    local_path = os.path.join(os.getcwd(), local_name)

    full_text, srt_path = transcript(local_path)
    preview = full_text[:500]
    return preview, local_path, srt_path
|
191 |
+
|
192 |
+
# test youtube transcript
|
193 |
+
|
194 |
+
# url = "https://www.youtube.com/watch?v=2UP7pfGVm0Y&t=252s&ab_channel=TheTEFLOrg"
|
195 |
+
# transcript_youtube(url)
|
196 |
+
|
197 |
+
# gradio interface
|
198 |
+
|
199 |
+
# !pip install --upgrade -q gradio
|
200 |
+
|
201 |
import gradio as gr
|
202 |
|
203 |
+
# UI copy shared by the audio- and file-upload interfaces defined below.
title = "Fastly audio transcript"
description = "Input your audio or record your audio"
|
205 |
+
|
206 |
+
|
207 |
+
def audio_func(audio_file):
    """Placeholder handler: echo back the received audio file path."""
    prefix = "This is the audio file path: "
    return prefix + str(audio_file)
|
209 |
+
def file_func(file_path):
    """Placeholder handler: echo back the received file path."""
    return "This is the file path: {}".format(file_path)
|
211 |
+
|
212 |
+
# Shared Gradio input widgets; both hand the handler a filesystem path.
audio_input = gr.Audio(type='filepath')
file_input = gr.File(type="filepath")
|
214 |
|
215 |
+
# Tab that takes a YouTube URL and returns the transcript preview, the
# downloaded video file, and the generated .srt subtitle file.
youtube_interface = gr.Interface(
    fn=transcript_youtube,
    inputs=gr.Textbox(label="youtube video", info="Input a youtube video url"),
    outputs=[
        gr.Textbox(label="Transcript preview", lines=3),
        gr.File(label="Download Video"),
        gr.File(label="Srt file"),
    ],
    title="Fastly Youtube Video Transcript",  # fixed typo: was "Transcrip"
    description="Transcript Any Youtube video in Seconds!!!",
)
|
226 |
|
|
|
|
|
227 |
|
228 |
+
# Microphone/upload audio tab backed by the placeholder audio_func.
# NOTE(review): not added to the TabbedInterface below — currently unused;
# confirm whether it should be exposed.
audio_interface = gr.Interface(
    fn=audio_func,
    inputs=audio_input,
    outputs=[gr.Textbox(label="Greeting",lines=3)],
    title = title,
    description = description
)
|
235 |
+
|
236 |
+
# File-upload tab backed by the placeholder file_func.
# NOTE(review): not added to the TabbedInterface below — currently unused;
# confirm whether it should be exposed.
file_interface = gr.Interface(
    fn=file_func,
    inputs=file_input,
    outputs=[gr.Textbox(label="Greeting",lines=3)],
    title = title,
    description = description
)
|
243 |
+
|
244 |
+
# Only the YouTube tab is exposed; audio_interface/file_interface are defined
# above but left out — NOTE(review): presumably intentional, confirm.
demo = gr.TabbedInterface([youtube_interface], ["Transcript youtube video"])
# Queue requests so long transcriptions don't time out concurrent users.
demo.queue(max_size = 20)

# share=True opens a public tunnel link in addition to the local server.
demo.launch(share = True)
|
248 |
+
|