File size: 7,357 Bytes
954b6ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f8c18df
 
954b6ac
 
 
 
 
 
 
 
 
 
 
aa87202
954b6ac
 
 
 
 
 
 
 
 
 
 
aa87202
f8c18df
954b6ac
 
 
 
3e341d3
 
f8c18df
954b6ac
 
 
 
 
 
 
 
 
 
f8c18df
 
954b6ac
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242

# install packages

# !pip install --upgrade -q ipython-autotime

# %load_ext autotime

# Download youtube video

# !pip install -q pytube

# youtube video download function

import os
from pytube import YouTube

def progress_function(stream, chunk, bytes_remaining):
    """Print how much of the current download has completed, in percent.

    The signature matches what is passed as ``on_progress_callback`` when the
    ``YouTube`` object is created in this file.

    Args:
        stream: Object being downloaded; only ``stream.filesize`` is read.
        chunk: Unused.
        bytes_remaining: Number of bytes not yet downloaded.
    """
    size = stream.filesize
    fraction_done = (size - bytes_remaining) / size
    print(f"Downloaded {fraction_done * 100}%")

def youtube_download(video_url):
    """Download a YouTube video and return the filename it was saved under.

    The stream is chosen with ``yt.streams.get_highest_resolution()`` and
    saved with ``stream.download()`` using pytube's default destination.
    Progress is reported through ``progress_function``.

    Args:
        video_url: URL of the YouTube video.

    Returns:
        The stream's ``default_filename`` (a file name, which the caller in
        this file joins with the current working directory).

    Raises:
        ValueError: If pytube offers no stream to download for this video.
    """
    yt = YouTube(video_url, on_progress_callback=progress_function)
    print(f"Downloading video: {yt.title}")

    stream = yt.streams.get_highest_resolution()
    if stream is None:
        # Without this guard the failure would surface as an opaque
        # AttributeError on ``None.default_filename``.
        raise ValueError(f"No downloadable stream found for {video_url}")

    default_filename = stream.default_filename
    stream.download()

    return default_filename

# use insanely-fast-whisper
# !pip install --upgrade -q transformers optimum accelerate pyannote.audio

import re
import json
import torch
from transformers import pipeline
from pyannote.audio import Pipeline

# convert transcript JSON segments to plain text

import json

def seconds_to_hms(seconds):
    """Format a duration given in seconds as a zero-padded ``HH:MM:SS`` string.

    Each field is converted with ``int()``, so fractional seconds are
    truncated rather than rounded.
    """
    whole_hours, leftover = divmod(seconds, 3600)
    whole_minutes, leftover_seconds = divmod(leftover, 60)
    fields = (whole_hours, whole_minutes, leftover_seconds)
    return ":".join(f"{int(field):02}" for field in fields)

def transcript_json2txt(segmented_transcript, file_path):
    """Render transcript segments as plain text and save them as a ``.txt`` file.

    Every segment becomes one ``HH:MM:SS, speaker: text`` entry followed by a
    blank line. The first 500 characters are printed as a preview.

    Args:
        segmented_transcript: Iterable of dicts. Each needs a ``'timestamp'``
            sequence whose first item is the start time in seconds;
            ``'speaker'`` and ``'text'`` are optional and default to ``""``.
        file_path: Path of the transcript JSON file. The text file is written
            to the same location with the final extension replaced by
            ``.txt``.

    Returns:
        The complete dialogue text that was written to disk.
    """
    entries = []
    for segment in segmented_transcript:
        start_time = seconds_to_hms(segment['timestamp'][0])
        # "SPEAKER_00" style labels are shown as "speaker00".
        speaker = segment.get('speaker', "").replace("SPEAKER_", "speaker")
        text = segment.get('text', "").strip()
        entries.append(f"{start_time}, {speaker}: {text}\n\n")
    dialogue_text = "".join(entries)

    print("preview txt...")
    print('---------------------------------\n')
    print(dialogue_text[:500])  # Displaying the first 500 characters for review

    # Swap only the final extension. A plain str.replace('.json', '.txt')
    # would also rewrite ".json" inside directory names, and would leave the
    # path unchanged (overwriting the input) when it has another extension.
    output_txt_file_path = os.path.splitext(file_path)[0] + '.txt'
    with open(output_txt_file_path, 'w', encoding="utf8") as file:
        file.write(dialogue_text)
    print(
        f"Voila!✨ Your file has been transcribed go check it out over here 👉 {output_txt_file_path}"
    )
    return dialogue_text

# transcript function

model_name = "openai/whisper-large-v3"
flash = False  # Set to True to use Flash Attention 2

# Use the first CUDA device in half precision when one is available.
# Otherwise fall back to the CPU in float32 so that loading this module does
# not fail on hosts without a GPU.
_use_cuda = torch.cuda.is_available()
_pipe_device = 'cuda:0' if _use_cuda else 'cpu'
_pipe_dtype = torch.float16 if _use_cuda else torch.float32

print('---------------------------------')
print('load pipe...')
print('---------------------------------')
# Initialize the speech-recognition pipeline once at import time; it is the
# default ``pipe`` argument of ``transcript``.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model_name,
    torch_dtype=_pipe_dtype,
    # low_cpu_mem_usage=True,
    device=_pipe_device,
    model_kwargs={"use_flash_attention_2": flash},
)

def _srt_timestamp(seconds):
    """Format a time in seconds as an SRT ``HH:MM:SS,mmm`` timestamp."""
    # Work in whole milliseconds: truncating ``seconds % 1 * 1000`` loses a
    # millisecond to float error (e.g. 1.001 would be rendered as ",000").
    total_ms = int(round(seconds * 1000))
    whole_seconds, milliseconds = divmod(total_ms, 1000)
    whole_minutes, secs = divmod(whole_seconds, 60)
    hours, minutes = divmod(whole_minutes, 60)
    return f"{hours:02}:{minutes:02}:{secs:02},{milliseconds:03}"


def transcript(file_path, pipe=pipe):
    """Transcribe a media file and write JSON, plain-text and SRT transcripts.

    The outputs are written next to the input, sharing its name with the
    final extension replaced: the raw pipeline output as ``.json``, a
    plain-text rendering produced by ``transcript_json2txt``, and subtitles
    as ``.srt``.

    Args:
        file_path: Path of the audio/video file to transcribe.
        pipe: Callable speech-recognition pipeline. Its result must be
            JSON-serialisable and contain a ``'chunks'`` list whose entries
            have ``'timestamp'`` (start, end) and ``'text'``.

    Returns:
        A tuple ``(transcript_txt, srt_file_path)``: the full plain-text
        transcript and the path of the SRT file.
    """
    # Derive output paths from the input's stem. Substituting a fixed list of
    # extensions left the path unchanged for any other extension, so the JSON
    # dump below would overwrite the input media file.
    stem = os.path.splitext(file_path)[0]
    transcript_path = stem + '.json'
    srt_file_path = stem + '.srt'

    task = "transcribe"  # or "translate"
    batch_size = 24

    # Transcribe the audio
    print('Transcribing...')
    print('---------------------------------\n')

    outputs = pipe(
        file_path,
        chunk_length_s=30,
        batch_size=batch_size,
        # No "language" entry is passed here; add one to generate_kwargs to
        # force a specific language.
        generate_kwargs={"task": task},
        return_timestamps=True,
    )

    # Save the raw pipeline output
    print('Saving transcript...')
    print('---------------------------------\n')

    with open(transcript_path, "w", encoding="utf8") as fp:
        json.dump(outputs, fp, ensure_ascii=False)

    print(
        f"Voila!✨ Your file has been transcribed go check it out over here 👉 {transcript_path}"
    )

    # save to transcript txt file
    transcript_txt = transcript_json2txt(outputs['chunks'], transcript_path)

    # Build the SRT cues. Entries that cannot be rendered are reported and
    # skipped; cue numbers stay sequential regardless of skipped entries.
    cues = []
    for entry in outputs['chunks']:
        try:
            start = entry['timestamp'][0]
            end = entry['timestamp'][1]
            if end is None:
                # A chunk without an end time is given a one-second duration.
                end = start + 1
            cue_body = (
                f"{_srt_timestamp(start)} --> {_srt_timestamp(end)}\n"
                f"{entry['text']}\n\n"
            )
        except Exception as e:
            # Deliberately best-effort: one malformed chunk must not abort
            # the whole export.
            print(e)
            print(entry)
            continue
        cues.append(f"{len(cues) + 1}\n{cue_body}")
    srt_content = "".join(cues)

    # Saving the SRT content to a file
    with open(srt_file_path, 'w', encoding="utf8") as file:
        file.write(srt_content)

    print(
        f"Voila!✨ Your file has been transcribed go check it out over here 👉 {srt_file_path}"
    )
    return transcript_txt, srt_file_path

# youtube transcript function

def transcript_youtube(url):
    """Download a YouTube video, transcribe it, and return a preview plus files.

    Args:
        url: URL of the YouTube video.

    Returns:
        A tuple of the first 500 characters of the transcript text, the path
        of the downloaded video, and the path of the generated SRT file.
    """
    # The download helper returns a file name; resolve it against the
    # current working directory.
    video_name = youtube_download(url)
    video_path = os.path.join(os.getcwd(), video_name)

    full_text, srt_path = transcript(video_path)
    preview = full_text[:500]
    return preview, video_path, srt_path

# test youtube transcript

# url = "https://www.youtube.com/watch?v=2UP7pfGVm0Y&t=252s&ab_channel=TheTEFLOrg"
# transcript_youtube(url)

# gradio interface

# !pip install --upgrade -q gradio

import gradio as gr

# Title and description shared by the audio and file interfaces defined below.
title = "Fastly audio transcript"
description = "Input your audio or record your audio"


def audio_func(audio_file):
    """Return a message that echoes the given audio file path."""
    message = f"This is the audio file path: {audio_file}"
    return message
def file_func(file_path):
    """Return a message that echoes the given file path."""
    message = f"This is the file path: {file_path}"
    return message

# Input components for the audio and file interfaces.
audio_input = gr.Audio(type='filepath')
file_input = gr.File(type="filepath")

# YouTube tab: takes a video URL; the three outputs line up with the tuple
# returned by transcript_youtube (preview text, video path, SRT path).
youtube_interface = gr.Interface(
    fn=transcript_youtube,
    inputs=gr.Textbox(label="youtube video", info="Input a youtube video url"),
    outputs=[
        gr.Textbox(label="Transcript preview", lines=3),
        gr.File(label="Download Video"),
        gr.File(label="Srt file"),
    ],
    title="Fastly Youtube Video Transcript",
    description="Transcript Any Youtube video in Seconds!!!",
)


# These two interfaces only echo the received path back to the user.
audio_interface = gr.Interface(
    fn=audio_func,
    inputs=audio_input,
    outputs=[gr.Textbox(label="Greeting", lines=3)],
    title=title,
    description=description,
)

file_interface = gr.Interface(
    fn=file_func,
    inputs=file_input,
    outputs=[gr.Textbox(label="Greeting", lines=3)],
    title=title,
    description=description,
)

# Only the YouTube interface is exposed as a tab; audio_interface and
# file_interface are built above but are not added here.
demo = gr.TabbedInterface([youtube_interface], ["Transcript youtube video"])
demo.queue(max_size=20)

demo.launch(share=True)