File size: 2,684 Bytes
26089e3
 
 
 
 
ab8cb27
 
26089e3
 
 
 
 
 
 
 
 
 
 
 
8a22c32
26089e3
 
 
8a22c32
26089e3
 
 
8a22c32
 
 
26089e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8a22c32
26089e3
 
 
 
 
 
 
 
 
255c20e
26089e3
abf2ad0
26089e3
 
 
 
 
 
 
54f8344
 
 
8a22c32
54f8344
26089e3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import whisper
import multiprocessing
import os
from pydub import AudioSegment
from typing import List
import gradio as gr


# Module-level whisper model: loaded once at import time so that forked
# worker processes (see process_segment) inherit it instead of reloading.
model = whisper.load_model("base")
def convert_to_text(audio_path: str) -> str:
  """Transcribe an audio file to text by chunking it and transcribing
  the chunks in parallel worker processes.

  Args:
    audio_path: path to the audio file to transcribe.

  Returns:
    The full transcription, chunks joined with single spaces.
  """
  # NOTE: the original reloaded whisper.load_model("base") here on every
  # call; that local model was never used (workers use the module-level
  # one), so the redundant load has been removed.

  chunk_size = 30  # length of each segment in seconds
  audio_segments = split_audio(audio_path, chunk_size)

  # Process segments in parallel; the context manager guarantees the pool
  # is closed/joined even if a worker raises.
  print("Starting the processes....")
  with multiprocessing.Pool() as pool:
    results = pool.map(process_segment, audio_segments)

  # Combine the per-chunk transcriptions
  return ' '.join(results)
    
import os
from pydub import AudioSegment

def split_audio(audio_path: str, chunk_size: int) -> List[str]:
    """Split an audio file into chunk_size-second WAV files on disk.

    Args:
        audio_path: path to the source audio file (any format pydub reads).
        chunk_size: target length of each chunk, in seconds.

    Returns:
        List of file paths of the exported chunks, in order.
    """
    # Directory that receives the exported chunk files
    output_dir = "segmented_audio"
    os.makedirs(output_dir, exist_ok=True)

    # Open the audio file using pydub (len() is in milliseconds)
    audio = AudioSegment.from_file(audio_path)

    chunk_ms = chunk_size * 1000
    # Ceiling division so a trailing partial chunk is kept.  The original
    # int(duration / chunk_size) truncated, silently dropping up to
    # chunk_size - 1 seconds at the end of the recording.
    num_chunks = -(-len(audio) // chunk_ms)

    duration = len(audio) / 1000  # Convert to seconds
    print(f"Chunk : Duration : {duration} : Number : {num_chunks}")

    # Split the audio into chunks and export each as WAV
    audio_segments = []
    for i in range(num_chunks):
        # pydub clamps an out-of-range end index, so the last (possibly
        # shorter) chunk is extracted correctly.
        chunk = audio[i * chunk_ms:(i + 1) * chunk_ms]

        chunk_path = os.path.join(output_dir, f"chunk_{i}.wav")
        chunk.export(chunk_path, format="wav")

        print(f"Chunk number {i} path : {chunk_path}")
        audio_segments.append(chunk_path)

    print(f"Audio split into : {len(audio_segments)}")

    return audio_segments


def process_segment(segment_path: str) -> str:
    """Transcribe one audio chunk with the module-level whisper model.

    Args:
        segment_path: path to a chunk file produced by split_audio.

    Returns:
        The transcribed text for this chunk.
    """
    print(f"Processing segment : {segment_path}")

    transcription = model.transcribe(segment_path)
    print(transcription['text'])
    return transcription["text"]

def get_results(path):
  """Transcribe the audio file at *path*, print the result, and return
  the sentinel string "complete" (the original return value is kept for
  backward compatibility).

  Bug fixed: the original spawned a multiprocessing.Process calling
  process_segment(seg, q), but process_segment accepts a single argument
  and never writes to the queue — the child died with a TypeError and
  q.get() blocked the parent forever.  It also passed the already
  transcribed text (not a file path) as the segment.  The transcription
  is fully produced by convert_to_text, so the extra process is removed.
  """
  text = convert_to_text(path)
  print(text)
  return "complete"

# Gradio UI: an audio-upload widget whose file path is fed straight to
# convert_to_text; the transcription is shown as plain text.
# NOTE(review): launch() blocks, so this runs on import — presumably this
# file is only ever executed as a script; confirm before importing it.
ad = gr.components.Audio(type='filepath')
iface = gr.Interface(fn=convert_to_text, inputs=ad, outputs="text")
iface.launch()