AshDavid12 committed
Commit 7380009 · 1 Parent(s): 8e3c59e

trying to build simple transcribe - testing
Files changed (4)
  1. Dockerfile +21 -14
  2. infer.py +97 -120
  3. requirements.txt +5 -5
  4. whisper_online.py +0 -687
Dockerfile CHANGED
@@ -1,20 +1,27 @@
- # Include Python
- from python:3.11.1-buster
-
- # Define your working directory
- WORKDIR /
-
- # Install runpod
- RUN pip install runpod
- RUN pip install torch==2.3.1
- RUN pip install faster-whisper
-
- RUN python3 -c 'import faster_whisper; m = faster_whisper.WhisperModel("ivrit-ai/faster-whisper-v2-d4")'
-
- # Add your file
- ADD infer.py .
-
- ENV LD_LIBRARY_PATH="/usr/local/lib/python3.11/site-packages/nvidia/cudnn/lib:/usr/local/lib/python3.11/site-packages/nvidia/cublas/lib"
-
- # Call your file when your container starts
- CMD [ "python", "-u", "/infer.py" ]
+ # Use an official Python runtime as a base image
+ FROM python:3.9-slim
+
+ # Set the working directory
+ WORKDIR /app
+
+ # Install system dependencies for soundfile and any other audio-related processing
+ RUN apt-get update && \
+     apt-get install -y libsndfile1 && \
+     rm -rf /var/lib/apt/lists/*
+
+ # Install dependencies for Hugging Face Spaces (git for model fetching)
+ RUN apt-get install -y git
+
+ # Copy the requirements.txt file and install the dependencies
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy the current directory contents into the container at /app
+ COPY . .
+
+ # Hugging Face Spaces will expose port 7860 by default for web applications
+ EXPOSE 7860
+
+ # Command to run the transcription script or API server on Hugging Face
+ CMD ["uvicorn", "infer:app", "--host", "0.0.0.0", "--port", "7860"]
 
 
 
infer.py CHANGED
@@ -1,131 +1,108 @@
- import base64
- import faster_whisper
- import tempfile
  import torch
- import requests
-
- device = 'cuda' if torch.cuda.is_available() else 'cpu'
-
- # Load the model from Hugging Face
- model_name = 'ivrit-ai/faster-whisper-v2-d4'
- model = faster_whisper.WhisperModel(model_name, device=device)
-
- # Maximum data size: 200MB
- MAX_PAYLOAD_SIZE = 200 * 1024 * 1024
-
-
- def download_file(url, max_size_bytes, output_filename, api_key=None):
-     """
-     Download a file from a given URL with size limit and optional API key.
-
-     Args:
-         url (str): The URL of the file to download.
-         max_size_bytes (int): Maximum allowed file size in bytes.
-         output_filename (str): The name of the file to save the download as.
-         api_key (str, optional): API key to be used as a bearer token.
-
-     Returns:
-         bool: True if download was successful, False otherwise.
-     """
      try:
-         headers = {}
-         if api_key:
-             headers['Authorization'] = f'Bearer {api_key}'
-
-         response = requests.get(url, stream=True, headers=headers)
-         response.raise_for_status()
-
-         file_size = int(response.headers.get('Content-Length', 0))
-
-         if file_size > max_size_bytes:
-             print(f"File size ({file_size} bytes) exceeds the maximum allowed size ({max_size_bytes} bytes).")
-             return False
-
-         downloaded_size = 0
-         with open(output_filename, 'wb') as file:
-             for chunk in response.iter_content(chunk_size=8192):
-                 downloaded_size += len(chunk)
-                 if downloaded_size > max_size_bytes:
-                     print(f"Download stopped: Size limit exceeded ({max_size_bytes} bytes).")
-                     return False
-                 file.write(chunk)
-
-         print(f"File downloaded successfully: {output_filename}")
-         return True
-
-     except requests.RequestException as e:
-         print(f"Error downloading file: {e}")
-         return False
-
-
- def transcribe(job):
-     datatype = job['input'].get('type', None)
-     if not datatype:
-         return {"error": "datatype field not provided. Should be 'blob' or 'url'."}
-
-     if datatype not in ['blob', 'url']:
-         return {"error": f"datatype should be 'blob' or 'url', but is {datatype} instead."}
-
-     api_key = job['input'].get('api_key', None)
-
-     with tempfile.TemporaryDirectory() as d:
-         audio_file = f'{d}/audio.mp3'
-
-         if datatype == 'blob':
-             mp3_bytes = base64.b64decode(job['input']['data'])
-             with open(audio_file, 'wb') as file:
-                 file.write(mp3_bytes)
-         elif datatype == 'url':
-             success = download_file(job['input']['url'], MAX_PAYLOAD_SIZE, audio_file, api_key)
-             if not success:
-                 return {"error": f"Error downloading data from {job['input']['url']}"}
-
-         result = transcribe_core(audio_file)
-         return {'result': result}
-
-
- def transcribe_core(audio_file):
-     print('Transcribing...')
-
-     ret = {'segments': []}
-
-     segs, _ = model.transcribe(audio_file, language='he', word_timestamps=True)
-     for s in segs:
-         words = []
-         for w in s.words:
-             words.append({
-                 'start': w.start,
-                 'end': w.end,
-                 'word': w.word,
-                 'probability': w.probability
-             })
-
-         seg = {
-             'id': s.id,
-             'seek': s.seek,
-             'start': s.start,
-             'end': s.end,
-             'text': s.text,
-             'avg_logprob': s.avg_logprob,
-             'compression_ratio': s.compression_ratio,
-             'no_speech_prob': s.no_speech_prob,
-             'words': words
-         }
-
-         print(seg)
-         ret['segments'].append(seg)
-
-     return ret
-
-
- # The script can be run directly or served using Hugging Face's Gradio app or API
  if __name__ == "__main__":
-     # For testing purposes, you can define a sample job and call the transcribe function
-     test_job = {
-         "input": {
-             "type": "url",
-             "url": "https://github.com/metaldaniel/HebrewASR-Comparison/raw/main/HaTankistiot_n12-mp3.mp3",
-             "api_key": "your_api_key_here"  # Optional, replace with actual key if needed
-         }
-     }
-     print(transcribe(test_job))
  import torch
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
+ import soundfile as sf
+ from fastapi import FastAPI, File, UploadFile
+ import uvicorn
+ import os
+ import logging
+ from datetime import datetime
+
+ # Set up logging
+ logging.basicConfig(
+     filename="transcription_log.log",
+     format="%(asctime)s - %(levelname)s - %(message)s",
+     level=logging.INFO
+ )
+
+ # Initialize FastAPI app
+ app = FastAPI()
+
+ # Log initialization of the application
+ logging.info("FastAPI application started.")
+
+ # Load the Whisper model and processor
+ model_name = "openai/whisper-base"
+ logging.info(f"Loading Whisper model: {model_name}")
+
+ try:
+     processor = WhisperProcessor.from_pretrained(model_name)
+     model = WhisperForConditionalGeneration.from_pretrained(model_name)
+     logging.info(f"Model {model_name} successfully loaded.")
+ except Exception as e:
+     logging.error(f"Error loading the model: {e}")
+     raise e
+
+ # Move model to the appropriate device (GPU if available)
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ model.to(device)
+ logging.info(f"Model is using device: {device}")
+
+
+ @app.post("/transcribe/")
+ async def transcribe_audio(file: UploadFile = File(...)):
+     # Log file upload start
+     logging.info(f"Received audio file: {file.filename}")
+     start_time = datetime.now()
+
+     # Save the uploaded file
+     file_location = f"temp_{file.filename}"
      try:
+         with open(file_location, "wb+") as f:
+             f.write(await file.read())
+         logging.info(f"File saved to: {file_location}")
+     except Exception as e:
+         logging.error(f"Error saving the file: {e}")
+         return {"error": f"Error saving the file: {e}"}
+
+     # Load the audio file and preprocess it
+     try:
+         audio_input, _ = sf.read(file_location)
+         logging.info(f"Audio file {file.filename} successfully read.")
+
+         inputs = processor(audio_input, return_tensors="pt", sampling_rate=16000)
+         logging.info(f"Audio file preprocessed for transcription.")
+     except Exception as e:
+         logging.error(f"Error processing the audio file: {e}")
+         return {"error": f"Error processing the audio file: {e}"}
+
+     # Move inputs to the same device as the model
+     inputs = {key: value.to(device) for key, value in inputs.items()}
+     logging.info("Inputs moved to the appropriate device.")
+
+     # Generate the transcription
+     try:
+         with torch.no_grad():
+             predicted_ids = model.generate(inputs["input_features"])
+         logging.info("Transcription successfully generated.")
+     except Exception as e:
+         logging.error(f"Error during transcription generation: {e}")
+         return {"error": f"Error during transcription generation: {e}"}
+
+     # Decode the transcription
+     try:
+         transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+         logging.info("Transcription successfully decoded.")
+     except Exception as e:
+         logging.error(f"Error decoding the transcription: {e}")
+         return {"error": f"Error decoding the transcription: {e}"}
+
+     # Clean up the temporary file
+     try:
+         os.remove(file_location)
+         logging.info(f"Temporary file {file_location} deleted.")
+     except Exception as e:
+         logging.error(f"Error deleting the temporary file: {e}")
+
+     end_time = datetime.now()
+     time_taken = end_time - start_time
+     logging.info(f"Transcription completed in {time_taken.total_seconds()} seconds.")
+
+     return {"transcription": transcription, "processing_time_seconds": time_taken.total_seconds()}
+
+
  if __name__ == "__main__":
+     # Log application start
+     logging.info("Starting FastAPI server with Uvicorn...")
+
+     # Run the FastAPI app on the default port (7860)
+     uvicorn.run(app, host="0.0.0.0", port=7860)
 
 
 
 
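Review note: the new `/transcribe/` endpoint accepts a multipart file upload and returns JSON with the transcription and the processing time. Below is a minimal client sketch for local testing, not part of the commit; it assumes the server from this commit is reachable on localhost:7860 and that `sample.wav` (a hypothetical file) is 16 kHz mono, since infer.py passes the raw soundfile output to WhisperProcessor with `sampling_rate=16000` and does not resample.

```python
# Minimal client sketch for the /transcribe/ endpoint added in this commit.
# Assumptions: the uvicorn server is running on localhost:7860 (as in the new
# Dockerfile CMD) and "sample.wav" is a hypothetical 16 kHz mono recording.
import requests

with open("sample.wav", "rb") as f:
    resp = requests.post(
        "http://localhost:7860/transcribe/",
        # Field name "file" matches the UploadFile parameter in infer.py.
        files={"file": ("sample.wav", f, "audio/wav")},
    )

resp.raise_for_status()
payload = resp.json()
print(payload.get("transcription"))
print("took", payload.get("processing_time_seconds"), "s")
```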
requirements.txt CHANGED
@@ -1,5 +1,5 @@
- numpy
- librosa
- runpod
- faster-whisper
- torch==2.3.1
+ fastapi
+ uvicorn
+ torch
+ transformers
+ soundfile
whisper_online.py DELETED
@@ -1,687 +0,0 @@
- #!/usr/bin/env python3
- import sys
- import numpy as np
- import librosa
- from functools import lru_cache
- import time
- import logging
- import runpod
- import base64
- import io
- import soundfile as sf
- import math
- import os
- from dotenv import load_dotenv
- import openai
- #from voice_activity_controller import *
-
- load_dotenv('.env')
- OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
- RUN_POD_API_KEY = os.getenv('RUN_POD_API_KEY')
- RUNPOD_ENDPOINT_ID = os.getenv('RUNPOD_ENDPOINT_ID')
- openai.api_key = OPENAI_API_KEY
-
- # Set up basic configuration for logging
- logging.basicConfig(
-     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-     level=logging.DEBUG  # Set to DEBUG to capture all levels of log messages
- )
-
- # Use the root logger directly
- log = logging.getLogger(__name__)
-
- if not OPENAI_API_KEY:
-     log.error("API key not found. Please set the OPENAI_API_KEY environment variable.")
-     sys.exit(1)
-
- log.debug(f"Using API Key: {OPENAI_API_KEY[:5]}...")
-
- from faster_whisper import WhisperModel
-
- logger = logging.getLogger(__name__)
-
-
- @lru_cache
- def load_audio(fname):
-     a, _ = librosa.load(fname, sr=16000, dtype=np.float32)
-     return a
-
-
- def load_audio_chunk(fname, beg, end):
-     audio = load_audio(fname)
-     beg_s = int(beg * 16000)
-     end_s = int(end * 16000)
-     return audio[beg_s:end_s]
-
-
- # Whisper backend
-
- class ASRBase:
-     sep = " "  # join transcribe words with this character (" " for whisper_timestamped,
-
-     # "" for faster-whisper because it emits the spaces when neeeded)
-
-     def __init__(self, lan, modelsize=None, cache_dir=None, model_dir=None, logfile=sys.stderr):
-         self.logfile = logfile
-
-         self.transcribe_kargs = {}
-         if lan == "auto":
-             self.original_language = None
-         else:
-             self.original_language = lan
-
-         self.model = self.load_model(modelsize, cache_dir, model_dir)
-
-     def load_model(self, modelsize, cache_dir):
-         raise NotImplemented("must be implemented in the child class")
-
-     def transcribe(self, audio, init_prompt=""):
-         raise NotImplemented("must be implemented in the child class")
-
-     def use_vad(self):
-         raise NotImplemented("must be implemented in the child class")
-
-
- class IvritOnRunPodASR(ASRBase):
-     """Uses ivrit-ai API for audio transcription."""
-
-     def __init__(self, lan=None, api_key=None, endpoint_id=None, logfile=sys.stderr):
-         self.logfile = logfile
-         self.original_language = None if lan == "auto" else lan  # ISO-639-1 language code
-         if api_key is None or endpoint_id is None:
-             raise ValueError("API key and Endpoint ID must be provided for Runpod API")
-         runpod.api_key = api_key
-         self.endpoint = runpod.Endpoint(endpoint_id)
-         self.transcribed_seconds = 0  # For logging how many seconds were processed by API, to know the cost
-         self.use_vad_opt = False
-
-     def ts_words(self, segments):
-         if not segments:  # Check if segments is empty
-             logger.warning("No segments found in the response.")
-             return []
-         no_speech_segments = []
-         if self.use_vad_opt:
-             for segment in segments:
-                 if segment["no_speech_prob"] > 0.8:
-                     no_speech_segments.append((segment.get("start"), segment.get("end")))
-         o = []
-         for segment in segments:
-             # Checking if 'word' is part of the segment and then processing it
-             start = segment.get("start")
-             end = segment.get("end")
-             text = segment.get("text", "")  # Assuming each segment is a dictionary with a 'word' key
-             if text and not any(s[0] <= start <= s[1] for s in no_speech_segments):
-                 o.append((start, end, text))
-         return o
-
-     def segments_end_ts(self, res):
-         return [s["end"] for s in res]
-
-     def transcribe(self, audio_data, prompt=None, *args, **kwargs):
-         # Write the audio data to a buffer
-         buffer = io.BytesIO()
-         buffer.name = "temp.wav"
-         sf.write(buffer, audio_data, samplerate=16000, format='WAV', subtype='PCM_16')
-         buffer.seek(0)  # Reset buffer's position to the beginning
-         self.transcribed_seconds += math.ceil(len(audio_data) / 16000)  # it rounds up to the whole seconds
-         # Convert the audio to base64
-         audio_base64 = base64.b64encode(buffer.read()).decode('utf-8')
-         payload = {
-             'type': 'blob',
-             'data': audio_base64
-         }
-         try:
-             # Send the request to Runpod API
-             res = self.endpoint.run_sync(payload)
-             # res['result']
-             # logger.debug(f"Transcription response: {res}")  # Debugging line ##THIS CAUSES TO OUTPUT THE JUNK
-         except Exception as e:
-             logger.error(f"Failed to transcribe audio with Runpod API: {e}")
-             return None
-         segments = res.get('result', {}).get('segments', [])
-
-         return segments
-
-     def use_vad(self):
-         self.use_vad_opt = False
-
-     def set_translate_task(self):
-         self.task = "translate"
-
-
- class HypothesisBuffer:
-
-     def __init__(self, logfile=sys.stderr):
-         self.commited_in_buffer = []
-         self.buffer = []
-         self.new = []
-
-         self.last_commited_time = 0
-         self.last_commited_word = None
-
-         self.logfile = logfile
-
-     def insert(self, new, offset):
-         # compare self.commited_in_buffer and new. It inserts only the words in new that extend the commited_in_buffer, it means they are roughly behind last_commited_time and new in content
-         # the new tail is added to self.new
-
-         new = [(a + offset, b + offset, t) for a, b, t in new]
-         self.new = [(a, b, t) for a, b, t in new if a > self.last_commited_time - 0.1]
-
-         if len(self.new) >= 1:
-             a, b, t = self.new[0]
-             if abs(a - self.last_commited_time) < 1:
-                 if self.commited_in_buffer:
-                     # it's going to search for 1, 2, ..., 5 consecutive words (n-grams) that are identical in commited and new. If they are, they're dropped.
-                     cn = len(self.commited_in_buffer)
-                     nn = len(self.new)
-                     for i in range(1, min(min(cn, nn), 5) + 1):  # 5 is the maximum
-                         c = " ".join([self.commited_in_buffer[-j][2] for j in range(1, i + 1)][::-1])
-                         tail = " ".join(self.new[j - 1][2] for j in range(1, i + 1))
-                         if c == tail:
-                             words = []
-                             for j in range(i):
-                                 words.append(repr(self.new.pop(0)))
-                             words_msg = " ".join(words)
-                             logger.debug(f"removing last {i} words: {words_msg}")
-                             break
-
-     def flush(self):
-         # returns commited chunk = the longest common prefix of 2 last inserts.
-
-         commit = []
-         while self.new:
-             na, nb, nt = self.new[0]
-
-             if len(self.buffer) == 0:
-                 break
-
-             if nt == self.buffer[0][2]:
-                 commit.append((na, nb, nt))
-                 self.last_commited_word = nt
-                 self.last_commited_time = nb
-                 self.buffer.pop(0)
-                 self.new.pop(0)
-             else:
-                 break
-         self.buffer = self.new
-         self.new = []
-         self.commited_in_buffer.extend(commit)
-         return commit
-
-     def pop_commited(self, time):
-         while self.commited_in_buffer and self.commited_in_buffer[0][1] <= time:
-             self.commited_in_buffer.pop(0)
-
-     def complete(self):
-         return self.buffer
-
-
- class OnlineASRProcessor:
-     SAMPLING_RATE = 16000
-
-     def __init__(self, asr, tokenizer=None, buffer_trimming=("segment", 15), logfile=sys.stderr):
-         """asr: WhisperASR object
-         tokenizer: sentence tokenizer object for the target language. Must have a method *split* that behaves like the one of MosesTokenizer. It can be None, if "segment" buffer trimming option is used, then tokenizer is not used at all.
-         ("segment", 15)
-         buffer_trimming: a pair of (option, seconds), where option is either "sentence" or "segment", and seconds is a number. Buffer is trimmed if it is longer than "seconds" threshold. Default is the most recommended option.
-         logfile: where to store the log.
-         """
-         self.asr = asr
-         self.tokenizer = tokenizer
-         self.logfile = logfile
-
-         self.init()
-
-         self.buffer_trimming_way, self.buffer_trimming_sec = buffer_trimming
-
-     def init(self, offset=None):
-         """run this when starting or restarting processing"""
-         self.audio_buffer = np.array([], dtype=np.float32)
-         self.transcript_buffer = HypothesisBuffer(logfile=self.logfile)
-         self.buffer_time_offset = 0
-         if offset is not None:
-             self.buffer_time_offset = offset
-         self.transcript_buffer.last_commited_time = self.buffer_time_offset
-         self.commited = []
-
-     def insert_audio_chunk(self, audio):
-         self.audio_buffer = np.append(self.audio_buffer, audio)
-
-     def prompt(self):
-         """Returns a tuple: (prompt, context), where "prompt" is a 200-character suffix of commited text that is inside of the scrolled away part of audio buffer.
-         "context" is the commited text that is inside the audio buffer. It is transcribed again and skipped. It is returned only for debugging and logging reasons.
-         """
-         k = max(0, len(self.commited) - 1)
-         while k > 0 and self.commited[k - 1][1] > self.buffer_time_offset:
-             k -= 1
-
-         p = self.commited[:k]
-         p = [t for _, _, t in p]
-         prompt = []
-         l = 0
-         while p and l < 200:  # 200 characters prompt size
-             x = p.pop(-1)
-             l += len(x) + 1
-             prompt.append(x)
-         non_prompt = self.commited[k:]
-         return self.asr.sep.join(prompt[::-1]), self.asr.sep.join(t for _, _, t in non_prompt)
-
-     def process_iter(self):
-         """Runs on the current audio buffer.
-         Returns: a tuple (beg_timestamp, end_timestamp, "text"), or (None, None, "").
-         The non-emty text is confirmed (committed) partial transcript.
-         """
-
-         prompt, non_prompt = self.prompt()
-         logger.debug(f"PROMPT: {prompt}")
-         logger.debug(f"CONTEXT: {non_prompt}")
-         logger.debug(
-             f"transcribing {len(self.audio_buffer) / self.SAMPLING_RATE:2.2f} seconds from {self.buffer_time_offset:2.2f}")
-         res = self.asr.transcribe(self.audio_buffer, init_prompt=prompt)
-
-         # transform to [(beg,end,"word1"), ...]
-         tsw = self.asr.ts_words(res)
-
-         self.transcript_buffer.insert(tsw, self.buffer_time_offset)
-         o = self.transcript_buffer.flush()
-         self.commited.extend(o)
-         completed = self.to_flush(o)
-         logger.debug(f">>>>COMPLETE NOW: {completed}")
-         the_rest = self.to_flush(self.transcript_buffer.complete())
-         logger.debug(f"INCOMPLETE: {the_rest}")
-
-         # there is a newly confirmed text
-
-         if o and self.buffer_trimming_way == "sentence":  # trim the completed sentences
-             if len(self.audio_buffer) / self.SAMPLING_RATE > self.buffer_trimming_sec:  # longer than this
-                 self.chunk_completed_sentence()
-
-         if self.buffer_trimming_way == "segment":
-             s = self.buffer_trimming_sec  # trim the completed segments longer than s,
-         else:
-             s = 30  # if the audio buffer is longer than 30s, trim it
-
-         if len(self.audio_buffer) / self.SAMPLING_RATE > s:
-             self.chunk_completed_segment(res)
-
-         # #alternative: on any word
-         # l = self.buffer_time_offset + len(self.audio_buffer)/self.SAMPLING_RATE - 10
-         # #let's find commited word that is less
-         # k = len(self.commited)-1
-         # while k>0 and self.commited[k][1] > l:
-         #     k -= 1
-         # t = self.commited[k][1]
-         # logger.debug("chunking segment")
-         # self.chunk_at(t)
-
-         logger.debug(f"len of buffer now: {len(self.audio_buffer) / self.SAMPLING_RATE:2.2f}")
-         return self.to_flush(o)
-
-     def chunk_completed_sentence(self):
-         if self.commited == []: return
-         logger.debug(self.commited)
-         sents = self.words_to_sentences(self.commited)
-         for s in sents:
-             logger.debug(f"\t\tSENT: {s}")
-         if len(sents) < 2:
-             return
-         while len(sents) > 2:
-             sents.pop(0)
-         # we will continue with audio processing at this timestamp
-         chunk_at = sents[-2][1]
-
-         logger.debug(f"--- sentence chunked at {chunk_at:2.2f}")
-         self.chunk_at(chunk_at)
-
-     def chunk_completed_segment(self, res):
-         if self.commited == []: return
-
-         ends = self.asr.segments_end_ts(res)
-
-         t = self.commited[-1][1]
-
-         if len(ends) > 1:
-
-             e = ends[-2] + self.buffer_time_offset
-             while len(ends) > 2 and e > t:
-                 ends.pop(-1)
-                 e = ends[-2] + self.buffer_time_offset
-             if e <= t:
-                 logger.debug(f"--- segment chunked at {e:2.2f}")
-                 self.chunk_at(e)
-             else:
-                 logger.debug(f"--- last segment not within commited area")
-         else:
-             logger.debug(f"--- not enough segments to chunk")
-
-     def chunk_at(self, time):
-         """trims the hypothesis and audio buffer at "time"
-         """
-         self.transcript_buffer.pop_commited(time)
-         cut_seconds = time - self.buffer_time_offset
-         self.audio_buffer = self.audio_buffer[int(cut_seconds * self.SAMPLING_RATE):]
-         self.buffer_time_offset = time
-
-     def words_to_sentences(self, words):
-         """Uses self.tokenizer for sentence segmentation of words.
-         Returns: [(beg,end,"sentence 1"),...]
-         """
-
-         cwords = [w for w in words]
-         t = " ".join(o[2] for o in cwords)
-         s = self.tokenizer.split(t)
-         out = []
-         while s:
-             beg = None
-             end = None
-             sent = s.pop(0).strip()
-             fsent = sent
-             while cwords:
-                 b, e, w = cwords.pop(0)
-                 w = w.strip()
-                 if beg is None and sent.startswith(w):
-                     beg = b
-                 elif end is None and sent == w:
-                     end = e
-                     out.append((beg, end, fsent))
-                     break
-                 sent = sent[len(w):].strip()
-         return out
-
-     def finish(self):
-         """Flush the incomplete text when the whole processing ends.
-         Returns: the same format as self.process_iter()
-         """
-         o = self.transcript_buffer.complete()
-         f = self.to_flush(o)
-         logger.debug(f"last, noncommited: {f}")
-         self.buffer_time_offset += len(self.audio_buffer) / 16000
-         return f
-
-     def to_flush(self, sents, sep=None, offset=0, ):
-         # concatenates the timestamped words or sentences into one sequence that is flushed in one line
-         # sents: [(beg1, end1, "sentence1"), ...] or [] if empty
-         # return: (beg1,end-of-last-sentence,"concatenation of sentences") or (None, None, "") if empty
-         if sep is None:
-             sep = self.asr.sep
-         t = sep.join(s[2] for s in sents)
-         if len(sents) == 0:
-             b = None
-             e = None
-         else:
-             b = offset + sents[0][0]
-             e = offset + sents[-1][1]
-         return (b, e, t)
-
-
- ###changed for VAC
- class VACOnlineASRProcessor(OnlineASRProcessor):
-     def __init__(self, online_chunk_size, *a, **kw):
-         self.online_chunk_size = online_chunk_size
-
-         self.online = OnlineASRProcessor(*a, **kw)
-         #self.vac = VoiceActivityController(use_vad_result=False)
-
-         self.logfile = self.online.logfile
-
-         self.init()
-
-     def init(self):
-         self.online.init()
-         self.vac.reset_states()
-         self.current_online_chunk_buffer_size = 0
-         self.is_currently_final = False
-
-     def insert_audio_chunk(self, audio):
-         logger.debug(f"In Vac:Initial audio chunk size: {len(audio)} samples")
-         r = self.vac.detect_speech_iter(audio, audio_in_int16=False)
-         audio, is_final = r
-         print(is_final)
-         self.is_currently_final = is_final
-         self.online.insert_audio_chunk(audio)
-         self.current_online_chunk_buffer_size += len(audio)
-
-     def process_iter(self):
-         if self.is_currently_final:
-             return self.finish()
-         elif self.current_online_chunk_buffer_size > SAMPLING_RATE * self.online_chunk_size:
-             self.current_online_chunk_buffer_size = 0
-             ret = self.online.process_iter()
-             return ret
-         else:
-             print("no online update, only VAD", file=self.logfile)
-             return (None, None, "")
-
-     def finish(self):
-         ret = self.online.finish()
-         self.online.init(keep_offset=True)
-         self.current_online_chunk_buffer_size = 0
-         return ret
-
-     '''Wraps OnlineASRProcessor with VAC (Voice Activity Controller).
-
-     It works the same way as OnlineASRProcessor: it receives chunks of audio (e.g. 0.04 seconds),
-     it runs VAD and continuously detects whether there is speech or not.
-     When it detects end of speech (non-voice for 500ms), it makes OnlineASRProcessor to end the utterance immediately.
-     '''
-
-
- WHISPER_LANG_CODES = "af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,zh".split(
-     ",")
-
-
- def add_shared_args(parser):
-     """shared args for simulation (this entry point) and server
-     parser: argparse.ArgumentParser object
-     """
-     parser.add_argument('--min-chunk-size', type=float, default=1.0,
-                         help='Minimum audio chunk size in seconds. It waits up to this time to do processing. If the processing takes shorter time, it waits, otherwise it processes the whole segment that was received by this time.')
-     parser.add_argument('--model', type=str, default='large-v2',
-                         choices="tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2,large-v3,large".split(
-                             ","),
-                         help="Name size of the Whisper model to use (default: large-v2). The model is automatically downloaded from the model hub if not present in model cache dir.")
-     parser.add_argument('--model_cache_dir', type=str, default=None,
-                         help="Overriding the default model cache dir where models downloaded from the hub are saved")
-     parser.add_argument('--model_dir', type=str, default=None,
-                         help="Dir where Whisper model.bin and other files are saved. This option overrides --model and --model_cache_dir parameter.")
-     parser.add_argument('--lan', '--language', type=str, default='auto',
-                         help="Source language code, e.g. en,de,cs, or 'auto' for language detection.")
-     parser.add_argument('--task', type=str, default='transcribe', choices=["transcribe", "translate"],
-                         help="Transcribe or translate.")
-     parser.add_argument('--backend', type=str, default="faster-whisper",
-                         choices=["faster-whisper", "whisper_timestamped", "openai-api"],
-                         help='Load only this backend for Whisper processing.')
-     parser.add_argument('--vac', action="store_true", default=False,
-                         help='Use VAC = voice activity controller. Recommended. Requires torch.')
-     parser.add_argument('--vac-chunk-size', type=float, default=0.04, help='VAC sample size in seconds.')
-     parser.add_argument('--vad', action="store_true", default=False,
-                         help='Use VAD = voice activity detection, with the default parameters.')
-     parser.add_argument('--buffer_trimming', type=str, default="segment", choices=["sentence", "segment"],
-                         help='Buffer trimming strategy -- trim completed sentences marked with punctuation mark and detected by sentence segmenter, or the completed segments returned by Whisper. Sentence segmenter must be installed for "sentence" option.')
-     parser.add_argument('--buffer_trimming_sec', type=float, default=15,
-                         help='Buffer trimming length threshold in seconds. If buffer length is longer, trimming sentence/segment is triggered.')
-     parser.add_argument("-l", "--log-level", dest="log_level",
-                         choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], help="Set the log level",
-                         default='DEBUG')
-
-
- def asr_factory(args, logfile=sys.stderr):
-     """
-     Creates and configures an ASR and ASR Online instance based on the specified backend and arguments.
-     """
-     backend = args.backend
-     # if backend == "openai-api":
-     logger.debug("Using ivrit-ai.")
-     asr = IvritOnRunPodASR(lan=args.lan, api_key=RUN_POD_API_KEY, endpoint_id=RUNPOD_ENDPOINT_ID)
-
-     # Apply common configurations
-     if getattr(args, 'vad', False):  # Checks if VAD argument is present and True
-         logger.info("Setting VAD filter")
-         asr.use_vad()
-
-     language = args.lan
-     if args.task == "translate":
-         asr.set_translate_task()
-         tgt_language = "en"  # Whisper translates into English
-     else:
-         tgt_language = language  # Whisper transcribes in this language
-
-     # # Create the tokenizer
-     # if args.buffer_trimming == "sentence":
-     #     tokenizer = create_tokenizer(tgt_language)
-     # else:
-     tokenizer = None
-
-     # Create the OnlineASRProcessor
-     if args.vac:
-
-         online = VACOnlineASRProcessor(args.min_chunk_size, asr, tokenizer, logfile=logfile,
-                                        buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
-     else:
-         online = OnlineASRProcessor(asr, tokenizer, logfile=logfile,
-                                     buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
-
-     return asr, online
-
-
- def set_logging(args, logger, other="_server"):
-     logging.basicConfig(  # format='%(name)s
-         format='%(levelname)s\t%(message)s')
-     logger.setLevel(args.log_level)
-     logging.getLogger("whisper_online" + other).setLevel(args.log_level)
-
-
- # logging.getLogger("whisper_online_server").setLevel(args.log_level)
-
-
- if __name__ == "__main__":
-
-     import argparse
-
-     parser = argparse.ArgumentParser()
-     parser.add_argument('audio_path', type=str,
-                         help="Filename of 16kHz mono channel wav, on which live streaming is simulated.")
-     add_shared_args(parser)
-     parser.add_argument('--start_at', type=float, default=0.0, help='Start processing audio at this time.')
-     parser.add_argument('--offline', action="store_true", default=False, help='Offline mode.')
-     parser.add_argument('--comp_unaware', action="store_true", default=False,
-                         help='Computationally unaware simulation.')
-
-     args = parser.parse_args()
-
-     # reset to store stderr to different file stream, e.g. open(os.devnull,"w")
-     logfile = sys.stderr
-
-     if args.offline and args.comp_unaware:
-         logger.error("No or one option from --offline and --comp_unaware are available, not both. Exiting.")
-         sys.exit(1)
-
-     # if args.log_level:
-     #     logging.basicConfig(format='whisper-%(levelname)s:%(name)s: %(message)s',
-     #                         level=getattr(logging, args.log_level))
-
-     set_logging(args, logger)
-
-     audio_path = args.audio_path
-
-     SAMPLING_RATE = 16000
-     duration = len(load_audio(audio_path)) / SAMPLING_RATE
-     logger.info("Audio duration is: %2.2f seconds" % duration)
-
-     asr, online = asr_factory(args, logfile=logfile)
-     if args.vac:
-         min_chunk = args.vac_chunk_size
-     else:
-         min_chunk = args.min_chunk_size
-
-     # load the audio into the LRU cache before we start the timer
-     a = load_audio_chunk(audio_path, 0, 1)
-
-     # warm up the ASR because the very first transcribe takes much more time than the other
-     asr.transcribe(a)
-
-     beg = args.start_at
-     start = time.time() - beg
-
-
-     def output_transcript(o, now=None):
-         # output format in stdout is like:
-         # 4186.3606 0 1720 Takhle to je
-         # - the first three words are:
-         # - emission time from beginning of processing, in milliseconds
-         # - beg and end timestamp of the text segment, as estimated by Whisper model. The timestamps are not accurate, but they're useful anyway
-         # - the next words: segment transcript
-         if now is None:
-             now = time.time() - start
-         if o[0] is not None:
-             print("%1.4f %1.0f %1.0f %s" % (now * 1000, o[0] * 1000, o[1] * 1000, o[2]), file=logfile, flush=True)
-             print("%1.4f %1.0f %1.0f %s" % (now * 1000, o[0] * 1000, o[1] * 1000, o[2]), flush=True)
-         else:
-             # No text, so no output
-             pass
-
-
-     if args.offline:  ## offline mode processing (for testing/debugging)
-         a = load_audio(audio_path)
-         online.insert_audio_chunk(a)
-         try:
-             o = online.process_iter()
-         except AssertionError as e:
-             logger.error(f"assertion error: {repr(e)}")
-         else:
-             output_transcript(o)
-         now = None
-     elif args.comp_unaware:  # computational unaware mode
-         end = beg + min_chunk
-         while True:
-             a = load_audio_chunk(audio_path, beg, end)
-             online.insert_audio_chunk(a)
-             try:
-                 o = online.process_iter()
-             except AssertionError as e:
-                 logger.error(f"assertion error: {repr(e)}")
-                 pass
-             else:
-                 output_transcript(o, now=end)
-
-             logger.debug(f"## last processed {end:.2f}s")
-
-             if end >= duration:
-                 break
-
-             beg = end
-
-             if end + min_chunk > duration:
-                 end = duration
-             else:
-                 end += min_chunk
-         now = duration
-
-     else:  # online = simultaneous mode
-         end = 0
-         while True:
-             now = time.time() - start
-             if now < end + min_chunk:
-                 time.sleep(min_chunk + end - now)
-             end = time.time() - start
-             a = load_audio_chunk(audio_path, beg, end)
-             beg = end
-             online.insert_audio_chunk(a)
-
-             try:
-                 o = online.process_iter()
-             except AssertionError as e:
-                 logger.error(f"assertion error: {e}")
-                 pass
-             else:
-                 output_transcript(o)
-             now = time.time() - start
-             logger.debug(f"## last processed {end:.2f} s, now is {now:.2f}, the latency is {now - end:.2f}")
-
-             if end >= duration:
-                 break
-         now = None
-
-     o = online.finish()
-     output_transcript(o, now=now)