Spaces (Sleeping)
AshDavid12 committed
Commit 8e3c59e · 1 Parent(s): 9bd82d6
infer wo runpod
Browse files:
- infer.py +74 -48
- whisper_online.py +116 -324
infer.py
CHANGED
@@ -1,105 +1,131 @@
 import base64
 import faster_whisper
 import tempfile
-import logging
 import torch
-import sys
 import requests
-import os
-
-import whisper_online
-
-# Set up logging
-logger = logging.getLogger(__name__)
-logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s', handlers=[logging.StreamHandler(sys.stdout)])
-
-# Load the FasterWhisper model
 device = 'cuda' if torch.cuda.is_available() else 'cpu'
-model_name = 'ivrit-ai/faster-whisper-v2-d3-e3'
-
-    model = whisper_online.FasterWhisperASR(lan=lan, modelsize=model_name)
-    logging.info("FasterWhisperASR model initialized successfully.")
-except Exception as e:
-    logging.error(f"Failed to initialize FasterWhisperASR model: {e}")
+
+# Load the model from Hugging Face
+model_name = 'ivrit-ai/faster-whisper-v2-d4'
+model = faster_whisper.WhisperModel(model_name, device=device)
 
 # Maximum data size: 200MB
 MAX_PAYLOAD_SIZE = 200 * 1024 * 1024
 
+
 def download_file(url, max_size_bytes, output_filename, api_key=None):
     """
+    Download a file from a given URL with size limit and optional API key.
+
+    Args:
+        url (str): The URL of the file to download.
+        max_size_bytes (int): Maximum allowed file size in bytes.
+        output_filename (str): The name of the file to save the download as.
+        api_key (str, optional): API key to be used as a bearer token.
+
+    Returns:
+        bool: True if download was successful, False otherwise.
+    """
     try:
         headers = {}
         if api_key:
             headers['Authorization'] = f'Bearer {api_key}'
+
         response = requests.get(url, stream=True, headers=headers)
         response.raise_for_status()
+
         file_size = int(response.headers.get('Content-Length', 0))
+
         if file_size > max_size_bytes:
-            print(f"File size exceeds the
+            print(f"File size ({file_size} bytes) exceeds the maximum allowed size ({max_size_bytes} bytes).")
             return False
+
         downloaded_size = 0
         with open(output_filename, 'wb') as file:
             for chunk in response.iter_content(chunk_size=8192):
                 downloaded_size += len(chunk)
                 if downloaded_size > max_size_bytes:
-                    print(f"Download stopped:
+                    print(f"Download stopped: Size limit exceeded ({max_size_bytes} bytes).")
                     return False
                 file.write(chunk)
+
         print(f"File downloaded successfully: {output_filename}")
         return True
+
     except requests.RequestException as e:
         print(f"Error downloading file: {e}")
         return False
 
-def transcribe_core_whisper(audio_file):
-    """Transcribe the audio file using FasterWhisper."""
-    logging.info(f"Transcribing audio file: {audio_file}")
-    ret = {'segments': []}
-    try:
-        segs, dummy = model.transcribe(audio_file, language='he', word_timestamps=True)
-        for s in segs:
-            words = [{'start': w.start, 'end': w.end, 'word': w.word, 'probability': w.probability} for w in s.words]
-            seg = {'id': s.id, 'seek': s.seek, 'start': s.start, 'end': s.end, 'text': s.text, 'avg_logprob': s.avg_logprob,
-                   'compression_ratio': s.compression_ratio, 'no_speech_prob': s.no_speech_prob, 'words': words}
-            ret['segments'].append(seg)
-        logging.info("Transcription completed successfully.")
-    except Exception as e:
-        logging.error(f"Error during transcription: {e}", exc_info=True)
-    return ret
 
-def
-
-    logging.info(f"Processing job: {job}")
-    datatype = job.get('input', {}).get('type')
+def transcribe(job):
+    datatype = job['input'].get('type', None)
     if not datatype:
         return {"error": "datatype field not provided. Should be 'blob' or 'url'."}
+
     if datatype not in ['blob', 'url']:
-        return {"error": f"
+        return {"error": f"datatype should be 'blob' or 'url', but is {datatype} instead."}
+
+    api_key = job['input'].get('api_key', None)
 
-    api_key = job.get('input', {}).get('api_key')
     with tempfile.TemporaryDirectory() as d:
         audio_file = f'{d}/audio.mp3'
+
         if datatype == 'blob':
             mp3_bytes = base64.b64decode(job['input']['data'])
-            with open(audio_file, 'wb') as
+            with open(audio_file, 'wb') as file:
+                file.write(mp3_bytes)
         elif datatype == 'url':
             success = download_file(job['input']['url'], MAX_PAYLOAD_SIZE, audio_file, api_key)
             if not success:
-                return {"error": f"
+                return {"error": f"Error downloading data from {job['input']['url']}"}
 
-    result =
+    result = transcribe_core(audio_file)
    return {'result': result}
 
+
+def transcribe_core(audio_file):
+    print('Transcribing...')
+
+    ret = {'segments': []}
+
+    segs, _ = model.transcribe(audio_file, language='he', word_timestamps=True)
+    for s in segs:
+        words = []
+        for w in s.words:
+            words.append({
+                'start': w.start,
+                'end': w.end,
+                'word': w.word,
+                'probability': w.probability
+            })
+
+        seg = {
+            'id': s.id,
+            'seek': s.seek,
+            'start': s.start,
+            'end': s.end,
+            'text': s.text,
+            'avg_logprob': s.avg_logprob,
+            'compression_ratio': s.compression_ratio,
+            'no_speech_prob': s.no_speech_prob,
+            'words': words
+        }
+
+        print(seg)
+        ret['segments'].append(seg)
+
+    return ret
+
+
+# The script can be run directly or served using Hugging Face's Gradio app or API
 if __name__ == "__main__":
+    # For testing purposes, you can define a sample job and call the transcribe function
     test_job = {
         "input": {
             "type": "url",
             "url": "https://github.com/metaldaniel/HebrewASR-Comparison/raw/main/HaTankistiot_n12-mp3.mp3",
+            "api_key": "your_api_key_here"  # Optional, replace with actual key if needed
         }
     }
-    print(
+    print(transcribe(test_job))
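With this change infer.py no longer imports whisper_online or wires the handler to a serverless worker: transcribe(job) can be called directly with either a 'url' or a 'blob' job. Below is a minimal usage sketch of the 'blob' path; the module name infer and the file sample.mp3 are assumptions for illustration, not part of the commit.

# Hedged usage sketch: exercise transcribe(job) with a base64 'blob' payload.
# 'infer' as the module name and 'sample.mp3' are assumptions for illustration.
import base64
import infer

with open('sample.mp3', 'rb') as f:
    job = {
        'input': {
            'type': 'blob',
            'data': base64.b64encode(f.read()).decode('utf-8'),
        }
    }

response = infer.transcribe(job)
for seg in response['result']['segments']:
    print(seg['start'], seg['end'], seg['text'])

On success transcribe returns {'result': {'segments': [...]}}; on a malformed payload it returns an {'error': ...} dict instead, so callers should check for that key before indexing 'result'.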
whisper_online.py
CHANGED
@@ -5,15 +5,40 @@ import librosa
 from functools import lru_cache
 import time
 import logging
-import
-import
-
+import runpod
+import base64
 import io
 import soundfile as sf
 import math
+import os
+from dotenv import load_dotenv
+import openai
+#from voice_activity_controller import *
+
+load_dotenv('.env')
+OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
+RUN_POD_API_KEY = os.getenv('RUN_POD_API_KEY')
+RUNPOD_ENDPOINT_ID = os.getenv('RUNPOD_ENDPOINT_ID')
+openai.api_key = OPENAI_API_KEY
+
+# Set up basic configuration for logging
+logging.basicConfig(
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    level=logging.DEBUG  # Set to DEBUG to capture all levels of log messages
+)
+
+# Use the root logger directly
+log = logging.getLogger(__name__)
+
+if not OPENAI_API_KEY:
+    log.error("API key not found. Please set the OPENAI_API_KEY environment variable.")
+    sys.exit(1)
+
+log.debug(f"Using API Key: {OPENAI_API_KEY[:5]}...")
+
+from faster_whisper import WhisperModel
 
 logger = logging.getLogger(__name__)
-logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s', handlers=[logging.StreamHandler(sys.stdout)])
 
 
 @lru_cache

@@ -35,6 +60,7 @@ class ASRBase:
     sep = " "  # join transcribe words with this character (" " for whisper_timestamped,
 
                # "" for faster-whisper because it emits the spaces when neeeded)
+
     def __init__(self, lan, modelsize=None, cache_dir=None, model_dir=None, logfile=sys.stderr):
         self.logfile = logfile
 

@@ -56,217 +82,72 @@ class ASRBase:
         raise NotImplemented("must be implemented in the child class")
 
 
-
-    # On the other hand, the installation for GPU could be easier.
-    # """
-    #
-    # sep = " "
-    #
-    # def load_model(self, modelsize=None, cache_dir=None, model_dir=None):
-    #     import whisper
-    #     import whisper_timestamped
-    #     from whisper_timestamped import transcribe_timestamped
-    #     self.transcribe_timestamped = transcribe_timestamped
-    #     if model_dir is not None:
-    #         logger.debug("ignoring model_dir, not implemented")
-    #     return whisper.load_model(modelsize, download_root=cache_dir)
-    #
-    # def transcribe(self, audio, init_prompt=""):
-    #     result = self.transcribe_timestamped(self.model,
-    #             audio, language=self.original_language,
-    #             initial_prompt=init_prompt, verbose=None,
-    #             condition_on_previous_text=True, **self.transcribe_kargs)
-    #     return result
-    #
-    # def ts_words(self, r):
-    #     # return: transcribe result object to [(beg,end,"word1"), ...]
-    #     o = []
-    #     for s in r["segments"]:
-    #         for w in s["words"]:
-    #             t = (w["start"], w["end"], w["text"])
-    #             o.append(t)
-    #     return o
-    #
-    # def segments_end_ts(self, res):
-    #     return [s["end"] for s in res["segments"]]
-    #
-    # def use_vad(self):
-    #     self.transcribe_kargs["vad"] = True
-    #
-    # def set_translate_task(self):
-    #     self.transcribe_kargs["task"] = "translate"
-    #
-
-class FasterWhisperASR(ASRBase):
-    logging.info(f"In faster whisper ASR")
-    """Uses faster-whisper library as the backend. Works much faster, appx 4-times (in offline mode). For GPU, it requires installation with a specific CUDNN version.
-    """
-
-    sep = ""
-
-    def load_model(self, modelsize=None, cache_dir="/tmp/.cache/huggingface", model_dir=None):
-        from faster_whisper import WhisperModel
-        # logging.getLogger("faster_whisper").setLevel(logger.level)
-
-        logging.info("Starting model loading process...")
-        logging.info(f"Model loading parameters - modelsize: {modelsize}, cache_dir: {cache_dir}, model_dir: {model_dir}")
-
-        try:
-            logging.info(f"Loading WhisperModel on device: ")
-            os.environ['SENTENCE_TRANSFORMERS_HOME'] = '/tmp/.cache/sentence_transformers'
-            os.environ['HF_HOME'] = '/tmp/.cache/huggingface'
-            # Ensure the cache directory exists
-            os.makedirs(cache_dir, exist_ok=True)
-            model = WhisperModel(model_size_or_path, device="cuda", compute_type="float16", download_root=cache_dir)
-            logging.info("Model loaded successfully.")
-        except Exception as e:
-            logging.error(f"An error occurred while loading the model: {e}", exc_info=True)
-            raise
-
-        # this worked fast and reliably on NVIDIA L40
-        #model = WhisperModel(model_size_or_path, device="cuda", compute_type="float16", download_root=cache_dir)
-
-        # or run on GPU with INT8
-        # tested: the transcripts were different, probably worse than with FP16, and it was slightly (appx 20%) slower
-        # model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")
-
-        # or run on CPU with INT8
-        # tested: works, but slow, appx 10-times than cuda FP16
-        # model = WhisperModel(modelsize, device="cpu", compute_type="int8") #, download_root="faster-disk-cache-dir/")
-        return model
-
-    def transcribe(self, audio, init_prompt=""):
-        logging.info("Starting transcription process...")
-        logging.debug(f"Transcription parameters - language: {self.original_language}, initial_prompt: '{init_prompt}'")
-
-        try:
-            # tested: beam_size=5 is faster and better than 1 (on one 200 second document from En ESIC, min chunk 0.01)
-            segments, info = self.model.transcribe(audio, language=self.original_language, initial_prompt=init_prompt,
-                                                   beam_size=5, word_timestamps=True, condition_on_previous_text=True,
-                                                   **self.transcribe_kargs)
-            logging.info("Transcription completed successfully.")
-            logging.debug(f"Transcription info: {info}")
-        except Exception as e:
-            logging.error(f"An error occurred during transcription: {e}", exc_info=True)
-            raise
-        return list(segments)
+class IvritOnRunPodASR(ASRBase):
+    """Uses ivrit-ai API for audio transcription."""
+
+    def __init__(self, lan=None, api_key=None, endpoint_id=None, logfile=sys.stderr):
+        self.logfile = logfile
+        self.original_language = None if lan == "auto" else lan  # ISO-639-1 language code
+        if api_key is None or endpoint_id is None:
+            raise ValueError("API key and Endpoint ID must be provided for Runpod API")
+        runpod.api_key = api_key
+        self.endpoint = runpod.Endpoint(endpoint_id)
+        self.transcribed_seconds = 0  # For logging how many seconds were processed by API, to know the cost
+        self.use_vad_opt = False
 
     def ts_words(self, segments):
+        if not segments:  # Check if segments is empty
+            logger.warning("No segments found in the response.")
+            return []
+        no_speech_segments = []
+        if self.use_vad_opt:
+            for segment in segments:
+                if segment["no_speech_prob"] > 0.8:
+                    no_speech_segments.append((segment.get("start"), segment.get("end")))
         o = []
         for segment in segments:
-            o.append(t)
+            # Checking if 'word' is part of the segment and then processing it
+            start = segment.get("start")
+            end = segment.get("end")
+            text = segment.get("text", "")  # Assuming each segment is a dictionary with a 'word' key
+            if text and not any(s[0] <= start <= s[1] for s in no_speech_segments):
+                o.append((start, end, text))
         return o
 
     def segments_end_ts(self, res):
-        return [s
+        return [s["end"] for s in res]
+
+    def transcribe(self, audio_data, prompt=None, *args, **kwargs):
+        # Write the audio data to a buffer
+        buffer = io.BytesIO()
+        buffer.name = "temp.wav"
+        sf.write(buffer, audio_data, samplerate=16000, format='WAV', subtype='PCM_16')
+        buffer.seek(0)  # Reset buffer's position to the beginning
+        self.transcribed_seconds += math.ceil(len(audio_data) / 16000)  # it rounds up to the whole seconds
+        # Convert the audio to base64
+        audio_base64 = base64.b64encode(buffer.read()).decode('utf-8')
+        payload = {
+            'type': 'blob',
+            'data': audio_base64
+        }
+        try:
+            # Send the request to Runpod API
+            res = self.endpoint.run_sync(payload)
+            # res['result']
+            # logger.debug(f"Transcription response: {res}")  # Debugging line ##THIS CAUSES TO OUTPUT THE JUNK
+        except Exception as e:
+            logger.error(f"Failed to transcribe audio with Runpod API: {e}")
+            return None
+        segments = res.get('result', {}).get('segments', [])
+
+        return segments
 
     def use_vad(self):
-        self.
+        self.use_vad_opt = False
 
     def set_translate_task(self):
-        self.
+        self.task = "translate"
+
-
-
-# class OpenaiApiASR(ASRBase):
-#     """Uses OpenAI's Whisper API for audio transcription."""
-#
-#     def __init__(self, lan=None, temperature=0, logfile=sys.stderr):
-#         self.logfile = logfile
-#
-#         self.modelname = "whisper-1"
-#         self.original_language = None if lan == "auto" else lan  # ISO-639-1 language code
-#         self.response_format = "verbose_json"
-#         self.temperature = temperature
-#
-#         self.load_model()
-#
-#         self.use_vad_opt = False
-#
-#         # reset the task in set_translate_task
-#         self.task = "transcribe"
-#
-#     def load_model(self, *args, **kwargs):
-#         from openai import OpenAI
-#         self.client = OpenAI()
-#
-#         self.transcribed_seconds = 0  # for logging how many seconds were processed by API, to know the cost
-#
-#     def ts_words(self, segments):
-#         no_speech_segments = []
-#         if self.use_vad_opt:
-#             for segment in segments.segments:
-#                 # TODO: threshold can be set from outside
-#                 if segment["no_speech_prob"] > 0.8:
-#                     no_speech_segments.append((segment.get("start"), segment.get("end")))
-#
-#         o = []
-#         for word in segments.words:
-#             start = word.get("start")
-#             end = word.get("end")
-#             if any(s[0] <= start <= s[1] for s in no_speech_segments):
-#                 # print("Skipping word", word.get("word"), "because it's in a no-speech segment")
-#                 continue
-#             o.append((start, end, word.get("word")))
-#         return o
-#
-#     def segments_end_ts(self, res):
-#         return [s["end"] for s in res.words]
-#
-#     def transcribe(self, audio_data, prompt=None, *args, **kwargs):
-#         # Write the audio data to a buffer
-#         buffer = io.BytesIO()
-#         buffer.name = "temp.wav"
-#         sf.write(buffer, audio_data, samplerate=16000, format='WAV', subtype='PCM_16')
-#         buffer.seek(0)  # Reset buffer's position to the beginning
-#
-#         self.transcribed_seconds += math.ceil(len(audio_data) / 16000)  # it rounds up to the whole seconds
-#
-#         params = {
-#             "model": self.modelname,
-#             "file": buffer,
-#             "response_format": self.response_format,
-#             "temperature": self.temperature,
-#             "timestamp_granularities": ["word", "segment"]
-#         }
-#         if self.task != "translate" and self.original_language:
-#             params["language"] = self.original_language
-#         if prompt:
-#             params["prompt"] = prompt
-#
-#         if self.task == "translate":
-#             proc = self.client.audio.translations
-#         else:
-#             proc = self.client.audio.transcriptions
-#
-#         # Process transcription/translation
-#         transcript = proc.create(**params)
-#         logger.debug(f"OpenAI API processed accumulated {self.transcribed_seconds} seconds")
-#
-#         return transcript
-#
-#     def use_vad(self):
-#         self.use_vad_opt = True
-#
-#     def set_translate_task(self):
-#         self.task = "translate"
-#
 
 class HypothesisBuffer:
 

@@ -424,14 +305,14 @@ class OnlineASRProcessor:
             if len(self.audio_buffer) / self.SAMPLING_RATE > s:
                 self.chunk_completed_segment(res)
 
-                # alternative: on any word
+                # #alternative: on any word
                 # l = self.buffer_time_offset + len(self.audio_buffer)/self.SAMPLING_RATE - 10
-                # let's find commited word that is less
+                # #let's find commited word that is less
                 # k = len(self.commited)-1
                 # while k>0 and self.commited[k][1] > l:
                 #     k -= 1
                 # t = self.commited[k][1]
-                logger.debug("chunking segment")
+                # logger.debug("chunking segment")
                 # self.chunk_at(t)
 
         logger.debug(f"len of buffer now: {len(self.audio_buffer) / self.SAMPLING_RATE:2.2f}")

@@ -534,134 +415,60 @@ class OnlineASRProcessor:
         return (b, e, t)
 
 
+###changed for VAC
 class VACOnlineASRProcessor(OnlineASRProcessor):
-    '''Wraps OnlineASRProcessor with VAC (Voice Activity Controller).
-
-    It works the same way as OnlineASRProcessor: it receives chunks of audio (e.g. 0.04 seconds),
-    it runs VAD and continuously detects whether there is speech or not.
-    When it detects end of speech (non-voice for 500ms), it makes OnlineASRProcessor to end the utterance immediately.
-    '''
-
     def __init__(self, online_chunk_size, *a, **kw):
         self.online_chunk_size = online_chunk_size
 
         self.online = OnlineASRProcessor(*a, **kw)
-
-        # VAC:
-        import torch
-        model, _ = torch.hub.load(
-            repo_or_dir='snakers4/silero-vad',
-            model='silero_vad'
-        )
-        #from silero_vad import VADIterator
-        #self.vac = VADIterator(model)  # we use all the default options: 500ms silence, etc.
+        #self.vac = VoiceActivityController(use_vad_result=False)
 
         self.logfile = self.online.logfile
+
         self.init()
 
     def init(self):
         self.online.init()
         self.vac.reset_states()
         self.current_online_chunk_buffer_size = 0
-
         self.is_currently_final = False
 
-        self.status = None  # or "voice" or "nonvoice"
-        self.audio_buffer = np.array([], dtype=np.float32)
-        self.buffer_offset = 0  # in frames
-
-    def clear_buffer(self):
-        self.buffer_offset += len(self.audio_buffer)
-        self.audio_buffer = np.array([], dtype=np.float32)
-
     def insert_audio_chunk(self, audio):
-                send_audio = self.audio_buffer[frame - self.buffer_offset:]
-                self.online.init(offset=frame / self.SAMPLING_RATE)
-                self.online.insert_audio_chunk(send_audio)
-                self.current_online_chunk_buffer_size += len(send_audio)
-                self.clear_buffer()
-            elif 'end' in res and 'start' not in res:
-                self.status = 'nonvoice'
-                send_audio = self.audio_buffer[:frame - self.buffer_offset]
-                self.online.insert_audio_chunk(send_audio)
-                self.current_online_chunk_buffer_size += len(send_audio)
-                self.is_currently_final = True
-                self.clear_buffer()
-            else:
-                # It doesn't happen in the current code.
-                raise NotImplemented("both start and end of voice in one chunk!!!")
-        else:
-            if self.status == 'voice':
-                self.online.insert_audio_chunk(self.audio_buffer)
-                self.current_online_chunk_buffer_size += len(self.audio_buffer)
-                self.clear_buffer()
-            else:
-                # We keep 1 second because VAD may later find start of voice in it.
-                # But we trim it to prevent OOM.
-                self.buffer_offset += max(0, len(self.audio_buffer) - self.SAMPLING_RATE)
-                self.audio_buffer = self.audio_buffer[-self.SAMPLING_RATE:]
+        logger.debug(f"In Vac:Initial audio chunk size: {len(audio)} samples")
+        r = self.vac.detect_speech_iter(audio, audio_in_int16=False)
+        audio, is_final = r
+        print(is_final)
+        self.is_currently_final = is_final
+        self.online.insert_audio_chunk(audio)
+        self.current_online_chunk_buffer_size += len(audio)
 
     def process_iter(self):
         if self.is_currently_final:
             return self.finish()
-        elif self.current_online_chunk_buffer_size >
+        elif self.current_online_chunk_buffer_size > SAMPLING_RATE * self.online_chunk_size:
             self.current_online_chunk_buffer_size = 0
             ret = self.online.process_iter()
             return ret
         else:
-            print("no online update, only VAD",
+            print("no online update, only VAD", file=self.logfile)
             return (None, None, "")
 
     def finish(self):
         ret = self.online.finish()
+        self.online.init(keep_offset=True)
         self.current_online_chunk_buffer_size = 0
-        self.is_currently_final = False
         return ret
 
+'''Wraps OnlineASRProcessor with VAC (Voice Activity Controller).
+
+It works the same way as OnlineASRProcessor: it receives chunks of audio (e.g. 0.04 seconds),
+it runs VAD and continuously detects whether there is speech or not.
+When it detects end of speech (non-voice for 500ms), it makes OnlineASRProcessor to end the utterance immediately.
+'''
 
-
-def create_tokenizer(lan):
-    """returns an object that has split function that works like the one of MosesTokenizer"""
-
-    assert lan in WHISPER_LANG_CODES, "language must be Whisper's supported lang code: " + " ".join(WHISPER_LANG_CODES)
-
-    if lan == "uk":
-        import tokenize_uk
-        class UkrainianTokenizer:
-            def split(self, text):
-                return tokenize_uk.tokenize_sents(text)
-
-        return UkrainianTokenizer()
-
-    # supported by fast-mosestokenizer
-    # if lan in "as bn ca cs de el en es et fi fr ga gu hi hu is it kn lt lv ml mni mr nl or pa pl pt ro ru sk sl sv ta te yue zh".split():
-    #     #from mosestokenizer import MosesTokenizer
-    #     #return MosesTokenizer(lan)
-
-    # the following languages are in Whisper, but not in wtpsplit:
-    if lan in "as ba bo br bs fo haw hr ht jw lb ln lo mi nn oc sa sd sn so su sw tk tl tt".split():
-        logger.debug(f"{lan} code is not supported by wtpsplit. Going to use None lang_code option.")
-        lan = None
 
-    #from wtpsplit import WtP
-    # downloads the model from huggingface on the first use
-    #wtp = WtP("wtp-canine-s-12l-no-adapters")
-
-    # #return wtp.split(sent, lang_code=lan)
-    #
-    # return WtPtok()
+WHISPER_LANG_CODES = "af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,zh".split(
+    ",")
 
 
 def add_shared_args(parser):

@@ -704,24 +511,9 @@ def asr_factory(args, logfile=sys.stderr):
     Creates and configures an ASR and ASR Online instance based on the specified backend and arguments.
     """
     backend = args.backend
-    if backend == "openai-api":
-    else:
-        if backend == "faster-whisper":
-            logger.debug("Using FasterWhisper.")
-            print("using faster-whisper from whisper-online")
-            asr_cls = FasterWhisperASR
-        #else:
-            #asr_cls = WhisperTimestampedASR
-
-        # Only for FasterWhisperASR and WhisperTimestampedASR
-        size = args.model
-        t = time.time()
-        logger.info(f"Loading Whisper {size} model for {args.lan}...")
-        asr = asr_cls(modelsize=size, lan=args.lan, cache_dir=args.model_cache_dir, model_dir=args.model_dir)
-        e = time.time()
-        logger.info(f"done. It took {round(e - t, 2)} seconds.")
+    # if backend == "openai-api":
+    logger.debug("Using ivrit-ai.")
+    asr = IvritOnRunPodASR(lan=args.lan, api_key=RUN_POD_API_KEY, endpoint_id=RUNPOD_ENDPOINT_ID)
 
     # Apply common configurations
     if getattr(args, 'vad', False):  # Checks if VAD argument is present and True

@@ -735,11 +527,11 @@ def asr_factory(args, logfile=sys.stderr):
     else:
         tgt_language = language  # Whisper transcribes in this language
 
-    # Create the tokenizer
-    if args.buffer_trimming == "sentence":
-    else:
+    # # Create the tokenizer
+    # if args.buffer_trimming == "sentence":
+    #     tokenizer = create_tokenizer(tgt_language)
+    # else:
+    tokenizer = None
 
     # Create the OnlineASRProcessor
     if args.vac:

@@ -892,4 +684,4 @@ if __name__ == "__main__":
         now = None
 
     o = online.finish()
-    output_transcript(o, now=now)
+    output_transcript(o, now=now)