stts

Running on TPU v5e

App Files Files Community

Afrinetwork7 committed on Aug 23

Commit

2300584

•

1 Parent(s): fbec879

Update app.py

Browse files

Files changed (1) hide show

app.py +14 -115

app.py CHANGED Viewed

@@ -1,19 +1,16 @@
 import logging
 import math
-import os
-import tempfile
 import time
 from typing import Dict, Any
 from functools import wraps
-import yt_dlp as youtube_dl
-from fastapi import FastAPI, File, UploadFile, Depends, HTTPException
-from fastapi.responses import HTMLResponse
 from fastapi.encoders import jsonable_encoder
 from pydantic import BaseModel
 import jax.numpy as jnp
 import numpy as np
-from transformers.models.whisper.tokenization_whisper import TO_LANGUAGE_CODE
 from transformers.pipelines.audio_utils import ffmpeg_read
 from whisper_jax import FlaxWhisperPipline
@@ -33,7 +30,6 @@ BATCH_SIZE = 32
 CHUNK_LENGTH_S = 30
 NUM_PROC = 32
 FILE_LIMIT_MB = 10000
-YT_LENGTH_LIMIT_S = 15000  # limit to 2 hour YouTube files
 pipeline = FlaxWhisperPipline(checkpoint, dtype=jnp.bfloat16, batch_size=BATCH_SIZE)
 stride_length_s = CHUNK_LENGTH_S / 6
@@ -54,11 +50,7 @@ compile_time = time.time() - start
 logger.debug(f"Compiled in {compile_time}s")
 class TranscribeAudioRequest(BaseModel):
-    task: str = "transcribe"
-    return_timestamps: bool = False
-class TranscribeYouTubeRequest(BaseModel):
-    yt_url: str
     task: str = "transcribe"
     return_timestamps: bool = False
@@ -79,41 +71,33 @@ def timeit(func):
 @app.post("/transcribe_audio")
 @timeit
 async def transcribe_chunked_audio(
-    audio_file: UploadFile = File(...),
-    request: TranscribeAudioRequest = Depends()
 ) -> Dict[str, Any]:
     logger.debug("Starting transcribe_chunked_audio function")
     logger.debug(f"Received parameters - task: {request.task}, return_timestamps: {request.return_timestamps}")
-    logger.debug("Checking for audio file...")
-    if not audio_file:
-        logger.warning("No audio file")
-        raise HTTPException(status_code=400, detail="No audio file submitted!")
-    logger.debug(f"Audio file received: {audio_file.filename}")
     try:
-        # Read the file content
-        file_content = await audio_file.read()
-        file_size = len(file_content)
         file_size_mb = file_size / (1024 * 1024)
-        logger.debug(f"File size: {file_size} bytes ({file_size_mb:.2f}MB)")
     except Exception as e:
-        logger.error(f"Error reading file: {str(e)}", exc_info=True)
-        raise HTTPException(status_code=500, detail=f"Error reading file: {str(e)}")
     if file_size_mb > FILE_LIMIT_MB:
         logger.warning(f"Max file size exceeded: {file_size_mb:.2f}MB > {FILE_LIMIT_MB}MB")
         raise HTTPException(status_code=400, detail=f"File size exceeds file size limit. Got file of size {file_size_mb:.2f}MB for a limit of {FILE_LIMIT_MB}MB.")
     try:
-        logger.debug("Performing ffmpeg read on audio file")
-        inputs = ffmpeg_read(file_content, pipeline.feature_extractor.sampling_rate)
         inputs = {"array": inputs, "sampling_rate": pipeline.feature_extractor.sampling_rate}
         logger.debug("ffmpeg read completed successfully")
     except Exception as e:
         logger.error(f"Error in ffmpeg read: {str(e)}", exc_info=True)
-        raise HTTPException(status_code=500, detail=f"Error processing audio file: {str(e)}")
     logger.debug("Calling tqdm_generate to transcribe audio")
     try:
@@ -130,51 +114,6 @@ async def transcribe_chunked_audio(
         "timing_info": timing_info
     })
-@app.post("/transcribe_youtube")
-@timeit
-async def transcribe_youtube(request: TranscribeYouTubeRequest) -> Dict[str, Any]:
-    logger.debug("Loading YouTube file...")
-    try:
-        html_embed_str = _return_yt_html_embed(request.yt_url)
-    except Exception as e:
-        logger.error("Error generating YouTube HTML embed:", exc_info=True)
-        raise HTTPException(status_code=500, detail="Error generating YouTube HTML embed")
-    with tempfile.TemporaryDirectory() as tmpdirname:
-        filepath = os.path.join(tmpdirname, "video.mp4")
-        try:
-            logger.debug("Downloading YouTube audio...")
-            download_yt_audio(request.yt_url, filepath)
-        except Exception as e:
-            logger.error("Error downloading YouTube audio:", exc_info=True)
-            raise HTTPException(status_code=500, detail="Error downloading YouTube audio")
-        try:
-            logger.debug(f"Opening downloaded audio file: {filepath}")
-            with open(filepath, "rb") as f:
-                inputs = f.read()
-        except Exception as e:
-            logger.error("Error reading downloaded audio file:", exc_info=True)
-            raise HTTPException(status_code=500, detail="Error reading downloaded audio file")
-    inputs = ffmpeg_read(inputs, pipeline.feature_extractor.sampling_rate)
-    inputs = {"array": inputs, "sampling_rate": pipeline.feature_extractor.sampling_rate}
-    logger.debug("Done loading YouTube file")
-    try:
-        logger.debug("Calling tqdm_generate to transcribe YouTube audio")
-        text, runtime, timing_info = tqdm_generate(inputs, task=request.task, return_timestamps=request.return_timestamps)
-    except Exception as e:
-        logger.error("Error transcribing YouTube audio:", exc_info=True)
-        raise HTTPException(status_code=500, detail="Error transcribing YouTube audio")
-    return jsonable_encoder({
-        "html_embed": html_embed_str,
-        "text": text,
-        "runtime": runtime,
-        "timing_info": timing_info
-    })
 def tqdm_generate(inputs: dict, task: str, return_timestamps: bool):
     start_time = time.time()
     logger.debug(f"Starting tqdm_generate - task: {task}, return_timestamps: {return_timestamps}")
@@ -236,46 +175,6 @@ def tqdm_generate(inputs: dict, task: str, return_timestamps: bool):
         "total_processing_time": total_processing_time
     }
-def _return_yt_html_embed(yt_url):
-    video_id = yt_url.split("?v=")[-1]
-    HTML_str = (
-        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
-        " </center>"
-    )
-    return HTML_str
-def download_yt_audio(yt_url, filename):
-    info_loader = youtube_dl.YoutubeDL()
-    try:
-        logger.debug(f"Extracting info for YouTube URL: {yt_url}")
-        info = info_loader.extract_info(yt_url, download=False)
-    except youtube_dl.utils.DownloadError as err:
-        logger.error("Error extracting YouTube info:", exc_info=True)
-        raise HTTPException(status_code=400, detail=str(err))
-    file_length = info["duration_string"]
-    file_h_m_s = file_length.split(":")
-    file_h_m_s = [int(sub_length) for sub_length in file_h_m_s]
-    if len(file_h_m_s) == 1:
-        file_h_m_s.insert(0, 0)
-    if len(file_h_m_s) == 2:
-        file_h_m_s.insert(0, 0)
-    file_length_s = file_h_m_s[0] * 3600 + file_h_m_s[1] * 60 + file_h_m_s[2]
-    if file_length_s > YT_LENGTH_LIMIT_S:
-        yt_length_limit_hms = time.strftime("%HH:%MM:%SS", time.gmtime(YT_LENGTH_LIMIT_S))
-        file_length_hms = time.strftime("%HH:%MM:%SS", time.gmtime(file_length_s))
-        raise HTTPException(status_code=400, detail=f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video.")
-    ydl_opts = {"outtmpl": filename, "format": "worstvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"}
-    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
-        try:
-            logger.debug(f"Downloading YouTube audio to {filename}")
-            ydl.download([yt_url])
-        except youtube_dl.utils.ExtractorError as err:
-            logger.error("Error downloading YouTube audio:", exc_info=True)
-            raise HTTPException(status_code=400, detail=str(err))
 def format_timestamp(seconds: float, always_include_hours: bool = False, decimal_marker: str = "."):
     if seconds is not None:
         milliseconds = round(seconds * 1000.0)

 import logging
 import math
 import time
+import base64
+import io
 from typing import Dict, Any
 from functools import wraps
+from fastapi import FastAPI, Depends, HTTPException
 from fastapi.encoders import jsonable_encoder
 from pydantic import BaseModel
 import jax.numpy as jnp
 import numpy as np
 from transformers.pipelines.audio_utils import ffmpeg_read
 from whisper_jax import FlaxWhisperPipline
 CHUNK_LENGTH_S = 30
 NUM_PROC = 32
 FILE_LIMIT_MB = 10000
 pipeline = FlaxWhisperPipline(checkpoint, dtype=jnp.bfloat16, batch_size=BATCH_SIZE)
 stride_length_s = CHUNK_LENGTH_S / 6
 logger.debug(f"Compiled in {compile_time}s")
 class TranscribeAudioRequest(BaseModel):
+    audio_base64: str
     task: str = "transcribe"
     return_timestamps: bool = False
 @app.post("/transcribe_audio")
 @timeit
 async def transcribe_chunked_audio(
+    request: TranscribeAudioRequest
 ) -> Dict[str, Any]:
     logger.debug("Starting transcribe_chunked_audio function")
     logger.debug(f"Received parameters - task: {request.task}, return_timestamps: {request.return_timestamps}")
     try:
+        # Decode base64 audio data
+        audio_data = base64.b64decode(request.audio_base64)
+        file_size = len(audio_data)
         file_size_mb = file_size / (1024 * 1024)
+        logger.debug(f"Decoded audio data size: {file_size} bytes ({file_size_mb:.2f}MB)")
     except Exception as e:
+        logger.error(f"Error decoding base64 audio data: {str(e)}", exc_info=True)
+        raise HTTPException(status_code=400, detail=f"Error decoding base64 audio data: {str(e)}")
     if file_size_mb > FILE_LIMIT_MB:
         logger.warning(f"Max file size exceeded: {file_size_mb:.2f}MB > {FILE_LIMIT_MB}MB")
         raise HTTPException(status_code=400, detail=f"File size exceeds file size limit. Got file of size {file_size_mb:.2f}MB for a limit of {FILE_LIMIT_MB}MB.")
     try:
+        logger.debug("Performing ffmpeg read on audio data")
+        inputs = ffmpeg_read(audio_data, pipeline.feature_extractor.sampling_rate)
         inputs = {"array": inputs, "sampling_rate": pipeline.feature_extractor.sampling_rate}
         logger.debug("ffmpeg read completed successfully")
     except Exception as e:
         logger.error(f"Error in ffmpeg read: {str(e)}", exc_info=True)
+        raise HTTPException(status_code=500, detail=f"Error processing audio data: {str(e)}")
     logger.debug("Calling tqdm_generate to transcribe audio")
     try:
         "timing_info": timing_info
     })
 def tqdm_generate(inputs: dict, task: str, return_timestamps: bool):
     start_time = time.time()
     logger.debug(f"Starting tqdm_generate - task: {task}, return_timestamps: {return_timestamps}")
         "total_processing_time": total_processing_time
     }
 def format_timestamp(seconds: float, always_include_hours: bool = False, decimal_marker: str = "."):
     if seconds is not None:
         milliseconds = round(seconds * 1000.0)