WajeehAzeemX committed
Commit 2debb03 · 1 Parent(s): 57a0bba
Files changed (3)
  1. Dockerfile +13 -0
  2. app.py +68 -0
  3. requirements.txt +10 -0
Dockerfile ADDED
@@ -0,0 +1,13 @@
+ FROM python:3.10
+
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV PATH="/home/user/.local/bin:$PATH"
+
+ WORKDIR /app
+
+ COPY --chown=user ./requirements.txt requirements.txt
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+ COPY --chown=user . /app
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,68 @@
+ from fastapi import FastAPI, Request, HTTPException
+ import torch
+ from transformers import AutoProcessor, pipeline
+ import io
+ from pydub import AudioSegment
+ from optimum.onnxruntime import ORTModelForSpeechSeq2Seq
+ import numpy as np
+
+ app = FastAPI()
+
+ # Device configuration (informational; ONNX Runtime selects its own execution provider)
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ print(device)
+
+ # Load the ONNX model and processor. PyTorch-only options such as
+ # torch_dtype or low_cpu_mem_usage do not apply to ONNX Runtime models.
+ model_id = "WajeehAzeemX/whisper-small-ar2_onnx"
+ model = ORTModelForSpeechSeq2Seq.from_pretrained(model_id)
+ processor = AutoProcessor.from_pretrained(model_id)
+
+ pipe = pipeline(
+     "automatic-speech-recognition",
+     model=model,
+     tokenizer=processor.tokenizer,
+     feature_extractor=processor.feature_extractor,
+ )
+
+ @app.post("/transcribe/")
+ async def transcribe_audio(request: Request):
+     try:
+         # Read the raw audio bytes from the request body
+         audio_data = await request.body()
+
+         # Wrap the bytes in a file-like object for pydub
+         audio_file = io.BytesIO(audio_data)
+
+         # Decode the WAV data
+         try:
+             audio_segment = AudioSegment.from_file(audio_file, format="wav")
+         except Exception as e:
+             raise HTTPException(status_code=400, detail=f"Error loading audio file: {str(e)}")
+
+         # Downmix to mono if the audio is multi-channel
+         if audio_segment.channels > 1:
+             audio_segment = audio_segment.set_channels(1)
+
+         # Resample to the 16 kHz rate Whisper expects
+         target_sample_rate = 16000
+         if audio_segment.frame_rate != target_sample_rate:
+             audio_segment = audio_segment.set_frame_rate(target_sample_rate)
+
+         # Convert to a float32 numpy array scaled to [-1.0, 1.0]
+         audio_array = np.array(audio_segment.get_array_of_samples())
+         if audio_segment.sample_width == 2:  # 16-bit PCM
+             audio_array = audio_array.astype(np.float32) / 32768.0
+         else:
+             raise HTTPException(status_code=400, detail="Unsupported sample width")
+
+         # Run the ASR pipeline, passing the sampling rate explicitly
+         result = pipe({"raw": audio_array, "sampling_rate": target_sample_rate})
+         transcription = result["text"]
+
+         return {"transcription": transcription}
+     except HTTPException:
+         # Re-raise deliberate client errors instead of masking them as 500s
+         raise
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
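Since /transcribe/ reads raw bytes from the request body rather than a multipart upload, a client simply POSTs the WAV file contents directly. A minimal sketch, assuming the requests package is installed and a hypothetical sample.wav next to the script:

    import requests

    # Read the WAV file as raw bytes; the endpoint expects them in the
    # request body, not as a multipart form field.
    with open("sample.wav", "rb") as f:
        audio_bytes = f.read()

    resp = requests.post(
        "http://localhost:7860/transcribe/",  # adjust host/port for your deployment
        data=audio_bytes,
        headers={"Content-Type": "audio/wav"},
    )
    resp.raise_for_status()
    print(resp.json()["transcription"])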
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ fastapi
+ uvicorn
+ torch
+ torchaudio
+ transformers
+ datasets[audio]
+ accelerate
+ pydub
+ numpy
+ optimum[onnxruntime]  # app.py imports ORTModelForSpeechSeq2Seq from optimum.onnxruntime