Create main.py
Browse files
main.py
ADDED
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# main.py
|
2 |
+
|
3 |
+
from fastapi import FastAPI, UploadFile, File
|
4 |
+
from fastapi.responses import JSONResponse
|
5 |
+
from transformers import pipeline
|
6 |
+
import traceback
|
7 |
+
import re
|
8 |
+
import uvicorn
|
9 |
+
|
10 |
+
# ASGI application object; the /transcribe route in this file is registered on it.
app = FastAPI(title="Tacab ASR Somali API")
|
11 |
+
|
12 |
+
# Load ASR model
|
13 |
+
# Build the speech-recognition pipeline once at import time so that every
# request reuses the same loaded model.
asr = pipeline(
    task="automatic-speech-recognition",
    model="tacab/ASR_SOMALI",
    tokenizer="tacab/ASR_SOMALI",
    # Audio is processed in 30 s windows with a 6 s stride between them.
    chunk_length_s=30,
    stride_length_s=6,
    # Ask the pipeline for word-level timestamps alongside the text.
    return_timestamps="word",
    # -1 selects the CPU.
    device=-1,
)
|
22 |
+
|
23 |
+
# Auto punctuation
|
24 |
+
def auto_punctuate(text, words_per_sentence=10):
    """Add naive sentence punctuation and capitalization to raw ASR text.

    The text is cut into chunks of at most ``words_per_sentence`` words.
    Each chunk is terminated with a period unless it already ends in
    ``.``, ``?`` or ``!``, and every sentence (including ones delimited by
    punctuation already present in the input) is capitalized.

    Args:
        text: Raw transcription text.
        words_per_sentence: Maximum number of words per generated
            sentence. Must be at least 1.

    Returns:
        The punctuated text, or an empty string when ``text`` contains no
        words.

    Raises:
        ValueError: If ``words_per_sentence`` is smaller than 1.
    """
    if words_per_sentence < 1:
        raise ValueError("words_per_sentence must be at least 1")

    words = text.split()
    if not words:
        return ""

    sentences = []
    for start in range(0, len(words), words_per_sentence):
        chunk = " ".join(words[start:start + words_per_sentence])
        # Terminate each chunk exactly once; appending unconditionally is
        # what used to produce ".." in the output.
        if not chunk.endswith((".", "?", "!")):
            chunk += "."
        # A chunk may itself hold several sentences when the input already
        # had punctuation; capitalize each one separately.
        # NOTE: str.capitalize() also lower-cases the rest of the sentence,
        # which matches the previous behavior of this function.
        sentences.extend(
            part.capitalize() for part in re.split(r"(?<=[.?!])\s+", chunk)
        )

    return " ".join(sentences)
|
41 |
+
|
42 |
+
@app.post("/transcribe")
async def transcribe(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file and return punctuated text.

    The upload is written to a temporary file, passed to the ``asr``
    pipeline, and the resulting text is run through ``auto_punctuate``.

    Args:
        file: The uploaded audio file.

    Returns:
        ``{"transcription": <text>}`` on success; a JSON error body with
        status 400 when the pipeline yields no text, or status 500 when any
        exception is raised while handling the request.
    """
    import os
    import tempfile

    temp_path = None
    try:
        # Never build the path from the client-supplied filename: it may be
        # missing or contain "../" components. Only its extension is kept,
        # in case the audio decoder relies on it.
        suffix = os.path.splitext(os.path.basename(file.filename or ""))[1]
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            temp_path = tmp.name
            tmp.write(await file.read())

        # Transcribe.
        # NOTE(review): this call is synchronous and blocks the event loop
        # for the duration of the inference; consider a worker thread.
        result = asr(temp_path)
        raw_text = result.get("text", "").strip()
        if not raw_text:
            return JSONResponse({"error": "No transcription result."}, status_code=400)

        # Punctuate
        cleaned_text = auto_punctuate(raw_text)

        return {"transcription": cleaned_text}

    except Exception as e:
        traceback.print_exc()
        return JSONResponse({"error": str(e)}, status_code=500)

    finally:
        # Always remove the temporary file so uploads do not accumulate.
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                # Best-effort cleanup: a failed delete must not mask the
                # response that is already being returned.
                pass
|
64 |
+
|
65 |
+
# Start the server only when this file is executed directly, not on import.
if __name__ == "__main__":
    # Listen on all interfaces, port 7860.
    uvicorn.run(app=app, host="0.0.0.0", port=7860)
|