# Page residue from the hosting site's file view (not part of the program):
# rayhane123's picture | Upload 10 files | 6c95da8 verified | raw | history blame | 5.07 kB
from fastapi import FastAPI, File, UploadFile, HTTPException, Form
from fastapi.responses import HTMLResponse
from fastapi.staticfiles import StaticFiles
from transformers import pipeline
import textwrap
import fitz # PyMuPDF for PDF handling
from docx import Document
import openpyxl # For Excel
from pptx import Presentation
from functools import lru_cache
import os
# Initialize FastAPI app
app = FastAPI()

# Directory that holds index.html and the other static assets.
# The default is the original developer-machine path; set the STATIC_DIR
# environment variable to point the service elsewhere without editing code.
STATIC_DIR = os.environ.get(
    "STATIC_DIR",
    r"C:\Users\User\doc_translation_service\translation\static",
)

# Ensure the static directory exists. exist_ok=True replaces the separate
# "exists?" check, which could race with another process creating the folder.
os.makedirs(STATIC_DIR, exist_ok=True)

# Mount static files (serves index.html)
app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
@app.get("/", response_class=HTMLResponse)
async def read_root():
    """Serve the landing page.

    Reads ``index.html`` from ``STATIC_DIR`` (decoded as UTF-8) and returns
    it as an HTML response.

    Raises:
        HTTPException: 404 when ``index.html`` is not present in the static
            folder.
    """
    landing_page = os.path.join(STATIC_DIR, "index.html")
    try:
        handle = open(landing_page, "r", encoding="utf-8")
    except FileNotFoundError:
        raise HTTPException(status_code=404, detail="index.html not found in static folder.")
    with handle:
        markup = handle.read()
    return HTMLResponse(content=markup)
# Maps the language names accepted by the upload form to the two-letter
# (ISO 639-1) codes used to select a translation model.
LANGUAGE_CODES = dict(
    zip(
        ("Anglais", "Francais", "Arabe", "Espagnol"),
        ("en", "fr", "ar", "es"),
    )
)
# Translation models that can be loaded, keyed by "<source>-<target>"
# language codes; each value is the model identifier handed to `pipeline`.
AVAILABLE_MODELS = dict(
    (
        ("fr-en", "Helsinki-NLP/opus-mt-fr-en"),
        ("en-fr", "Helsinki-NLP/opus-mt-en-fr"),
        ("ar-en", "Helsinki-NLP/opus-mt-ar-en"),
        ("en-ar", "Helsinki-NLP/opus-mt-en-ar"),
        ("es-en", "Helsinki-NLP/opus-mt-es-en"),
        ("en-es", "Helsinki-NLP/opus-mt-en-es"),
    )
)
# Cache model loading
@lru_cache(maxsize=10)
def _load_pipeline(model_key: str):
    """Build the translation pipeline registered under *model_key*.

    Cached so that a model used both as a direct translator and as one leg
    of a pivot translation is only loaded once.
    """
    return pipeline("translation", model=AVAILABLE_MODELS[model_key])


@lru_cache(maxsize=10)
def load_translator(src_code: str, tgt_code: str):
    """Return the translator(s) needed to go from *src_code* to *tgt_code*.

    Args:
        src_code: Source language code (e.g. ``"fr"``).
        tgt_code: Target language code (e.g. ``"en"``).

    Returns:
        A single pipeline when ``"<src>-<tgt>"`` is listed in
        ``AVAILABLE_MODELS``; otherwise a 2-tuple
        ``(src -> en pipeline, en -> tgt pipeline)`` for pivoting through
        English.

    Raises:
        ValueError: If there is neither a direct model nor a complete pair
            of pivot models for the requested direction.
    """
    direct_key = f"{src_code}-{tgt_code}"
    if direct_key in AVAILABLE_MODELS:
        return _load_pipeline(direct_key)

    # Pivot through English. Both legs must be registered: looking them up
    # with .get() would hand model=None to pipeline() instead of reporting
    # the unsupported pair. This check also covers the case where either
    # side already is English, since no "en-en" model is registered.
    to_english_key = f"{src_code}-en"
    from_english_key = f"en-{tgt_code}"
    if to_english_key not in AVAILABLE_MODELS or from_english_key not in AVAILABLE_MODELS:
        raise ValueError(f"No model available for {src_code} -> {tgt_code}")

    return (_load_pipeline(to_english_key), _load_pipeline(from_english_key))
# Split text into chunks
def chunk_text(text, max_length=400):
return textwrap.wrap(text, max_length)
# Extract text based on file type
def extract_text(file: UploadFile):
    """Return the plain text contained in an uploaded document.

    The format is chosen from the file name's extension (case-insensitive):
    ``.txt`` (decoded as UTF-8), ``.pdf``, ``.docx``, ``.xlsx`` (one line per
    row, cells separated by tabs) and ``.pptx`` (one line per shape that
    exposes text).

    Args:
        file: The uploaded file; its ``filename`` selects the parser and its
            ``file`` stream supplies the content.

    Returns:
        The extracted text as a single string.

    Raises:
        HTTPException: 400 when the extension is not supported (or the
            upload has no file name); 500 when parsing the file fails.
    """
    # Lower-case so ".PDF" / ".Docx" are accepted; a missing name falls
    # through to the "not supported" error instead of an AttributeError.
    filename = (file.filename or "").lower()
    try:
        if filename.endswith(".txt"):
            return file.file.read().decode("utf-8")

        if filename.endswith(".pdf"):
            # Context manager releases the PDF document once text is read.
            with fitz.open(stream=file.file.read(), filetype="pdf") as doc:
                return "\n".join(page.get_text() for page in doc)

        if filename.endswith(".docx"):
            doc = Document(file.file)
            return "\n".join(para.text for para in doc.paragraphs)

        if filename.endswith(".xlsx"):
            wb = openpyxl.load_workbook(file.file)
            lines = []
            for sheet in wb.sheetnames:
                for row in wb[sheet].iter_rows():
                    # Only empty cells (None) become ""; `value or ""` would
                    # also have erased legitimate 0 / False values.
                    lines.append(
                        "\t".join("" if cell.value is None else str(cell.value) for cell in row)
                    )
            return "".join(f"{line}\n" for line in lines)

        if filename.endswith(".pptx"):
            prs = Presentation(file.file)
            shape_texts = [
                shape.text
                for slide in prs.slides
                for shape in slide.shapes
                if hasattr(shape, "text")
            ]
            return "".join(f"{shape_text}\n" for shape_text in shape_texts)

        raise HTTPException(status_code=400, detail="File type not supported.")
    except HTTPException:
        # Let the deliberate 400 above through; the generic handler below
        # would otherwise re-wrap it as a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error extracting text: {str(e)}") from e
@app.post("/upload/")
async def upload_file(
    file: UploadFile = File(...),
    src_lang: str = Form(...),
    tgt_lang: str = Form(...)
):
    """Translate the text of an uploaded document.

    Args:
        file: Document to translate (see ``extract_text`` for the formats).
        src_lang: Source language name; must be a key of ``LANGUAGE_CODES``.
        tgt_lang: Target language name; must be a key of ``LANGUAGE_CODES``.

    Returns:
        ``{"translated_text": <str>}`` where the translated chunks are
        joined with newlines.

    Raises:
        HTTPException: 400 for an unknown language, a language pair with no
            available model, or a file that yields no text; 500 when model
            loading or translation fails.
    """
    # Validate the cheap form fields first so a doomed request does not pay
    # for parsing the uploaded document.
    src_code = LANGUAGE_CODES.get(src_lang)
    tgt_code = LANGUAGE_CODES.get(tgt_lang)
    if not src_code or not tgt_code:
        raise HTTPException(status_code=400, detail=f"Unsupported language: {src_lang} -> {tgt_lang}")

    text = extract_text(file)
    if not text.strip():
        raise HTTPException(status_code=400, detail="No text extracted from the file.")

    # Load translation model
    try:
        translator = load_translator(src_code, tgt_code)
    except ValueError as e:
        # No model for this direction is a client-side problem, not a crash.
        raise HTTPException(status_code=400, detail=str(e)) from e
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Internal error: {str(e)}") from e

    # A tuple means indirect translation via English: run the stages in
    # order, re-chunking the intermediate text before the second stage.
    stages = translator if isinstance(translator, tuple) else (translator,)

    # NOTE(review): the pipeline calls below are blocking and run inside an
    # async handler, so other requests wait while a translation is running.
    # Moving this to a worker thread needs a check that the pipelines are
    # safe to call concurrently — confirm before changing.
    try:
        translated_text = text
        for stage in stages:
            translated_text = "\n".join(
                stage(chunk)[0]['translation_text'] for chunk in chunk_text(translated_text)
            )
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Internal error: {str(e)}") from e

    return {"translated_text": translated_text}