|
from fastapi import FastAPI, File, UploadFile, HTTPException, Form
|
|
from fastapi.responses import HTMLResponse
|
|
from fastapi.staticfiles import StaticFiles
|
|
from transformers import pipeline
|
|
import textwrap
|
|
import fitz
|
|
from docx import Document
|
|
import openpyxl
|
|
from pptx import Presentation
|
|
from functools import lru_cache
|
|
import os
|
|
|
|
|
|
app = FastAPI()
|
|
|
|
|
|
STATIC_DIR = r"C:\Users\User\doc_translation_service\translation\static"
|
|
|
|
|
|
if not os.path.exists(STATIC_DIR):
|
|
os.makedirs(STATIC_DIR)
|
|
|
|
|
|
app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
|
|
|
|
@app.get("/", response_class=HTMLResponse)
|
|
async def read_root():
|
|
index_path = os.path.join(STATIC_DIR, "index.html")
|
|
try:
|
|
with open(index_path, "r", encoding="utf-8") as file:
|
|
return HTMLResponse(content=file.read())
|
|
except FileNotFoundError:
|
|
raise HTTPException(status_code=404, detail="index.html not found in static folder.")
|
|
|
|
|
|
LANGUAGE_CODES = {
|
|
"Anglais": "en",
|
|
"Francais": "fr",
|
|
"Arabe": "ar",
|
|
"Espagnol": "es",
|
|
}
|
|
|
|
|
|
AVAILABLE_MODELS = {
|
|
"fr-en": "Helsinki-NLP/opus-mt-fr-en",
|
|
"en-fr": "Helsinki-NLP/opus-mt-en-fr",
|
|
"ar-en": "Helsinki-NLP/opus-mt-ar-en",
|
|
"en-ar": "Helsinki-NLP/opus-mt-en-ar",
|
|
"es-en": "Helsinki-NLP/opus-mt-es-en",
|
|
"en-es": "Helsinki-NLP/opus-mt-en-es",
|
|
}
|
|
|
|
|
|
@lru_cache(maxsize=10)
|
|
def load_translator(src_code: str, tgt_code: str):
|
|
model_key = f"{src_code}-{tgt_code}"
|
|
|
|
if model_key in AVAILABLE_MODELS:
|
|
return pipeline("translation", model=AVAILABLE_MODELS[model_key])
|
|
|
|
elif src_code != "en" and tgt_code != "en":
|
|
return (
|
|
pipeline("translation", model=AVAILABLE_MODELS.get(f"{src_code}-en")),
|
|
pipeline("translation", model=AVAILABLE_MODELS.get(f"en-{tgt_code}"))
|
|
)
|
|
|
|
else:
|
|
raise ValueError(f"No model available for {src_code} -> {tgt_code}")
|
|
|
|
|
|
def chunk_text(text, max_length=400):
|
|
return textwrap.wrap(text, max_length)
|
|
|
|
|
|
def extract_text(file: UploadFile):
|
|
try:
|
|
if file.filename.endswith(".txt"):
|
|
return file.file.read().decode("utf-8")
|
|
|
|
elif file.filename.endswith(".pdf"):
|
|
doc = fitz.open(stream=file.file.read(), filetype="pdf")
|
|
return "\n".join([page.get_text() for page in doc])
|
|
|
|
elif file.filename.endswith(".docx"):
|
|
doc = Document(file.file)
|
|
return "\n".join([para.text for para in doc.paragraphs])
|
|
|
|
elif file.filename.endswith(".xlsx"):
|
|
wb = openpyxl.load_workbook(file.file)
|
|
text = ""
|
|
for sheet in wb.sheetnames:
|
|
ws = wb[sheet]
|
|
for row in ws.iter_rows():
|
|
text += "\t".join([str(cell.value or "") for cell in row]) + "\n"
|
|
return text
|
|
|
|
elif file.filename.endswith(".pptx"):
|
|
prs = Presentation(file.file)
|
|
text = ""
|
|
for slide in prs.slides:
|
|
for shape in slide.shapes:
|
|
if hasattr(shape, "text"):
|
|
text += shape.text + "\n"
|
|
return text
|
|
|
|
else:
|
|
raise HTTPException(status_code=400, detail="File type not supported.")
|
|
|
|
except Exception as e:
|
|
raise HTTPException(status_code=500, detail=f"Error extracting text: {str(e)}")
|
|
|
|
@app.post("/upload/")
|
|
async def upload_file(
|
|
file: UploadFile = File(...),
|
|
src_lang: str = Form(...),
|
|
tgt_lang: str = Form(...)
|
|
):
|
|
text = extract_text(file)
|
|
|
|
if not text.strip():
|
|
raise HTTPException(status_code=400, detail="No text extracted from the file.")
|
|
|
|
src_code = LANGUAGE_CODES.get(src_lang)
|
|
tgt_code = LANGUAGE_CODES.get(tgt_lang)
|
|
|
|
if not src_code or not tgt_code:
|
|
raise HTTPException(status_code=400, detail=f"Unsupported language: {src_lang} -> {tgt_lang}")
|
|
|
|
try:
|
|
|
|
translator = load_translator(src_code, tgt_code)
|
|
|
|
|
|
if isinstance(translator, tuple):
|
|
translator1, translator2 = translator
|
|
intermediate_text = "\n".join([translator1(chunk)[0]['translation_text'] for chunk in chunk_text(text)])
|
|
translated_text = "\n".join([translator2(chunk)[0]['translation_text'] for chunk in chunk_text(intermediate_text)])
|
|
|
|
else:
|
|
translated_text = "\n".join([translator(chunk)[0]['translation_text'] for chunk in chunk_text(text)])
|
|
|
|
return {"translated_text": translated_text}
|
|
|
|
except Exception as e:
|
|
raise HTTPException(status_code=500, detail=f"Internal error: {str(e)}")
|
|
|