Spaces:

Projetweb
/

Ai-prject

Runtime error

File size: 5,070 Bytes

6c95da8

from fastapi import FastAPI, File, UploadFile, HTTPException, Form
from fastapi.responses import HTMLResponse
from fastapi.staticfiles import StaticFiles
from transformers import pipeline
import textwrap
import fitz  # PyMuPDF for PDF handling
from docx import Document
import openpyxl  # For Excel
from pptx import Presentation
from functools import lru_cache
import os

# Initialize FastAPI app
app = FastAPI()

# Set the correct path for static files
STATIC_DIR = r"C:\Users\User\doc_translation_service\translation\static"

# Ensure the static directory exists
if not os.path.exists(STATIC_DIR):
    os.makedirs(STATIC_DIR)

# Mount static files (serves index.html)
app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")

@app.get("/", response_class=HTMLResponse)
async def read_root():
    index_path = os.path.join(STATIC_DIR, "index.html")
    try:
        with open(index_path, "r", encoding="utf-8") as file:
            return HTMLResponse(content=file.read())
    except FileNotFoundError:
        raise HTTPException(status_code=404, detail="index.html not found in static folder.")

# Language codes (ISO 639-1)
LANGUAGE_CODES = {
    "Anglais": "en",
    "Francais": "fr",
    "Arabe": "ar",
    "Espagnol": "es",
}

# Available translation models
AVAILABLE_MODELS = {
    "fr-en": "Helsinki-NLP/opus-mt-fr-en",
    "en-fr": "Helsinki-NLP/opus-mt-en-fr",
    "ar-en": "Helsinki-NLP/opus-mt-ar-en",
    "en-ar": "Helsinki-NLP/opus-mt-en-ar",
    "es-en": "Helsinki-NLP/opus-mt-es-en",
    "en-es": "Helsinki-NLP/opus-mt-en-es",
}

# Cache model loading
@lru_cache(maxsize=10)
def load_translator(src_code: str, tgt_code: str):
    model_key = f"{src_code}-{tgt_code}"
    
    if model_key in AVAILABLE_MODELS:
        return pipeline("translation", model=AVAILABLE_MODELS[model_key])

    elif src_code != "en" and tgt_code != "en":
        return (
            pipeline("translation", model=AVAILABLE_MODELS.get(f"{src_code}-en")),
            pipeline("translation", model=AVAILABLE_MODELS.get(f"en-{tgt_code}"))
        )

    else:
        raise ValueError(f"No model available for {src_code} -> {tgt_code}")

# Split text into chunks
def chunk_text(text, max_length=400):
    return textwrap.wrap(text, max_length)

# Extract text based on file type
def extract_text(file: UploadFile):
    try:
        if file.filename.endswith(".txt"):
            return file.file.read().decode("utf-8")

        elif file.filename.endswith(".pdf"):
            doc = fitz.open(stream=file.file.read(), filetype="pdf")
            return "\n".join([page.get_text() for page in doc])

        elif file.filename.endswith(".docx"):
            doc = Document(file.file)
            return "\n".join([para.text for para in doc.paragraphs])

        elif file.filename.endswith(".xlsx"):
            wb = openpyxl.load_workbook(file.file)
            text = ""
            for sheet in wb.sheetnames:
                ws = wb[sheet]
                for row in ws.iter_rows():
                    text += "\t".join([str(cell.value or "") for cell in row]) + "\n"
            return text

        elif file.filename.endswith(".pptx"):
            prs = Presentation(file.file)
            text = ""
            for slide in prs.slides:
                for shape in slide.shapes:
                    if hasattr(shape, "text"):
                        text += shape.text + "\n"
            return text

        else:
            raise HTTPException(status_code=400, detail="File type not supported.")

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error extracting text: {str(e)}")

@app.post("/upload/")
async def upload_file(

    file: UploadFile = File(...), 

    src_lang: str = Form(...), 

    tgt_lang: str = Form(...)

):
    text = extract_text(file)

    if not text.strip():
        raise HTTPException(status_code=400, detail="No text extracted from the file.")

    src_code = LANGUAGE_CODES.get(src_lang)
    tgt_code = LANGUAGE_CODES.get(tgt_lang)

    if not src_code or not tgt_code:
        raise HTTPException(status_code=400, detail=f"Unsupported language: {src_lang} -> {tgt_lang}")

    try:
        # Load translation model
        translator = load_translator(src_code, tgt_code)

        # If indirect translation via English
        if isinstance(translator, tuple):
            translator1, translator2 = translator
            intermediate_text = "\n".join([translator1(chunk)[0]['translation_text'] for chunk in chunk_text(text)])
            translated_text = "\n".join([translator2(chunk)[0]['translation_text'] for chunk in chunk_text(intermediate_text)])

        else:
            translated_text = "\n".join([translator(chunk)[0]['translation_text'] for chunk in chunk_text(text)])

        return {"translated_text": translated_text}

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Internal error: {str(e)}")