File size: 5,070 Bytes
6c95da8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
from fastapi import FastAPI, File, UploadFile, HTTPException, Form
from fastapi.responses import HTMLResponse
from fastapi.staticfiles import StaticFiles
from transformers import pipeline
import textwrap
import fitz  # PyMuPDF for PDF handling
from docx import Document
import openpyxl  # For Excel
from pptx import Presentation
from functools import lru_cache
import os

# Initialize FastAPI app
app = FastAPI()

# Directory that holds index.html and the other static assets.
# NOTE(review): this is an absolute, machine-specific Windows path — presumably
# it should come from configuration before the service runs elsewhere; confirm.
STATIC_DIR = r"C:\Users\User\doc_translation_service\translation\static"

# Ensure the static directory exists. exist_ok=True makes this a single call
# with no window between "check" and "create" in which another process could
# create the directory and make makedirs() fail.
os.makedirs(STATIC_DIR, exist_ok=True)

# Mount static files under /static (the directory also holds index.html)
app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")

@app.get("/", response_class=HTMLResponse)
async def read_root():
    """Serve ``index.html`` from ``STATIC_DIR`` as the landing page.

    Returns:
        An ``HTMLResponse`` whose body is the UTF-8 decoded content of the file.

    Raises:
        HTTPException: 404 when ``index.html`` does not exist in ``STATIC_DIR``.
    """
    page_path = os.path.join(STATIC_DIR, "index.html")
    try:
        page = open(page_path, "r", encoding="utf-8")
    except FileNotFoundError:
        raise HTTPException(status_code=404, detail="index.html not found in static folder.")
    else:
        # The context manager closes the file once its content has been read.
        with page:
            html = page.read()
            return HTMLResponse(content=html)

# Language name (the value looked up for the src_lang / tgt_lang form fields)
# mapped to its ISO 639-1 code.
LANGUAGE_CODES = {"Anglais": "en", "Francais": "fr", "Arabe": "ar", "Espagnol": "es"}

# Available translation models, keyed "<source>-<target>". Every model id
# follows the same "Helsinki-NLP/opus-mt-<source>-<target>" naming scheme, so
# the table is derived from the list of supported directions.
AVAILABLE_MODELS = {
    f"{src}-{tgt}": f"Helsinki-NLP/opus-mt-{src}-{tgt}"
    for src, tgt in (
        ("fr", "en"),
        ("en", "fr"),
        ("ar", "en"),
        ("en", "ar"),
        ("es", "en"),
        ("en", "es"),
    )
}

# Cache loaded pipelines so repeated requests do not rebuild them
@lru_cache(maxsize=10)
def load_translator(src_code: str, tgt_code: str):
    """Return the translation pipeline(s) for a language pair.

    Args:
        src_code: Source language code, as used in the ``AVAILABLE_MODELS`` keys.
        tgt_code: Target language code, as used in the ``AVAILABLE_MODELS`` keys.

    Returns:
        A single pipeline when ``AVAILABLE_MODELS`` has a direct model for the
        pair. Otherwise, when neither side is English, a tuple
        ``(source -> English pipeline, English -> target pipeline)`` for
        translating through English.

    Raises:
        ValueError: If there is no direct model and the pair cannot be routed
            through English (one side already is English, or one of the two
            legs has no model).
    """
    model_name = AVAILABLE_MODELS.get(f"{src_code}-{tgt_code}")
    if model_name is not None:
        return pipeline("translation", model=model_name)

    if src_code == "en" or tgt_code == "en":
        raise ValueError(f"No model available for {src_code} -> {tgt_code}")

    # Pivot through English. Going back through load_translator reuses the
    # cached direct pipelines instead of loading a second copy of each model,
    # and a missing leg raises ValueError rather than handing model=None to
    # pipeline().
    return (
        load_translator(src_code, "en"),
        load_translator("en", tgt_code),
    )

# Split text into chunks
def chunk_text(text, max_length=400):
    """Wrap *text* into a list of lines no longer than *max_length* characters.

    Wrapping is done by ``textwrap.TextWrapper`` with its default options, so
    runs of whitespace (including newlines) in *text* are treated as single
    spaces and an empty *text* yields an empty list.
    """
    wrapper = textwrap.TextWrapper(width=max_length)
    pieces = wrapper.wrap(text)
    return pieces

# Extract text based on file type
def extract_text(file: UploadFile):
    """Extract the plain text of an uploaded document.

    The parser is chosen from the file name's extension, compared
    case-insensitively: ``.txt``, ``.pdf``, ``.docx``, ``.xlsx`` and ``.pptx``.

    Args:
        file: The uploaded file. ``file.filename`` selects the parser and
            ``file.file`` supplies the content.

    Returns:
        The extracted text:
        - txt: the decoded content;
        - pdf: page texts joined with newlines;
        - docx: paragraph texts joined with newlines;
        - xlsx: one tab-separated line per row, over every sheet;
        - pptx: one line per shape that exposes a ``text`` attribute.

    Raises:
        HTTPException: 400 if the extension is not supported, 500 if reading
            or parsing the content fails.
    """
    # Lower-case once so "REPORT.PDF" is accepted; a missing name matches nothing.
    name = (file.filename or "").lower()

    try:
        if name.endswith(".txt"):
            # "utf-8-sig" decodes plain UTF-8 too and drops a leading BOM if present.
            return file.file.read().decode("utf-8-sig")

        if name.endswith(".pdf"):
            # The context manager closes the PDF document once the text is read.
            with fitz.open(stream=file.file.read(), filetype="pdf") as pdf:
                return "\n".join(page.get_text() for page in pdf)

        if name.endswith(".docx"):
            return "\n".join(para.text for para in Document(file.file).paragraphs)

        if name.endswith(".xlsx"):
            workbook = openpyxl.load_workbook(file.file)
            lines = []
            for sheet_name in workbook.sheetnames:
                for row in workbook[sheet_name].iter_rows():
                    # Only None means "empty cell"; falsy values such as 0 or
                    # False are real content and must be kept.
                    cells = ("" if cell.value is None else str(cell.value) for cell in row)
                    lines.append("\t".join(cells))
            # Every row is terminated by a newline, including the last one.
            return "".join(f"{line}\n" for line in lines)

        if name.endswith(".pptx"):
            deck = Presentation(file.file)
            lines = []
            for slide in deck.slides:
                for shape in slide.shapes:
                    if hasattr(shape, "text"):
                        lines.append(shape.text)
            return "".join(f"{line}\n" for line in lines)

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error extracting text: {str(e)}") from e

    # No branch matched. Raised outside the try block so this client error is
    # reported as a 400 instead of being re-wrapped as a 500.
    raise HTTPException(status_code=400, detail="File type not supported.")

def _translate_chunks(translator, text):
    """Translate *text* chunk by chunk and join the translations with newlines."""
    return "\n".join(translator(chunk)[0]['translation_text'] for chunk in chunk_text(text))


@app.post("/upload/")
async def upload_file(
    file: UploadFile = File(...),
    src_lang: str = Form(...),
    tgt_lang: str = Form(...),
):
    """Translate the text of an uploaded document.

    Args:
        file: The uploaded document; its text is obtained with ``extract_text``.
        src_lang: Source language name, a key of ``LANGUAGE_CODES``.
        tgt_lang: Target language name, a key of ``LANGUAGE_CODES``.

    Returns:
        ``{"translated_text": <str>}``. When source and target language are the
        same, the extracted text is returned unchanged.

    Raises:
        HTTPException: 400 if either language name is unknown or the file
            yields no text; 500 if loading the model or translating fails.
            Errors raised by ``extract_text`` propagate as they are.
    """
    # Validate the cheap form fields before parsing the document.
    src_code = LANGUAGE_CODES.get(src_lang)
    tgt_code = LANGUAGE_CODES.get(tgt_lang)
    if not src_code or not tgt_code:
        raise HTTPException(status_code=400, detail=f"Unsupported language: {src_lang} -> {tgt_lang}")

    text = extract_text(file)
    if not text.strip():
        raise HTTPException(status_code=400, detail="No text extracted from the file.")

    # Nothing to translate: without this, en -> en failed with a 500 and any
    # other same-language request made a lossy round trip through English.
    if src_code == tgt_code:
        return {"translated_text": text}

    try:
        # Load translation model
        translator = load_translator(src_code, tgt_code)

        # If indirect translation via English
        if isinstance(translator, tuple):
            to_english, from_english = translator
            translated_text = _translate_chunks(from_english, _translate_chunks(to_english, text))
        else:
            translated_text = _translate_chunks(translator, text)

        return {"translated_text": translated_text}

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Internal error: {str(e)}") from e