Spaces:

Projetweb
/

Ai-prject

Runtime error

App Files Files Community

Ai-prject / translation /static /main.py

rayhane123

Upload 10 files

6c95da8 verified 20 days ago

raw

history blame

5.07 kB

	from fastapi import FastAPI, File, UploadFile, HTTPException, Form
	from fastapi.responses import HTMLResponse
	from fastapi.staticfiles import StaticFiles
	from transformers import pipeline
	import textwrap
	import fitz # PyMuPDF for PDF handling
	from docx import Document
	import openpyxl # For Excel
	from pptx import Presentation
	from functools import lru_cache
	import os

	# Initialize FastAPI app
	app = FastAPI()

	# Directory holding index.html and the other static assets.
	# The default is the original absolute Windows path; set the STATIC_DIR
	# environment variable to run the service from any other location.
	# An empty variable is treated like an unset one.
	STATIC_DIR = (
	    os.environ.get("STATIC_DIR")
	    or r"C:\Users\User\doc_translation_service\translation\static"
	)

	# Ensure the static directory exists before it is mounted. exist_ok avoids
	# the check-then-create race of a separate os.path.exists() test.
	os.makedirs(STATIC_DIR, exist_ok=True)

	# Mount static files: everything under STATIC_DIR is served below /static
	app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")

	@app.get("/", response_class=HTMLResponse)
	async def read_root():
	    """Serve the index.html page stored in STATIC_DIR.

	    Returns:
	        An HTMLResponse whose body is the UTF-8 decoded content of index.html.

	    Raises:
	        HTTPException: 404 when index.html does not exist in STATIC_DIR.
	    """
	    page_path = os.path.join(STATIC_DIR, "index.html")

	    try:
	        with open(page_path, "r", encoding="utf-8") as page:
	            markup = page.read()
	    except FileNotFoundError:
	        raise HTTPException(status_code=404, detail="index.html not found in static folder.")

	    return HTMLResponse(content=markup)

	# French language names mapped to their ISO 639-1 codes
	LANGUAGE_CODES = dict(
	    (
	        ("Anglais", "en"),
	        ("Francais", "fr"),
	        ("Arabe", "ar"),
	        ("Espagnol", "es"),
	    )
	)

	# Translation model name for every directly supported "<source>-<target>"
	# language pair; each name is "Helsinki-NLP/opus-mt-" followed by the pair.
	AVAILABLE_MODELS = {
	    pair: f"Helsinki-NLP/opus-mt-{pair}"
	    for pair in ("fr-en", "en-fr", "ar-en", "en-ar", "es-en", "en-es")
	}

	# Cache model loading
	@lru_cache(maxsize=10)
	def load_translator(src_code: str, tgt_code: str):
	    """Load the translation pipeline(s) for a language pair.

	    Args:
	        src_code: Language code of the source text (e.g. "fr").
	        tgt_code: Language code of the desired output (e.g. "en").

	    Returns:
	        A single pipeline when AVAILABLE_MODELS has a direct model for the
	        pair. Otherwise a ``(source -> en, en -> target)`` tuple of pipelines
	        for translating through English.

	    Raises:
	        ValueError: When neither a direct model nor both pivot models are
	            listed in AVAILABLE_MODELS.
	    """
	    direct_model = AVAILABLE_MODELS.get(f"{src_code}-{tgt_code}")
	    if direct_model is not None:
	        return pipeline("translation", model=direct_model)

	    # Pivot through English only when BOTH legs are known models. Passing a
	    # missing entry (None) on to pipeline() would not raise the ValueError
	    # callers expect for an unsupported pair.
	    pivot_possible = (
	        src_code != "en"
	        and tgt_code != "en"
	        and f"{src_code}-en" in AVAILABLE_MODELS
	        and f"en-{tgt_code}" in AVAILABLE_MODELS
	    )
	    if pivot_possible:
	        # Go through the cached function (positional arguments, like the
	        # request handler) so each leg shares one loaded model with direct
	        # requests instead of being loaded a second time.
	        return (
	            load_translator(src_code, "en"),
	            load_translator("en", tgt_code),
	        )

	    raise ValueError(f"No model available for {src_code} -> {tgt_code}")

	# Split text into chunks
	def chunk_text(text, max_length=400):
	    """Wrap *text* into a list of lines no wider than *max_length* characters.

	    Wrapping uses the textwrap defaults, so an empty or whitespace-only
	    *text* yields an empty list.
	    """
	    wrapper = textwrap.TextWrapper(width=max_length)
	    return wrapper.wrap(text)

	# Extract text based on file type
	def extract_text(file: UploadFile):
	    """Return the plain text contained in an uploaded document.

	    The format is chosen from the filename extension, compared
	    case-insensitively: .txt, .pdf, .docx, .xlsx and .pptx are handled.

	    Args:
	        file: The uploaded file; its ``filename`` selects the parser and its
	            ``file`` stream supplies the content.

	    Returns:
	        The extracted text. Spreadsheet rows are tab-separated with one row
	        per line; presentation shapes contribute one line each.

	    Raises:
	        HTTPException: 400 when the extension is not supported (including a
	            missing filename), 500 when reading or parsing the file fails.
	    """
	    filename = (file.filename or "").lower()

	    # Reject unknown types before entering the try block, so this 400 is not
	    # caught by the generic handler below and reported as a 500.
	    if not filename.endswith((".txt", ".pdf", ".docx", ".xlsx", ".pptx")):
	        raise HTTPException(status_code=400, detail="File type not supported.")

	    try:
	        if filename.endswith(".txt"):
	            # utf-8-sig decodes plain UTF-8 too and drops a leading BOM.
	            return file.file.read().decode("utf-8-sig")

	        if filename.endswith(".pdf"):
	            # The context manager closes the PDF once its pages are read.
	            with fitz.open(stream=file.file.read(), filetype="pdf") as doc:
	                return "\n".join(page.get_text() for page in doc)

	        if filename.endswith(".docx"):
	            doc = Document(file.file)
	            return "\n".join(para.text for para in doc.paragraphs)

	        if filename.endswith(".xlsx"):
	            wb = openpyxl.load_workbook(file.file)
	            lines = []
	            # worksheets leaves out chartsheets, which have no rows to read.
	            for ws in wb.worksheets:
	                for row in ws.iter_rows():
	                    # Only empty cells become ""; 0 and False keep their text.
	                    cells = ["" if cell.value is None else str(cell.value) for cell in row]
	                    lines.append("\t".join(cells) + "\n")
	            return "".join(lines)

	        # Only .pptx is left after the extension check above.
	        prs = Presentation(file.file)
	        lines = []
	        for slide in prs.slides:
	            for shape in slide.shapes:
	                if hasattr(shape, "text"):
	                    lines.append(shape.text + "\n")
	        return "".join(lines)

	    except Exception as e:
	        raise HTTPException(status_code=500, detail=f"Error extracting text: {str(e)}") from e

	def _translate_chunks(translator, text):
	    """Translate *text* chunk by chunk with one pipeline, joining results with newlines."""
	    return "\n".join(translator(chunk)[0]['translation_text'] for chunk in chunk_text(text))


	def _translate_document(text, src_code, tgt_code):
	    """Load the translator for the pair and run *text* through every translation step."""
	    translator = load_translator(src_code, tgt_code)
	    # A tuple means indirect translation via English: apply both steps in order.
	    steps = translator if isinstance(translator, tuple) else (translator,)
	    for step in steps:
	        text = _translate_chunks(step, text)
	    return text


	@app.post("/upload/")
	async def upload_file(
	    file: UploadFile = File(...),
	    src_lang: str = Form(...),
	    tgt_lang: str = Form(...)
	):
	    """Translate an uploaded document and return the translated text.

	    Args:
	        file: The document to translate.
	        src_lang: Source language name; must be a key of LANGUAGE_CODES.
	        tgt_lang: Target language name; must be a key of LANGUAGE_CODES.

	    Returns:
	        ``{"translated_text": <str>}``. When source and target language are
	        the same, the extracted text is returned unchanged.

	    Raises:
	        HTTPException: 400 for an unsupported language or a file without
	            text, 500 when loading the model or translating fails. Errors
	            raised by extract_text propagate unchanged.
	    """
	    # Imported here so the block brings its own dependency into scope.
	    from fastapi.concurrency import run_in_threadpool

	    # Validate the cheap form fields first so a bad request never pays for
	    # document parsing.
	    src_code = LANGUAGE_CODES.get(src_lang)
	    tgt_code = LANGUAGE_CODES.get(tgt_lang)

	    if not src_code or not tgt_code:
	        raise HTTPException(status_code=400, detail=f"Unsupported language: {src_lang} -> {tgt_lang}")

	    # Parsing and model inference are blocking calls; running them in the
	    # thread pool keeps the event loop free for other requests.
	    text = await run_in_threadpool(extract_text, file)

	    if not text.strip():
	        raise HTTPException(status_code=400, detail="No text extracted from the file.")

	    # Nothing to translate: skip the models instead of a round trip through
	    # English (or a failure when both languages are English).
	    if src_code == tgt_code:
	        return {"translated_text": text}

	    try:
	        translated_text = await run_in_threadpool(_translate_document, text, src_code, tgt_code)
	    except Exception as e:
	        raise HTTPException(status_code=500, detail=f"Internal error: {str(e)}") from e

	    return {"translated_text": translated_text}