Spaces:

kleervoyans
/

evaluator

Sleeping

App Files Community

kleervoyans committed on May 5

Commit

7fc686c

verified ·

1 Parent(s): 5ccd1db

Delete models/translation_loader.py

Browse files

Files changed (1) hide show

models/translation_loader.py +0 -114

models/translation_loader.py DELETED Viewed

@@ -1,114 +0,0 @@
-# models/translation_loader.py
-import logging
-from typing import Union, List
-from langdetect import detect, LangDetectException
-from transformers import pipeline, AutoTokenizer, BitsAndBytesConfig
-class TranslationLoader:
-    def __init__(
-        self,
-        model_name: str = "facebook/nllb-200-distilled-600M",
-        quantize: bool = True,
-        tgt_lang: str = None,  # if None, we’ll pick the Turkish code automatically
-    ):
-        self.model_name = model_name
-        self.quantize = quantize
-        self.default_tgt = tgt_lang  # may be None
-        # ─── Load the translation pipeline ───────────────────────────────
-        try:
-            bnb_cfg = BitsAndBytesConfig(load_in_8bit=self.quantize)
-            self.pipeline = pipeline(
-                "translation",
-                model=self.model_name,
-                tokenizer=self.model_name,
-                device_map="auto",
-                quantization_config=bnb_cfg,
-            )
-            logging.info(f"Loaded `{self.model_name}` with 8-bit={self.quantize}")
-        except Exception as e:
-            logging.warning(f"8-bit load failed ({e}); falling back to full-precision")
-            self.pipeline = pipeline(
-                "translation",
-                model=self.model_name,
-                tokenizer=self.model_name,
-                device_map="auto",
-            )
-            logging.info(f"Loaded `{self.model_name}` in full precision")
-        # ─── Load tokenizer & grab the lang_code_to_id mapping ────────────
-        try:
-            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, use_fast=True)
-            logging.info(f"Tokenizer loaded for {self.model_name}")
-        except Exception as e:
-            logging.error(f"Cannot load tokenizer for {self.model_name}: {e}")
-            raise ValueError(f"Failed to load tokenizer: {e}")
-        if hasattr(self.tokenizer, "lang_code_to_id"):
-            self.lang_code_to_id = self.tokenizer.lang_code_to_id
-            logging.info("Using tokenizer.lang_code_to_id mapping")
-        else:
-            allowed = ", ".join(list(self.tokenizer.config.to_dict().keys())[:5])
-            raise AttributeError(
-                f"Model `{self.model_name}`’s tokenizer has no `lang_code_to_id`. "
-                "Use a model like NLLB-200 or M2M100 that supports language codes. "
-                f"(available config keys: {allowed}…)"
-            )
-        # ─── Auto-pick the Turkish target code if none was provided ───────
-        if self.default_tgt is None:
-            tur = [c for c in self.lang_code_to_id if c.lower().startswith("tr")]
-            if not tur:
-                raise ValueError(f"No Turkish code found in mapping for {self.model_name}")
-            self.default_tgt = tur[0]
-        logging.info(f"Default target set to `{self.default_tgt}`")
-    def translate(
-        self,
-        text: Union[str, List[str]],
-        src_lang: str = None,
-        tgt_lang: str = None,
-    ):
-        """
-        - Auto-detects src_lang via langdetect if not given
-        - Uses default_tgt if tgt_lang is not passed
-        - Returns pipeline output (list of dicts with 'translation_text')
-        """
-        tgt = tgt_lang or self.default_tgt
-        # ─── Source-language auto-detection ─────────────────────────────
-        if src_lang:
-            src = src_lang
-        else:
-            sample = text[0] if isinstance(text, list) else text
-            try:
-                iso = detect(sample).lower()
-                # find codes starting with that ISO (e.g. "en"→["en","eng_Latn",…])
-                cand = [c for c in self.lang_code_to_id if c.lower().startswith(iso)]
-                if not cand:
-                    raise LangDetectException(f"No mapping for ISO '{iso}'")
-                # prefer exact match, else first
-                exact = [c for c in cand if c.lower() == iso]
-                src = exact[0] if exact else cand[0]
-                logging.info(f"Detected src_lang={src} from ISO='{iso}'")
-            except Exception as e:
-                logging.warning(f"Language auto-detect failed ({e}); defaulting to English")
-                eng = [c for c in self.lang_code_to_id if c.lower().startswith("en")]
-                src = eng[0] if eng else list(self.lang_code_to_id)[0]
-                logging.info(f"Fallback src_lang={src}")
-        # ─── Perform translation call ────────────────────────────────────
-        return self.pipeline(text, src_lang=src, tgt_lang=tgt)
-    def get_info(self):
-        """Return model metadata for display in your sidebar."""
-        mdl = getattr(self.pipeline, "model", None)
-        q = getattr(mdl, "is_loaded_in_8bit", False)
-        device = getattr(mdl, "device", "auto")
-        return {
-            "model_name": self.model_name,
-            "quantized": q,
-            "device": str(device),
-            "default_target": self.default_tgt,
-        }