kleervoyans committed on
Commit 3755b73 · verified · 1 Parent(s): dc2f97b

Update app.py

Files changed (1):
  1. app.py +279 -146
app.py CHANGED

app.py BEFORE (old side of the diff; deletions marked "-"):

@@ -4,12 +4,14 @@ import streamlit as st
  import streamlit.components.v1 as components
  import logging
  import torch

  import pandas as pd
  import plotly.express as px
  import time
  import difflib
- from typing import Union, List

  from langdetect import detect, LangDetectException
  from transformers import (
      AutoTokenizer,
@@ -18,73 +20,73 @@ from transformers import (
      BitsAndBytesConfig,
  )
  import evaluate

  # ────────── Global CSS ──────────
- st.markdown(
-     """
-     <style>
-     /* Page */
-     .main .block-container { max-width: 900px; padding: 1rem 2rem; }
-     /* Buttons */
-     .stButton>button { background-color: #4A90E2; color: white; border-radius: 4px; }
-     .stButton>button:hover { background-color: #357ABD; }
-     /* Text areas */
-     textarea { border-radius: 4px; }
-     /* Tables */
-     .stTable table { border-radius: 4px; overflow: hidden; }
-     </style>
-     """,
-     unsafe_allow_html=True
- )

  # ────────── Logging ──────────
  logging.basicConfig(
      format="%(asctime)s %(levelname)s %(name)s: %(message)s",
      datefmt="%Y-%m-%d %H:%M:%S",
-     level=logging.INFO
  )
  logger = logging.getLogger(__name__)

  # ────────── Model Manager ──────────
  class ModelManager:
      """
-     Selects & loads NLLB-200 or M2M100 (8-bit if GPU available).
-     Exposes `translate()` with auto-lang detection + dynamic tgt_lang.
      """
-     def __init__(
-         self,
-         candidates: List[str] = None,
-         quantize: bool = True,
-         default_tgt: str = None,
-     ):
          if quantize and not torch.cuda.is_available():
              logger.warning("CUDA unavailable; disabling 8-bit quantization")
              quantize = False
-         self.quantize = quantize
-         self.candidates = candidates or [
              "facebook/nllb-200-distilled-600M",
-             "facebook/m2m100_418M"
          ]
          self.default_tgt = default_tgt
-         self.model_name = None
-         self.tokenizer = None
-         self.model = None
-         self.pipeline = None
-         self.lang_codes = []
          self._load_best()

      def _load_best(self):
          last_err = None
          for name in self.candidates:
              try:
-                 # 1) Tokenizer
-                 logger.info(f"Loading tokenizer for {name}")
                  tok = AutoTokenizer.from_pretrained(name, use_fast=True)
                  if not hasattr(tok, "lang_code_to_id"):
                      raise AttributeError("no lang_code_to_id")
-                 # 2) Model (8-bit if configured)
-                 logger.info(f"Loading model {name} (8-bit={self.quantize})")
                  if self.quantize:
                      bnb = BitsAndBytesConfig(load_in_8bit=True)
                      mdl = AutoModelForSeq2SeqLM.from_pretrained(
@@ -94,21 +96,19 @@ class ModelManager:
                  mdl = AutoModelForSeq2SeqLM.from_pretrained(
                      name, device_map="auto"
                  )
-                 # 3) Pipeline
                  pipe = pipeline("translation", model=mdl, tokenizer=tok)
-                 # Store
                  self.model_name = name
                  self.tokenizer = tok
                  self.model = mdl
                  self.pipeline = pipe
                  self.lang_codes = list(tok.lang_code_to_id.keys())
-                 # Auto-pick Turkish if needed
                  if not self.default_tgt:
                      tur = [c for c in self.lang_codes if c.lower().startswith("tr")]
                      if not tur:
                          raise ValueError("No Turkish code found")
                      self.default_tgt = tur[0]
-                 logger.info(f"Default target = {self.default_tgt}")
                  return
              except Exception as e:
                  logger.warning(f"Failed to load {name}: {e}")
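Note: `lang_code_to_id` keys differ across model families, so the "tr"-prefix lookup above resolves to a different default target per model. A small self-contained sketch (the language codes here are assumptions from memory about the NLLB-200 and M2M100 tokenizers, not taken from this commit):

```python
# Hedged sketch: how the "tr"-prefix lookup is expected to resolve per model family.
# The code names below are assumptions from memory, not taken from this diff.
nllb_codes = ["eng_Latn", "tur_Latn"]   # NLLB-200-style codes
m2m_codes = ["en", "tr"]                # M2M100-style codes

def pick_turkish(lang_codes):
    # Mirrors ModelManager._load_best: first code starting with "tr".
    tur = [c for c in lang_codes if c.lower().startswith("tr")]
    if not tur:
        raise ValueError("No Turkish code found")
    return tur[0]

print(pick_turkish(nllb_codes))  # -> "tur_Latn"
print(pick_turkish(m2m_codes))   # -> "tr"
```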
@@ -116,30 +116,26 @@
          raise RuntimeError(f"No model loaded: {last_err}")

      def translate(
-         self,
-         text: Union[str, List[str]],
-         src_lang: str = None,
-         tgt_lang: str = None,
      ):
          tgt = tgt_lang or self.default_tgt
-         # auto-detect source if missing
          if not src_lang:
              sample = text[0] if isinstance(text, list) else text
              try:
                  iso = detect(sample).lower()
-                 cands = [c for c in self.lang_codes if c.lower().startswith(iso)]
-                 if not cands: raise LangDetectException()
-                 exact = [c for c in cands if c.lower() == iso]
-                 src = exact[0] if exact else cands[0]
                  logger.info(f"Detected src_lang={src}")
              except Exception:
-                 # fallback to English
                  eng = [c for c in self.lang_codes if c.lower().startswith("en")]
                  src = eng[0] if eng else self.lang_codes[0]
                  logger.warning(f"Falling back src_lang={src}")
          else:
              src = src_lang
- 
          return self.pipeline(text, src_lang=src, tgt_lang=tgt)
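Note: a minimal usage sketch for `translate()` as defined above; output shape follows the transformers translation pipeline:

```python
# Hedged usage sketch; not part of the commit. Downloads a model on first run.
mgr = ModelManager(quantize=False)    # quantize=True silently degrades without CUDA
out = mgr.translate("Hello, world!")  # src auto-detected; tgt defaults to the Turkish code
print(out[0]["translation_text"])     # the translation pipeline returns a list of dicts
```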

      def get_info(self):
@@ -152,79 +148,185 @@ class ModelManager:
          "quantized": self.quantize,
          "device": dev,
          "default_tgt": self.default_tgt,
          }

- 
  # ────────── Evaluator ──────────
  class TranslationEvaluator:
      def __init__(self):
-         self.bleu = evaluate.load("bleu")
          self.bertscore = evaluate.load("bertscore")
-         self.comet = evaluate.load("comet", model_id="unbabel/wmt22-comet-da")
-         logger.info("Loaded BLEU, BERTScore, COMET")

-     def evaluate(
          self,
-         srcs: List[str],
-         refs: List[str],
-         hyps: List[str],
-     ):
          out = {}
-         # BLEU
-         b = self.bleu.compute(predictions=hyps, references=[[r] for r in refs])
-         out["BLEU"] = float(b.get("bleu", 0.0))
-         # BERTScore xx
-         bs = self.bertscore.compute(predictions=hyps, references=refs, lang="xx")
-         f1 = bs.get("f1", [])
-         out["BERTScore"] = float(sum(f1)/len(f1)) if f1 else 0.0
-         # BERTurk tr
-         bt = self.bertscore.compute(predictions=hyps, references=refs, lang="tr")
-         f2 = bt.get("f1", [])
-         out["BERTurk"] = float(sum(f2)/len(f2)) if f2 else 0.0
-         # COMET
-         cm = self.comet.compute(srcs=srcs, hyps=hyps, refs=refs)
-         sc = cm.get("scores")
-         out["COMET"] = float(sc[0] if isinstance(sc, list) else sc or 0.0)
          return out

  # ────────── Streamlit App ──────────
  @st.cache_resource
  def load_resources():
      mgr = ModelManager(quantize=True)
      ev = TranslationEvaluator()
-     return mgr, ev
- 

  def display_model_info(info: dict):
      st.sidebar.markdown("### Model Info")
-     st.sidebar.write(f"• Model: **{info['model']}**")
-     st.sidebar.write(f"• Quantized: **{info['quantized']}**")
-     st.sidebar.write(f"• Device: **{info['device']}**")
- 

- def process_and_stream(src, ref, tgt, mgr, ev, metrics):
-     # 1) call pipeline
-     out = mgr.translate(src, tgt_lang=tgt)
-     hyp = out[0]["translation_text"]
- 
-     # 2) pseudo-stream: reveal word by word
-     placeholder = st.empty()
-     text_acc = ""
-     for w in hyp.split():
-         text_acc += w + " "
-         placeholder.markdown(f"**Hypothesis ({tgt}):** {text_acc}")
-         time.sleep(0.05)
- 
-     # 3) metrics (only if ref given)
-     scores = {}
-     if ref and ref.strip():
-         scores = ev.evaluate([src], [ref], [hyp])
-     return hyp, scores
- 
- 
- def show_diff(ref, hyp):
-     # side-by-side HTML diff
      differ = difflib.HtmlDiff(tabsize=4, wrapcolumn=60)
      html = differ.make_table(
          ref.split(), hyp.split(),
@@ -233,84 +335,115 @@ def show_diff(ref, hyp):
      )
      components.html(html, height=200, scrolling=True)

- 
  def main():
-     st.set_page_config(page_title="🔀 Multi-Lang ↑TR + Eval", layout="wide")
-     st.title("🌐 Translate → 🔠 Turkish & Evaluate")
-     st.write("Choose target, translate from any language, and (optionally) eval against a reference.")

-     # Sidebar: load models & then dynamic tgt dropdown
      with st.sidebar:
          st.header("Settings")
-         mgr, ev = load_resources()
          info = mgr.get_info()
          display_model_info(info)

          tgt = st.selectbox(
-             "Target language code",
-             options=mgr.lang_codes,
-             index=mgr.lang_codes.index(info["default_tgt"])
-         )
-         metrics = st.multiselect(
-             "Metrics",
-             ["BLEU","BERTScore","BERTurk","COMET"],
-             default=["BLEU","BERTScore","COMET"]
          )
          batch_size = st.slider("Batch size", 1, 32, 8)

-     tab1, tab2 = st.tabs(["Single sentence","Batch CSV"])

      with tab1:
-         src = st.text_area("Source sentence:", height=120)
-         ref = st.text_area("Turkish reference (optional):", height=80)
          if st.button("Translate & Eval"):
-             with st.spinner("Working…"):
-                 hyp, scores = process_and_stream(src, ref, tgt, mgr, ev, metrics)
-             # show scores
-             df = {m: (scores.get(m) if ref.strip() else None) for m in metrics}
              st.markdown("### Scores")
-             st.table(pd.DataFrame([df]).replace({None:"N/A"}))
              # diff
              if ref.strip():
-                 st.markdown("### Diff view")
                  show_diff(ref, hyp)

      with tab2:
          uploaded = st.file_uploader("Upload CSV with `src`,`ref_tr`", type=["csv"])
          if uploaded:
              df = pd.read_csv(uploaded)
-             if not {"src","ref_tr"}.issubset(df):
-                 st.error("CSV needs `src` and `ref_tr` columns.")
              else:
-                 with st.spinner("Batch translating…"):
-                     out_rows = []
                      prog = st.progress(0)
-                     for i in range(0, len(df), batch_size):
                          batch = df.iloc[i : i+batch_size]
                          srcs, refs = batch["src"].tolist(), batch["ref_tr"].tolist()
                          outs = mgr.translate(srcs, tgt_lang=tgt)
                          hyps = [o["translation_text"] for o in outs]
                          for s, r, h in zip(srcs, refs, hyps):
-                             row = {"src":s, "ref_tr":r, "hyp_tr":h}
                              if r.strip():
-                                 sc = ev.evaluate([s],[r],[h])
-                                 for m in metrics: row[m] = sc[m]
                              else:
-                                 for m in metrics: row[m] = None
-                             out_rows.append(row)
-                         prog.progress(min(i+batch_size,len(df))/len(df))
-                 res_df = pd.DataFrame(out_rows)
-                 st.markdown("### Batch Results")
                  st.dataframe(res_df, use_container_width=True)
-                 # viz
                  for m in metrics:
-                     st.markdown(f"#### {m} Histogram")
-                     col = res_df[m].dropna()
                      if col.empty:
-                         st.write("No valid refs → metric N/A.")
                      else:
-                         fig = px.histogram(res_df, x=m)
                          st.plotly_chart(fig, use_container_width=True)

                  st.download_button("Download CSV", res_df.to_csv(index=False), "results.csv")

  if __name__ == "__main__":
 
app.py AFTER (new side of the same diff; additions marked "+"):

  import streamlit.components.v1 as components
  import logging
  import torch
+ import random
+ import numpy as np
  import pandas as pd
  import plotly.express as px
  import time
  import difflib

+ from typing import List, Union
  from langdetect import detect, LangDetectException
  from transformers import (
      AutoTokenizer,

      BitsAndBytesConfig,
  )
  import evaluate
+ from sacrebleu import corpus_bleu, sentence_bleu  # Doc vs. segment BLEU
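Note: the new sacrebleu import backs two different BLEU views — corpus-level BLEU pools n-gram counts before scoring, while segment-level BLEU scores each sentence and averages, and the two can diverge. A minimal sketch of both calls (sacrebleu call shapes as I know them: hypotheses as a flat list, references as a list of reference streams):

```python
from sacrebleu import corpus_bleu, sentence_bleu

hyps = ["the cat sat on the mat", "hello world"]
refs = ["the cat sat on the mat", "hello there world"]

# Corpus BLEU: n-gram statistics pooled over the whole set, then one score.
doc = corpus_bleu(hyps, [refs]).score

# Segment BLEU: one score per sentence, averaged afterwards.
seg = sum(sentence_bleu(h, [r]).score for h, r in zip(hyps, refs)) / len(hyps)

print(f"doc-level {doc:.1f} vs. mean segment-level {seg:.1f}")  # 0-100 scale
```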

  # ────────── Global CSS ──────────
+ st.markdown("""
+ <style>
+ .main .block-container { max-width: 900px; padding: 1rem 2rem; }
+ .stButton>button { background-color: #4A90E2; color: white; border-radius: 4px; }
+ .stButton>button:hover { background-color: #357ABD; }
+ textarea { border-radius: 4px; }
+ .stTable table { border-radius: 4px; overflow: hidden; }
+ </style>
+ """, unsafe_allow_html=True)

  # ────────── Logging ──────────
  logging.basicConfig(
      format="%(asctime)s %(levelname)s %(name)s: %(message)s",
      datefmt="%Y-%m-%d %H:%M:%S",
+     level=logging.INFO,
  )
  logger = logging.getLogger(__name__)

+ # ────────── Utilities ──────────
+ def bootstrap(
+     fn, predictions: List[str], references: List[str], sources: List[str] = None,
+     n_resamples: int = 200, seed: int = 42
+ ) -> List[float]:
+     """Bootstrap metric fn over (predictions, references, [sources])."""
+     random.seed(seed)
+     scores = []
+     N = len(predictions)
+     for _ in range(n_resamples):
+         idxs = [random.randrange(N) for _ in range(N)]
+         ps = [predictions[i] for i in idxs]
+         rs = [references[i] for i in idxs]
+         if sources:
+             ss = [sources[i] for i in idxs]
+             scores.append(fn(ps, rs, ss))
+         else:
+             scores.append(fn(ps, rs))
+     return scores
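Note: downstream, `compute_metrics` turns these bootstrap samples into a 95% confidence interval via percentiles. A tiny self-contained sketch of that step, using a toy metric in place of the heavyweight ones:

```python
import random
import numpy as np

# Toy stand-in metric (mean exact match) so the sketch runs without model downloads.
def exact_match(ps, rs):
    return sum(p == r for p, r in zip(ps, rs)) / len(ps)

preds = ["a", "b", "c", "d"] * 25
refs = ["a", "b", "x", "d"] * 25

random.seed(42)
N = len(preds)
samples = []
for _ in range(200):  # same resampling scheme as bootstrap() above
    idx = [random.randrange(N) for _ in range(N)]
    samples.append(exact_match([preds[i] for i in idx], [refs[i] for i in idx]))

lo, hi = np.percentile(samples, 2.5), np.percentile(samples, 97.5)
print(f"point estimate {exact_match(preds, refs):.2f}, 95% CI [{lo:.2f}, {hi:.2f}]")
```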

  # ────────── Model Manager ──────────
  class ModelManager:
      """
+     Loads the best translation model (NLLB-200 or M2M100),
+     8-bit if GPU available; auto-detects src_lang; dynamic tgt_lang.
      """
+     def __init__(self, candidates=None, quantize=True, default_tgt=None):
          if quantize and not torch.cuda.is_available():
              logger.warning("CUDA unavailable; disabling 8-bit quantization")
              quantize = False
+         self.quantize = quantize
+         self.candidates = candidates or [
              "facebook/nllb-200-distilled-600M",
+             "facebook/m2m100_418M",
          ]
          self.default_tgt = default_tgt
          self._load_best()

      def _load_best(self):
          last_err = None
          for name in self.candidates:
              try:
                  tok = AutoTokenizer.from_pretrained(name, use_fast=True)
                  if not hasattr(tok, "lang_code_to_id"):
                      raise AttributeError("no lang_code_to_id")
+                 logger.info(f"Loading {name} (8-bit={self.quantize})")
                  if self.quantize:
                      bnb = BitsAndBytesConfig(load_in_8bit=True)
                      mdl = AutoModelForSeq2SeqLM.from_pretrained(

                  mdl = AutoModelForSeq2SeqLM.from_pretrained(
                      name, device_map="auto"
                  )
                  pipe = pipeline("translation", model=mdl, tokenizer=tok)
                  self.model_name = name
                  self.tokenizer = tok
                  self.model = mdl
                  self.pipeline = pipe
                  self.lang_codes = list(tok.lang_code_to_id.keys())
+                 # pick default target if none
                  if not self.default_tgt:
                      tur = [c for c in self.lang_codes if c.lower().startswith("tr")]
                      if not tur:
                          raise ValueError("No Turkish code found")
                      self.default_tgt = tur[0]
+                 logger.info(f"default_tgt = {self.default_tgt}")
                  return
              except Exception as e:
                  logger.warning(f"Failed to load {name}: {e}")

          raise RuntimeError(f"No model loaded: {last_err}")

      def translate(
+         self, text: Union[str, List[str]],
+         src_lang: str = None, tgt_lang: str = None
      ):
          tgt = tgt_lang or self.default_tgt
+         # auto-detect src
          if not src_lang:
              sample = text[0] if isinstance(text, list) else text
              try:
                  iso = detect(sample).lower()
+                 cand = [c for c in self.lang_codes if c.lower().startswith(iso)]
+                 if not cand: raise LangDetectException()
+                 exact = [c for c in cand if c.lower() == iso]
+                 src = exact[0] if exact else cand[0]
                  logger.info(f"Detected src_lang={src}")
              except Exception:
                  eng = [c for c in self.lang_codes if c.lower().startswith("en")]
                  src = eng[0] if eng else self.lang_codes[0]
                  logger.warning(f"Falling back src_lang={src}")
          else:
              src = src_lang
          return self.pipeline(text, src_lang=src, tgt_lang=tgt)

      def get_info(self):

          "quantized": self.quantize,
          "device": dev,
          "default_tgt": self.default_tgt,
+         "langs": self.lang_codes,
          }
 
 
  # ────────── Evaluator ──────────
  class TranslationEvaluator:
+     """
+     Wraps BLEU (corpus), ChrF, TER, BERTScore, COMET (ref & ref-free), and provides CIs.
+     """
      def __init__(self):
+         # BLEU (corpus)
+         self.bleu = evaluate.load("bleu")
+         # ChrF
+         self.chrf = evaluate.load("chrf")
+         # TER
+         self.ter = evaluate.load("ter")
+         # BERTScore
          self.bertscore = evaluate.load("bertscore")
+         # COMET (ref-based)
+         self.comet_ref = evaluate.load("comet", model_id="unbabel/wmt22-comet-da")
+         # COMET QE (ref-free)
+         self.comet_qe = evaluate.load("comet", model_id="unbabel/wmt20-comet-qe-da")
+         logger.info("Loaded BLEU, ChrF, TER, BERTScore, COMET (ref & QE)")

+     def compute_metrics(
          self,
+         sources: List[str],
+         references: List[str],
+         predictions: List[str],
+         metrics: List[str],
+         ci: bool = True
+     ) -> dict:
          out = {}
+
+         # -- BLEU (document-level)
+         if "BLEU_doc" in metrics:
+             doc_bleu = self.bleu.compute(
+                 predictions=predictions,
+                 references=[[r] for r in references]
+             )["bleu"]
+             out["BLEU_doc"] = float(doc_bleu)
+
+         # -- BLEU (segment-level avg)
+         if "BLEU_seg" in metrics:
+             seg_scores = [
+                 sentence_bleu(p, [r]).score
+                 for p, r in zip(predictions, references)
+             ]
+             out["BLEU_seg"] = float(sum(seg_scores) / len(seg_scores))
+
+         # -- ChrF
+         if "ChrF" in metrics:
+             cf = self.chrf.compute(
+                 predictions=predictions,
+                 references=[[r] for r in references]
+             )["score"]
+             out["ChrF"] = float(cf)
+
+         # -- TER
+         if "TER" in metrics:
+             tr = self.ter.compute(
+                 predictions=predictions,
+                 references=[[r] for r in references],
+                 normalized=True
+             )["score"]
+             out["TER"] = float(tr)
+
+         # -- BERTScore
+         if "BERTScore" in metrics:
+             bs = self.bertscore.compute(
+                 predictions=predictions,
+                 references=references,
+                 lang="xx"
+             )["f1"]
+             out["BERTScore"] = float(sum(bs) / len(bs)) if bs else 0.0
+
+         # -- BERTurk
+         if "BERTurk" in metrics:
+             bt = self.bertscore.compute(
+                 predictions=predictions,
+                 references=references,
+                 lang="tr"
+             )["f1"]
+             out["BERTurk"] = float(sum(bt) / len(bt)) if bt else 0.0
+
+         # -- COMET (ref-based)
+         if "COMET" in metrics:
+             cr = self.comet_ref.compute(
+                 sources=sources, predictions=predictions, references=references
+             ).get("scores", 0.0)
+             out["COMET"] = float(cr[0] if isinstance(cr, list) else cr)
+
+         # -- QE (ref-free)
+         if "QE" in metrics:
+             cq = self.comet_qe.compute(
+                 sources=sources, predictions=predictions
+             ).get("scores", 0.0)
+             out["QE"] = float(cq[0] if isinstance(cq, list) else cq)
+
+         # -- Bootstrap CIs
+         if ci:
+             # BLEU_doc CI
+             if "CI_BLEU_doc" in metrics:
+                 bsamp = bootstrap(
+                     lambda ps, rs: self.bleu.compute(
+                         predictions=ps,
+                         references=[[r] for r in rs]
+                     )["bleu"],
+                     predictions, references
+                 )
+                 out["CI_BLEU_doc"] = (
+                     float(np.percentile(bsamp, 2.5)),
+                     float(np.percentile(bsamp, 97.5))
+                 )
+             # BERTScore CI
+             if "CI_BERTScore" in metrics:
+                 bsamp = bootstrap(
+                     lambda ps, rs: sum(
+                         self.bertscore.compute(
+                             predictions=ps, references=rs, lang="xx"
+                         )["f1"]
+                     ) / len(ps),
+                     predictions, references
+                 )
+                 out["CI_BERTScore"] = (
+                     float(np.percentile(bsamp, 2.5)),
+                     float(np.percentile(bsamp, 97.5))
+                 )
+             # COMET CI
+             if "CI_COMET" in metrics:
+                 bsamp = bootstrap(
+                     lambda ps, rs, ss: float(
+                         self.comet_ref.compute(
+                             sources=ss, predictions=ps, references=rs
+                         ).get("scores", [0.0])[0]
+                     ),
+                     predictions, references, sources
+                 )
+                 out["CI_COMET"] = (
+                     float(np.percentile(bsamp, 2.5)),
+                     float(np.percentile(bsamp, 97.5))
+                 )
+
          return out
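Note: a minimal call sketch for `compute_metrics` (the metric names are the keys checked above):

```python
# Hedged usage sketch; not part of the commit. Metric modules download on first use.
ev = TranslationEvaluator()
scores = ev.compute_metrics(
    sources=["Hello world"],
    references=["Merhaba dünya"],
    predictions=["Merhaba dünya"],
    metrics=["BLEU_doc", "BLEU_seg", "ChrF"],
)
# Beware the scale mismatch: evaluate's "bleu" is 0-1 while sacrebleu is 0-100,
# so a perfect match yields roughly {"BLEU_doc": 1.0, "BLEU_seg": 100.0, "ChrF": 100.0}.
print(scores)
```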

+ # ────────── Error Categorizer ──────────
+ class ErrorCategorizer:
+     """
+     Optional: classify error types via a fine-tuned text-classification model.
+     Supply your own HF model name for real categories.
+     """
+     def __init__(self, model_name: str = None):
+         if model_name:
+             self.pipe = pipeline("text-classification", model=model_name, device=0 if torch.cuda.is_available() else -1)
+         else:
+             self.pipe = None
+
+     def categorize(self, src: str, hyp: str):
+         if not self.pipe:
+             return []
+         inp = f"SRC: {src}\nHYP: {hyp}\nError types (pick from taxonomy):"
+         return self.pipe(inp, top_k=None)
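Note: `categorize` frames error classification as plain text classification over a prompt-style input. A hedged sketch of the call and the expected output shape:

```python
# Hedged sketch; the output shape is assumed from transformers' text-classification pipeline.
err = ErrorCategorizer(model_name=None)    # no classifier configured: categorize() returns []
print(err.categorize("Hello", "Merhaba"))  # -> []
# With a real classifier one would expect something like:
# [{"label": "mistranslation", "score": 0.71}, {"label": "omission", "score": 0.12}]
```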

  # ────────── Streamlit App ──────────
  @st.cache_resource
  def load_resources():
      mgr = ModelManager(quantize=True)
      ev = TranslationEvaluator()
+     # set your error-classifier HF model here, or None to disable
+     err = ErrorCategorizer(model_name="your-org/translation-error-categorizer")
+     return mgr, ev, err

  def display_model_info(info: dict):
      st.sidebar.markdown("### Model Info")
+     st.sidebar.write(f"• **Model:** {info['model']}")
+     st.sidebar.write(f"• **Quantized:** {info['quantized']}")
+     st.sidebar.write(f"• **Device:** {info['device']}")
+     st.sidebar.write(f"• **Default tgt:** {info['default_tgt']}")

+ def show_diff(ref: str, hyp: str):
      differ = difflib.HtmlDiff(tabsize=4, wrapcolumn=60)
      html = differ.make_table(
          ref.split(), hyp.split(),

      )
      components.html(html, height=200, scrolling=True)

  def main():
+     st.set_page_config(page_title="🔀 Translate→Eval+", layout="wide")
+     st.title("🌐 Translate → 🔠 Evaluate & Analyze")
+     st.write("Translate from any language, choose target, eval with advanced metrics, and inspect errors.")

+     # Sidebar
      with st.sidebar:
          st.header("Settings")
+         mgr, ev, err = load_resources()
          info = mgr.get_info()
          display_model_info(info)

          tgt = st.selectbox(
+             "Target language", info["langs"],
+             index=info["langs"].index(info["default_tgt"])
          )
+
+         metric_opts = [
+             "BLEU_doc","BLEU_seg","ChrF","TER",
+             "BERTScore","BERTurk","COMET","QE",
+             "CI_BLEU_doc","CI_BERTScore","CI_COMET"
+         ]
+         metrics = st.multiselect("Metrics & CIs", metric_opts, default=["BLEU_doc","BERTScore","COMET"])
          batch_size = st.slider("Batch size", 1, 32, 8)

+     tab1, tab2 = st.tabs(["Single","Batch CSV"])

+     # ────────── Single Sentence ──────────
      with tab1:
+         src = st.text_area("Source text:", height=120)
+         ref = st.text_area("Gold reference (optional):", height=80)
          if st.button("Translate & Eval"):
+             with st.spinner("⏳ Translating…"):
+                 out = mgr.translate(src, tgt_lang=tgt)
+                 hyp = out[0]["translation_text"]
+             st.markdown(f"**Hypothesis ({tgt}):** {hyp}")
+
+             # metrics
+             scores = ev.compute_metrics([src], [ref or ""], [hyp], metrics)
+             # display
+             sd = {}
+             for m in metrics:
+                 v = scores.get(m)
+                 if m.startswith("CI_"):
+                     low, high = v
+                     sd[m] = f"{low:.3f} – {high:.3f}"
+                 else:
+                     sd[m] = f"{v:.4f}" if v is not None else "N/A"
              st.markdown("### Scores")
+             st.table(pd.DataFrame([sd]))
+
              # diff
              if ref.strip():
+                 st.markdown("### Diff View")
                  show_diff(ref, hyp)

+             # error categories
+             cats = err.categorize(src, hyp)
+             if cats:
+                 st.markdown("### Error Categories")
+                 st.json(cats)
+
+     # ────────── Batch CSV ──────────
      with tab2:
          uploaded = st.file_uploader("Upload CSV with `src`,`ref_tr`", type=["csv"])
          if uploaded:
              df = pd.read_csv(uploaded)
+             if not {"src","ref_tr"}.issubset(df.columns):
+                 st.error("CSV must have `src` and `ref_tr` columns.")
              else:
+                 with st.spinner("⏳ Batch processing…"):
+                     all_rows = []
                      prog = st.progress(0)
+                     N = len(df)
+                     for i in range(0, N, batch_size):
                          batch = df.iloc[i : i+batch_size]
                          srcs, refs = batch["src"].tolist(), batch["ref_tr"].tolist()
                          outs = mgr.translate(srcs, tgt_lang=tgt)
                          hyps = [o["translation_text"] for o in outs]
                          for s, r, h in zip(srcs, refs, hyps):
+                             base = {"src":s, "ref_tr":r, "hyp_tr":h}
                              if r.strip():
+                                 sc = ev.compute_metrics([s], [r], [h], metrics)
+                                 for m in metrics:
+                                     if m.startswith("CI_"):
+                                         low, high = sc[m]
+                                         base[m] = f"{low:.3f}–{high:.3f}"
+                                     else:
+                                         base[m] = sc[m]
                              else:
+                                 for m in metrics:
+                                     base[m] = None
+                             all_rows.append(base)
+                         prog.progress(min(i+batch_size, N)/N)
+                 res_df = pd.DataFrame(all_rows)
+
+                 st.markdown("### Results")
                  st.dataframe(res_df, use_container_width=True)
+
+                 # histograms
                  for m in metrics:
+                     st.markdown(f"#### {m} Distribution")
+                     col = pd.to_numeric(res_df[m], errors="coerce").dropna()
                      if col.empty:
+                         st.write("No valid data for this metric.")
                      else:
+                         fig = px.histogram(x=col)
                          st.plotly_chart(fig, use_container_width=True)
+
                  st.download_button("Download CSV", res_df.to_csv(index=False), "results.csv")

  if __name__ == "__main__":