import json
from pathlib import Path
from xml.dom import minidom
from xml.etree.ElementTree import Element, SubElement, tostring


class GlossaryChecker:
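    """Check Tibetan source texts and their translations against a glossary.

    Terms are matched in the source text by a greedy longest-match over
    tsheg-delimited syllables; for each matched term, the expected translations
    from every semantic category are looked up in the translation text.
    """
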
    def __init__(self, glossary_path):
        self.glossary = self._load_glossary(glossary_path)
        self._build_term_mappings()

    def _load_glossary(self, path):
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)

    def _normalize_tibetan_term(self, text):
        """Normalize Tibetan text by removing common punctuation."""
        text = text.replace("།", "")
        if text.endswith("་"):
            text = text[:-1]
        return text

    def get_tibetan_syllables(self, text):
        """Split Tibetan text into syllables."""
        text = text.replace("།", "")
        syllables = []
        for chunk in text.split():
            chunk = chunk.strip()
            # Skip empty strings produced by trailing or doubled tsheg marks.
            syllables.extend(s for s in chunk.split("་") if s)
        return syllables

    def _build_term_mappings(self):
        """Build mappings for terms, including their semantic categories and definitions."""
        self.term_info = {}  # Store complete term information
        self.terms = set()  # Normalized terms for matching

        for term, data in self.glossary.items():
            normalized_term = self._normalize_tibetan_term(term)
            self.terms.add(normalized_term)

            # Initialize term info with original form
            self.term_info[normalized_term] = {"original_term": term, "categories": {}}

            # Store data by semantic category
            for category, cat_data in data.items():
                if isinstance(cat_data, dict):
                    self.term_info[normalized_term]["categories"][category] = {
                        "translations": cat_data.get("translations", []),
                        "definitions": cat_data.get("definitions", []),
                    }

    def extract_terms(self, text):
        """Extract terms based on Tibetan syllable matching."""
        text_syllables = self.get_tibetan_syllables(text)
        found_terms = []

        i = 0
        while i < len(text_syllables):
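            # Greedy longest match: try the longest syllable span starting at i
            # first, then progressively shorter spans.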
            longest_match = None
            for j in range(len(text_syllables), i, -1):
                possible_term = "་".join(text_syllables[i:j])
                if possible_term in self.terms:
                    longest_match = possible_term
                    break

            if longest_match:
                found_terms.append(longest_match)
                i += len(longest_match.split("་"))
            else:
                i += 1

        return found_terms

    def check(self, source_text, translation_text):
        """Check source text and translation against the glossary with category information."""
        results = []
        found_terms = self.extract_terms(source_text)

        for term in found_terms:
            term_data = self.term_info[term]

            result = {
                "source_term": term_data["original_term"],
                "normalized_term": term,
                "categories": {},
                "found_in_source": True,
                "found_in_translation": False,
            }

            # Check translations for each semantic category
            for category, cat_data in term_data["categories"].items():
                result["categories"][category] = {
                    "translations": cat_data["translations"],
                    "definitions": cat_data["definitions"],
                    "translation_found": False,
                }

                # Check whether any expected rendering appears in the translation
                # (case-insensitive substring match).
                for trans in cat_data["translations"]:
                    if trans.lower() in translation_text.lower():
                        result["categories"][category]["translation_found"] = True
                        result["found_in_translation"] = True
                        break

            results.append(result)

        return results

    def results_to_xml(self, results, source_text, translation_text, pretty_print=True):
        """Convert checker results to XML format.

        Args:
            results: List of result dictionaries from check()
            source_text: Original source text that was checked
            translation_text: Translation text that was checked
            pretty_print: Whether to format the XML with proper indentation

        Returns:
            str: XML string representation of the results
        """
        # Create root element
        root = Element("glossary_check")

        # Add text information
        texts = SubElement(root, "texts")
        source = SubElement(texts, "source")
        source.text = source_text
        translation = SubElement(texts, "translation")
        translation.text = translation_text

        # Add found terms
        terms = SubElement(root, "terms")

        for result in results:
            term = SubElement(terms, "term")

            # Add term information
            source_term = SubElement(term, "source_term")
            source_term.text = result["source_term"]

            norm_term = SubElement(term, "normalized_term")
            norm_term.text = result["normalized_term"]

            found_status = SubElement(term, "found_status")
            SubElement(found_status, "in_source").text = str(result["found_in_source"])
            SubElement(found_status, "in_translation").text = str(
                result["found_in_translation"]
            )

            # Add categories
            categories = SubElement(term, "categories")
            for cat_name, cat_data in result["categories"].items():
                category = SubElement(categories, "category")
                category.set("type", cat_name)

                # Add translations
                translations = SubElement(category, "translations")
                translations.set("found", str(cat_data["translation_found"]))
                for trans in cat_data["translations"]:
                    trans_elem = SubElement(translations, "translation")
                    trans_elem.text = trans

                # Add definitions
                definitions = SubElement(category, "definitions")
                for defn in cat_data["definitions"]:
                    defn_elem = SubElement(definitions, "definition")
                    defn_elem.text = defn

        # Convert to string with pretty printing if requested
        if pretty_print:
            xml_str = minidom.parseString(
                tostring(root, encoding="unicode")
            ).toprettyxml(indent="  ")
            # Remove empty lines from pretty printed output
            xml_str = "\n".join([line for line in xml_str.split("\n") if line.strip()])
            return xml_str

        return tostring(root, encoding="unicode")


# Example usage:
if __name__ == "__main__":
    glossary_path = Path(__file__).parent / "data" / "84000_glossary.json"
    checker = GlossaryChecker(glossary_path)

    source = "བདེ་གཤེགས་ཆོས་ཀྱི་སྐུ་མངའ་སྲས་བཅས་དང་། །ཕྱག་འོས་ཀུན་ལའང་གུས་པར་ཕྱག་འཚལ་ཏེ། །བདེ་གཤེགས་སྲས་ཀྱི་སྡོམ་ལ་འཇུག་པ་ནི། །ལུང་བཞིན་མདོར་བསྡུས་ནས་ནི་བརྗོད་པར་བྱ། །"
    translation = "I prostrate with respect to the sugatas, Who have the dharmakaya, and their offspring, And also to all worthy of veneration. I'll teach in brief, according to the scriptures, The way to enter the bodhisattva's vows."

    # Get check results
    results = checker.check(source, translation)

    # Convert to XML and print
    xml_output = checker.results_to_xml(results, source, translation)
    print(xml_output)