import json
from pathlib import Path
from xml.dom import minidom
from xml.etree.ElementTree import Element, SubElement, tostring


class GlossaryChecker:
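    """Check Tibetan source texts and their translations against a glossary.

    Terms are matched in the source text by a greedy longest-match over
    tsheg-delimited syllables; for each matched term, the expected translations
    from every semantic category are looked up in the translation text.
    """
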
    def __init__(self, glossary_path):
        self.glossary = self._load_glossary(glossary_path)
        self._build_term_mappings()

    def _load_glossary(self, path):
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)

    def _normalize_tibetan_term(self, text):
        """Normalize Tibetan text by removing common punctuation."""
        text = text.replace("།", "")
        if text.endswith("་"):
            text = text[:-1]
        return text

    def get_tibetan_syllables(self, text):
        """Split Tibetan text into syllables."""
        text = text.replace("།", "")
        syllables = []
        for chunk in text.split():
            chunk = chunk.strip()
            # Skip empty strings produced by trailing or doubled tsheg marks.
            syllables.extend(s for s in chunk.split("་") if s)
        return syllables

    def _build_term_mappings(self):
        """Build mappings for terms, including their semantic categories and definitions."""
        self.term_info = {}  # Store complete term information
        self.terms = set()  # Normalized terms for matching

        for term, data in self.glossary.items():
            normalized_term = self._normalize_tibetan_term(term)
            self.terms.add(normalized_term)

            # Initialize term info with original form
            self.term_info[normalized_term] = {"original_term": term, "categories": {}}

            # Store data by semantic category
            for category, cat_data in data.items():
                if isinstance(cat_data, dict):
                    self.term_info[normalized_term]["categories"][category] = {
                        "translations": cat_data.get("translations", []),
                        "definitions": cat_data.get("definitions", []),
                    }

    def extract_terms(self, text):
        """Extract terms based on Tibetan syllable matching."""
        text_syllables = self.get_tibetan_syllables(text)
        found_terms = []

        i = 0
        while i < len(text_syllables):
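            # Greedy longest match: try the longest syllable span starting at i
            # first, then progressively shorter spans.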
            longest_match = None
            for j in range(len(text_syllables), i, -1):
                possible_term = "་".join(text_syllables[i:j])
                if possible_term in self.terms:
                    longest_match = possible_term
                    break

            if longest_match:
                found_terms.append(longest_match)
                i += len(longest_match.split("་"))
            else:
                i += 1

        return found_terms

    def check(self, source_text, translation_text):
        """Check source text and translation against the glossary with category information."""
        results = []
        found_terms = self.extract_terms(source_text)

        for term in found_terms:
            term_data = self.term_info[term]

            result = {
                "source_term": term_data["original_term"],
                "normalized_term": term,
                "categories": {},
                "found_in_source": True,
                "found_in_translation": False,
            }

            # Check translations for each semantic category
            for category, cat_data in term_data["categories"].items():
                result["categories"][category] = {
                    "translations": cat_data["translations"],
                    "definitions": cat_data["definitions"],
                    "translation_found": False,
                }

                # Check whether any expected rendering appears in the translation
                # (case-insensitive substring match).
                for trans in cat_data["translations"]:
                    if trans.lower() in translation_text.lower():
                        result["categories"][category]["translation_found"] = True
                        result["found_in_translation"] = True
                        break

            results.append(result)

        return results

    def results_to_xml(self, results, source_text, translation_text, pretty_print=True):
        """Convert checker results to XML format.

        Args:
            results: List of result dictionaries from check()
            source_text: Original source text that was checked
            translation_text: Translation text that was checked
            pretty_print: Whether to format the XML with proper indentation

        Returns:
            str: XML string representation of the results
        """
        # Create root element
        root = Element("glossary_check")

        # Add text information
        texts = SubElement(root, "texts")
        source = SubElement(texts, "source")
        source.text = source_text
        translation = SubElement(texts, "translation")
        translation.text = translation_text

        # Add found terms
        terms = SubElement(root, "terms")

        for result in results:
            term = SubElement(terms, "term")

            # Add term information
            source_term = SubElement(term, "source_term")
            source_term.text = result["source_term"]

            norm_term = SubElement(term, "normalized_term")
            norm_term.text = result["normalized_term"]

            found_status = SubElement(term, "found_status")
            SubElement(found_status, "in_source").text = str(result["found_in_source"])
            SubElement(found_status, "in_translation").text = str(
                result["found_in_translation"]
            )

            # Add categories
            categories = SubElement(term, "categories")
            for cat_name, cat_data in result["categories"].items():
                category = SubElement(categories, "category")
                category.set("type", cat_name)

                # Add translations
                translations = SubElement(category, "translations")
                translations.set("found", str(cat_data["translation_found"]))
                for trans in cat_data["translations"]:
                    trans_elem = SubElement(translations, "translation")
                    trans_elem.text = trans

                # Add definitions
                definitions = SubElement(category, "definitions")
                for defn in cat_data["definitions"]:
                    defn_elem = SubElement(definitions, "definition")
                    defn_elem.text = defn

        # Convert to string with pretty printing if requested
        if pretty_print:
            xml_str = minidom.parseString(
                tostring(root, encoding="unicode")
            ).toprettyxml(indent="  ")
            # Remove empty lines from pretty printed output
            xml_str = "\n".join([line for line in xml_str.split("\n") if line.strip()])
            return xml_str

        return tostring(root, encoding="unicode")


# Example usage:
if __name__ == "__main__":
    glossary_path = Path(__file__).parent / "data" / "84000_glossary.json"
    checker = GlossaryChecker(glossary_path)

    source = "བདེ་གཤེགས་ཆོས་ཀྱི་སྐུ་མངའ་སྲས་བཅས་དང་། །ཕྱག་འོས་ཀུན་ལའང་གུས་པར་ཕྱག་འཚལ་ཏེ། །བདེ་གཤེགས་སྲས་ཀྱི་སྡོམ་ལ་འཇུག་པ་ནི། །ལུང་བཞིན་མདོར་བསྡུས་ནས་ནི་བརྗོད་པར་བྱ། །"
    translation = "I prostrate with respect to the sugatas, Who have the dharmakaya, and their offspring, And also to all worthy of veneration. I'll teach in brief, according to the scriptures, The way to enter the bodhisattva's vows."

    # Get check results
    results = checker.check(source, translation)

    # Convert to XML and print
    xml_output = checker.results_to_xml(results, source, translation)
    print(xml_output)