import os
import time
from pathlib import Path

import gradio as gr

from glossary_checker import GlossaryChecker
from llm_post_editor import LLMTranslationEditor
from trans_validator import TranslationValidator

GLOSSARIES = {
    "84000 Glossary": "data/84000_glossary.json",
}
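# GLOSSARIES maps dropdown display names to glossary JSON files parsed by
# GlossaryChecker; add entries here to register more glossaries (paths are
# relative to the app root).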


def load_validate_and_edit(file_obj, selected_glossary, api_key, progress=gr.Progress()):
    """Validate an aligned translation file against a glossary, post-edit it
    with Claude, and return the results as a Markdown report."""
    if not api_key or not api_key.startswith("sk-"):
        return "Please provide a valid Anthropic API key (it starts with 'sk-')."
    if file_obj is None:
        return "Please upload an aligned translations file."

    # Defined before the try block so the error handler can always clean it up.
    temp_path = "temp_aligned.txt"
    try:
        progress(0, desc="Starting processing...")

        # gr.File with type="binary" delivers the upload as raw bytes.
        content = file_obj.decode('utf-8')
        progress(0.1, desc="File loaded")

        # The validator reads from a path, so persist the upload to a temp file.
        with open(temp_path, "w", encoding='utf-8') as f:
            f.write(content)

        total_lines = len([line for line in content.split('\n') if line.strip()])
        progress(0.15, desc=f"Found {total_lines} lines to process")
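
        # Stage 1: glossary validation. TranslationValidator is assumed to
        # return one result dict per aligned line (shape inferred from the
        # report-building code below).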
        progress(0.2, desc="Initializing validation...")
        glossary_path = GLOSSARIES[selected_glossary]
        checker = GlossaryChecker(glossary_path)
        validator = TranslationValidator(checker, api_key)

        progress(0.3, desc="Running validation...")
        validation_results = validator.validate_translation(temp_path)
        progress(0.6, desc="Validation complete")
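
        # Stage 2: LLM post-editing. Each edited entry is expected to expose
        # 'modified', 'edited', and 'reasoning' keys (inferred from usage below).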
        progress(0.7, desc="Starting post-editing...")
        editor = LLMTranslationEditor({"lines": validation_results}, api_key)
        edited_translations = editor.post_edit_translations()
        progress(0.9, desc="Post-editing complete")
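
        # Stage 3: render the report. The output is one Markdown string; the
        # collapsible sections below rely on inline <details> HTML passing
        # through the Markdown renderer.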
        progress(0.95, desc="Generating report...")
        markdown_output = []

        # Guard against an empty file so the average does not divide by zero.
        total_score = (
            sum(r['score'] for r in validation_results) / len(validation_results)
            if validation_results else 0.0
        )
        markdown_output.append("# Validation Results\n")
        markdown_output.append(f"**Overall Score**: {total_score:.2f}%\n")
        markdown_output.append("*(Score based on terms counted in scoring)*\n\n")
        markdown_output.append(f"**Total Lines**: {len(validation_results)}\n\n")

        modified_lines = sum(1 for t in edited_translations if t['modified'])
        markdown_output.append("## Processing Statistics\n")
        markdown_output.append(f"- Lines Modified: {modified_lines}/{len(validation_results)}\n")
        markdown_output.append(f"- Processed at: {time.strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        for validation, editing in zip(validation_results, edited_translations):
            markdown_output.append(f"## Line {validation['line_number']}\n")
            markdown_output.append(f"**Score**: {validation['score']:.2f}%\n")
            markdown_output.append(f"**Source**: {validation['source']}\n")
            markdown_output.append(f"**Current Translation**: {validation['target']}\n")

            if editing['modified']:
                markdown_output.append(f"\n**Post-Edited Translation**: {editing['edited']}\n")
                markdown_output.append(f"\n**Editing Notes**: {editing['reasoning']}\n")

            if validation['terms']:
                # Split terms into those that count toward the score and the rest.
                counted_terms = []
                other_terms = []

                for term in validation['terms']:
                    if term['analysis']['translation_assessment']['should_be_counted']:
                        counted_terms.append(term)
                    else:
                        other_terms.append(term)

                if counted_terms:
                    markdown_output.append("\n<details>")
                    markdown_output.append("<summary>📊 Terms Counted in Scoring</summary>\n")

                    for term in counted_terms:
                        analysis = term['analysis']
                        assessment = analysis['translation_assessment']

                        markdown_output.append(f"\n#### `{term['source_term']}` {'✅' if assessment['translated_correctly'] else '❌'}\n")
                        markdown_output.append(f"- Found Translation: **{analysis['translated_as']}**\n")
                        markdown_output.append(f"- Expected Translation: **{analysis['glossary_translation']}**\n")

                        markdown_output.append("\n<details>")
                        markdown_output.append("<summary>Show Categories & Definitions</summary>\n")

                        for cat_name in analysis['matching_categories']:
                            cat_data = term['categories'].get(cat_name, {})
                            markdown_output.append(f"\n*{cat_name}*:\n")
                            if 'translations' in cat_data:
                                markdown_output.append(f"- Translations: {', '.join(cat_data['translations'])}\n")
                            if 'definitions' in cat_data:
                                markdown_output.append(f"- Definitions: {', '.join(cat_data['definitions'])}\n")

                        markdown_output.append("</details>\n")

                    markdown_output.append("</details>\n")
                if other_terms:
                    markdown_output.append("\n<details>")
                    markdown_output.append("<summary>Terms Not Counted in Scoring</summary>\n")

                    for term in other_terms:
                        analysis = term['analysis']
                        markdown_output.append(f"\n#### `{term['source_term']}`\n")
                        markdown_output.append(f"- Found Translation: {analysis['translated_as']}\n")
                        markdown_output.append("- Note: Term not counted due to usage context\n")

                        markdown_output.append("\n<details>")
                        markdown_output.append("<summary>Show Categories & Definitions</summary>\n")

                        for cat_name in analysis['matching_categories']:
                            cat_data = term['categories'].get(cat_name, {})
                            markdown_output.append(f"\n*{cat_name}*:\n")
                            if 'translations' in cat_data:
                                markdown_output.append(f"- Translations: {', '.join(cat_data['translations'])}\n")
                            if 'definitions' in cat_data:
                                markdown_output.append(f"- Definitions: {', '.join(cat_data['definitions'])}\n")

                        markdown_output.append("</details>\n")

                    markdown_output.append("</details>\n")

                markdown_output.append("\n---\n")
            else:
                markdown_output.append("\n*No glossary terms found in this line*\n\n---\n")

        os.remove(temp_path)
        progress(1.0, desc="Processing complete!")

        return "\n".join(markdown_output)

    except Exception as e:
        # Clean up the temp file before surfacing the error to the UI.
        if os.path.exists(temp_path):
            os.remove(temp_path)
        return f"Error during processing: {e}\n\nPlease check your input file and API key and try again."
with gr.Blocks() as demo:
    gr.Markdown("# Translation Validation & Editing Tool")

    with gr.Row():
        with gr.Column():
            # Expected upload: UTF-8 text with one aligned pair per line, the
            # Tibetan source and its English translation separated by a tab.
            file_input = gr.File(
                label="Upload aligned translations file (tab-separated)",
                type="binary"
            )
            glossary_input = gr.Dropdown(
                choices=list(GLOSSARIES.keys()),
                label="Select Glossary",
                value=list(GLOSSARIES.keys())[0]
            )
            api_key_input = gr.Textbox(
                label="Anthropic API Key",
                placeholder="sk-...",
                type="password"
            )
            submit_btn = gr.Button("Process Translations", variant="primary")

            gr.Examples(
                examples=[
                    [str(Path("data/example_translations.txt").resolve()), "84000 Glossary", "sk-..."],
                ],
                inputs=[file_input, glossary_input, api_key_input],
                label="Example Inputs"
            )

        with gr.Column():
            output = gr.Markdown()

    gr.Markdown("""### Instructions
1. Upload a tab-separated file with Tibetan source and English translations
2. Select the glossary to use for validation
3. Enter your Anthropic API key
4. Click "Process Translations" and wait for results

The tool will:
- Validate translations against the glossary
- Calculate accuracy scores
- Suggest improvements using Claude
- Show detailed term analysis

Key:
- 📊 Terms used for scoring
- ✅ Correctly translated terms
- ❌ Terms needing improvement""")
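
    # Wire the button to the processing pipeline; the returned Markdown string
    # renders in the right-hand panel.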
    submit_btn.click(
        fn=load_validate_and_edit,
        inputs=[file_input, glossary_input, api_key_input],
        outputs=output
    )


if __name__ == "__main__":
    demo.launch()