File size: 9,647 Bytes
2290099 2ba7d76 2290099 2ba7d76 2290099 2ba7d76 2290099 2ba7d76 2290099 2ba7d76 2290099 2ba7d76 2290099 2ba7d76 2290099 2ba7d76 2290099 2ba7d76 2290099 2ba7d76 2290099 2ba7d76 2290099 2ba7d76 2290099 2ba7d76 2290099 2ba7d76 2290099 2ba7d76 2290099 2ba7d76 2290099 2ba7d76 2290099 2ba7d76 2290099 2ba7d76 2290099 2ba7d76 2290099 2ba7d76 2290099 2ba7d76 2290099 2ba7d76 2290099 2ba7d76 2290099 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 |
import json
import os
import time
from pathlib import Path
import gradio as gr
from glossary_checker import GlossaryChecker
from llm_post_editor import LLMTranslationEditor
from trans_validator import TranslationValidator
# Configure paths
# Maps the human-readable glossary name shown in the UI dropdown to the
# JSON glossary file consumed by GlossaryChecker.
GLOSSARIES = {
    "84000 Glossary": "data/84000_glossary.json",
}
def load_validate_and_edit(file_obj, selected_glossary, api_key, progress=gr.Progress()):
    """Validate and post-edit an aligned translation file, returning a Markdown report.

    Args:
        file_obj: Raw bytes of the uploaded tab-separated file (gr.File type="binary").
        selected_glossary: Key into GLOSSARIES selecting the glossary JSON to use.
        api_key: Anthropic API key; must start with "sk-".
        progress: Gradio progress tracker (injected by the UI).

    Returns:
        A Markdown report string (summary, statistics, per-line details), or a
        human-readable error message string on failure.
    """
    if not api_key or not api_key.startswith("sk-"):
        return "Please provide a valid Anthropic API key (starts with 'sk-')"

    # Bind temp_path BEFORE the try block: the original assigned it inside
    # `try` but referenced it in the exception handler, so any failure before
    # the assignment (e.g. a bad decode) raised NameError instead of
    # returning the friendly error message.
    temp_path = "temp_aligned.txt"
    try:
        progress(0, desc="Starting processing...")

        # Decode the uploaded bytes and persist them to a temp file, since
        # the validator consumes a file path rather than in-memory content.
        content = file_obj.decode('utf-8')
        progress(0.1, desc="File loaded")
        with open(temp_path, "w", encoding='utf-8') as f:
            f.write(content)

        total_lines = sum(1 for line in content.split('\n') if line.strip())
        progress(0.15, desc=f"Found {total_lines} lines to process")

        progress(0.2, desc="Initializing validation...")
        glossary_path = GLOSSARIES[selected_glossary]
        checker = GlossaryChecker(glossary_path)
        validator = TranslationValidator(checker, api_key)

        progress(0.3, desc="Running validation...")
        validation_results = validator.validate_translation(temp_path)
        progress(0.6, desc="Validation complete")

        # Guard against an empty/unparseable file: the summary below divides
        # by len(validation_results).
        if not validation_results:
            return "No processable lines were found in the uploaded file. Please check the file format."

        progress(0.7, desc="Starting post-editing...")
        editor = LLMTranslationEditor({"lines": validation_results}, api_key)
        edited_translations = editor.post_edit_translations()
        progress(0.9, desc="Post-editing complete")

        progress(0.95, desc="Generating report...")
        report = _build_report(validation_results, edited_translations)
        progress(1.0, desc="Processing complete!")
        return report
    except Exception as e:
        return f"Error during processing: {str(e)}\n\nPlease check your input file and API key and try again."
    finally:
        # Clean up the temp file on both success and failure paths.
        if os.path.exists(temp_path):
            os.remove(temp_path)


def _build_report(validation_results, edited_translations):
    """Assemble the full Markdown report from validation and editing results."""
    markdown_output = []

    # Summary header.
    total_score = sum(r['score'] for r in validation_results) / len(validation_results)
    markdown_output.append("# Validation Results\n")
    markdown_output.append(f"**Overall Score**: {total_score:.2f}%\n")
    markdown_output.append("*(Score based on terms counted in scoring)*\n\n")
    markdown_output.append(f"**Total Lines**: {len(validation_results)}\n\n")

    # Processing statistics.
    modified_lines = sum(1 for t in edited_translations if t['modified'])
    markdown_output.append("## Processing Statistics\n")
    markdown_output.append(f"- Lines Modified: {modified_lines}/{len(validation_results)}\n")
    markdown_output.append(f"- Processed at: {time.strftime('%Y-%m-%d %H:%M:%S')}\n\n")

    # Detailed per-line sections.
    for validation, editing in zip(validation_results, edited_translations):
        _append_line_section(markdown_output, validation, editing)

    return "\n".join(markdown_output)


def _append_line_section(markdown_output, validation, editing):
    """Append one line's section: scores, post-edit info, and term details."""
    markdown_output.append(f"## Line {validation['line_number']}\n")
    markdown_output.append(f"**Score**: {validation['score']:.2f}%\n")
    markdown_output.append(f"**Source**: {validation['source']}\n")
    markdown_output.append(f"**Current Translation**: {validation['target']}\n")

    # Show the post-edited translation only when the editor changed the line.
    if editing['modified']:
        markdown_output.append(f"\n**Post-Edited Translation**: {editing['edited']}\n")
        markdown_output.append(f"\n**Editing Notes**: {editing['reasoning']}\n")

    if not validation['terms']:
        markdown_output.append("\n*No glossary terms found in this line*\n\n---\n")
        return

    # Separate terms by whether they participate in the score.
    counted_terms = [
        t for t in validation['terms']
        if t['analysis']['translation_assessment']['should_be_counted']
    ]
    other_terms = [
        t for t in validation['terms']
        if not t['analysis']['translation_assessment']['should_be_counted']
    ]

    # Counted terms in a collapsible section.
    if counted_terms:
        markdown_output.append("\n<details>")
        markdown_output.append("<summary>🔍 Terms Counted in Scoring</summary>\n")
        for term in counted_terms:
            analysis = term['analysis']
            assessment = analysis['translation_assessment']
            # NOTE(review): the pass/fail emoji here were mojibake-garbled in
            # the original source; restored as ✅/❌ — confirm against the
            # intended rendering.
            mark = '✅' if assessment['translated_correctly'] else '❌'
            markdown_output.append(f"\n#### `{term['source_term']}` {mark}\n")
            markdown_output.append(f"- Found Translation: **{analysis['translated_as']}**\n")
            markdown_output.append(f"- Expected Translation: **{analysis['glossary_translation']}**\n")
            _append_category_details(markdown_output, term, analysis)
        markdown_output.append("</details>\n")

    # Non-counted terms in a separate collapsible section.
    if other_terms:
        markdown_output.append("\n<details>")
        markdown_output.append("<summary>Terms Not Counted in Scoring</summary>\n")
        for term in other_terms:
            analysis = term['analysis']
            markdown_output.append(f"\n#### `{term['source_term']}`\n")
            markdown_output.append(f"- Found Translation: {analysis['translated_as']}\n")
            markdown_output.append(f"- Note: Term not counted due to usage context\n")
            _append_category_details(markdown_output, term, analysis)
        markdown_output.append("</details>\n")

    markdown_output.append("\n---\n")


def _append_category_details(markdown_output, term, analysis):
    """Append a collapsible Categories & Definitions block for one term."""
    markdown_output.append("\n<details>")
    markdown_output.append("<summary>Show Categories & Definitions</summary>\n")
    for cat_name in analysis['matching_categories']:
        cat_data = term['categories'].get(cat_name, {})
        markdown_output.append(f"\n*{cat_name}*:\n")
        if 'translations' in cat_data:
            markdown_output.append(f"- Translations: {', '.join(cat_data['translations'])}\n")
        if 'definitions' in cat_data:
            markdown_output.append(f"- Definitions: {', '.join(cat_data['definitions'])}\n")
    markdown_output.append("</details>\n")
# Create Gradio interface with examples
with gr.Blocks() as demo:
    gr.Markdown("# Translation Validation & Editing Tool")

    with gr.Row():
        with gr.Column():
            # Inputs: the file arrives as raw bytes (type="binary") and is
            # decoded inside load_validate_and_edit.
            file_input = gr.File(
                label="Upload aligned translations file (tab-separated)",
                type="binary"
            )
            glossary_input = gr.Dropdown(
                choices=list(GLOSSARIES.keys()),
                label="Select Glossary",
                value=list(GLOSSARIES.keys())[0]
            )
            api_key_input = gr.Textbox(
                label="Anthropic API Key",
                placeholder="sk-...",
                type="password"
            )
            submit_btn = gr.Button("Process Translations", variant="primary")

            # Example row pre-fills the form; "sk-..." is a placeholder, not
            # a working key.
            gr.Examples(
                examples=[
                    [str(Path("data/example_translations.txt").resolve()), "84000 Glossary", "sk-..."],
                ],
                inputs=[file_input, glossary_input, api_key_input],
                label="Example Inputs"
            )

        with gr.Column():
            output = gr.Markdown()
            # NOTE(review): the legend emoji below were mojibake-garbled in the
            # original source; restored as 🔍/✅/❌ to match the report
            # generator — confirm against the intended rendering.
            gr.Markdown("""### Instructions
1. Upload a tab-separated file with Tibetan source and English translations
2. Select the glossary to use for validation
3. Enter your Anthropic API key
4. Click "Process Translations" and wait for results

The tool will:
- Validate translations against the glossary
- Calculate accuracy scores
- Suggest improvements using Claude
- Show detailed term analysis

Key:
- 🔍 Terms used for scoring
- ✅ Correctly translated terms
- ❌ Terms needing improvement""")

    # Wire the button to the processing pipeline; the Markdown report (or an
    # error message string) lands in `output`.
    submit_btn.click(
        fn=load_validate_and_edit,
        inputs=[file_input, glossary_input, api_key_input],
        outputs=output
    )

if __name__ == "__main__":
    demo.launch()