|
|
|
|
|
|
|
|
|
|
|
import re |
|
import gradio as gr |
|
import tempfile |
|
|
|
|
|
# Replacement table. basic_replacements() walks it in insertion order, so the
# two-character "\x04 + mark" keys are listed before the bare "\x04" key.
mappings = {
    # Control character \x04 followed by a mark/letter: keep only the mark.
    "\x04ٲ": "ٲ",
    "\x04ُ": "ُ",
    "\x04ٚ": "ٚ",
    "\x04ٕ": "ٕ",
    "\x04ٛ": "ٛ",
    "\x04ٔ": "ٔ",
    "\x04ں": "ں",
    # Any \x04 still standing on its own is dropped.
    "\x04": "",
    # ASCII symbols that stand in for Arabic-script characters.
    ">": "ٲ",
    "<": "ُ",
    ";": "ٚ",
    "=": "ٕ",
    ":": "ٛ",
    ".": "ٔ",
    ",": "ں",
    # A slash is simply removed.
    "/": "",
}
|
|
|
def basic_replacements(text: str) -> str:
    r"""
    Run every entry of the module-level ``mappings`` table over ``text``.

    Each key (a \x04 combination or an ASCII symbol) is substituted with its
    mapped value via ``str.replace``, one key after another in the table's
    own order.
    """
    result = text
    for needle in mappings:
        result = result.replace(needle, mappings[needle])
    return result
|
|
|
|
|
def fix_alif_combo(text: str) -> str:
    """Collapse every 'اٲ' pair in ``text`` into a single 'ٲ'."""
    doubled = "اٲ"
    replacement = "ٲ"
    # Splitting on the pair and re-joining with the replacement is the same
    # left-to-right, non-overlapping substitution as str.replace.
    return replacement.join(text.split(doubled))
|
|
|
|
|
# A '?' with a (non-newline) character on each side. The left neighbour is
# only inspected through a lookbehind, not consumed, so chained pairs that
# share a character (e.g. "a?b?c") are all rewritten instead of every other one.
_QUESTION_MARK_RE = re.compile(r"(?<=.)\?(.)")


def fix_question_mark(text: str) -> str:
    """
    For each occurrence of (.)?(.) => remove '?', add "یٕ" after the 2nd letter.

    E.g. "س?ت" => "ستیٕ".

    A '?' that has no character before it or after it on the same line
    (start/end of the text, or next to a newline) is left untouched.

    Args:
        text: The text to fix.

    Returns:
        The text with every qualifying '?' removed and the suffix inserted
        after the character that followed it.
    """
    suffix = "یٕ"
    return _QUESTION_MARK_RE.sub(lambda m: m.group(1) + suffix, text)
|
|
|
|
|
def clean_line(line: str) -> str:
    r"""
    Clean one line by chaining the three fix-up steps, innermost first:

      1) basic_replacements  (\x04 combos, ASCII symbols)
      2) fix_alif_combo      (اٲ -> ٲ)
      3) fix_question_mark   (س?ت -> ستیٕ)
    """
    return fix_question_mark(fix_alif_combo(basic_replacements(line)))
|
|
|
|
|
def clean_text(input_text: str) -> str:
    """
    Clean a whole text line by line.

    The input is split with ``str.splitlines()``, each line goes through
    ``clean_line``, and the results are joined back with "\\n".
    """
    return "\n".join(map(clean_line, input_text.splitlines()))
|
|
|
|
|
|
|
|
|
|
|
def process_text(raw_text):
    """
    This function is called by Gradio when the user clicks the button.

    Args:
        raw_text: The content of the input Textbox. ``None`` is treated as
            an empty string.

    Returns:
        A tuple of two outputs:
          1) The cleaned text (for display)
          2) A temporary file path with the cleaned text (for download)
    """
    # Gradio documents the Textbox value as `str | None`; guard so that a
    # missing value does not crash on .splitlines() inside clean_text.
    cleaned = clean_text(raw_text or "")

    # delete=False: the file has to outlive this function because its path is
    # returned as the download output. The context manager guarantees the
    # handle is flushed and closed even if the write fails.
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".txt", mode="w", encoding="utf-8"
    ) as tmp:
        tmp.write(cleaned)

    return cleaned, tmp.name
|
|
|
|
|
|
|
# Build the UI: input and output boxes side by side, then the action button
# and the download slot.
with gr.Blocks() as demo:
    gr.Markdown("## Clean Text Tool")
    gr.Markdown(
        "Paste your raw/unprocessed text below, then click 'Clean Text' to get the cleaned result."
    )

    with gr.Row():
        # Left column: raw text pasted by the user.
        with gr.Column():
            raw_text = gr.Textbox(
                label="Input (Paste uncleaned text)",
                lines=15,
                placeholder="Paste any length of text here...",
            )
        # Right column: read-only cleaned result.
        with gr.Column():
            cleaned_output = gr.Textbox(
                label="Output (Cleaned text)",
                lines=15,
                interactive=False,
            )

    button = gr.Button("Clean Text")

    download_file = gr.File(label="Download Cleaned .txt File")

    # process_text returns (cleaned text, temp-file path), matching the two
    # output components in order.
    button.click(
        fn=process_text,
        inputs=raw_text,
        outputs=[cleaned_output, download_file],
    )


# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()
|
|