enhg-parsing / app.py
nielklug's picture
update
076c36d
raw
history blame
1.78 kB
import streamlit as st
from parse import parse
from nltk import Tree
import pandas as pd
import re
from nltk.tree.prettyprinter import TreePrettyPrinter
from annotate import tag_text
# Page header and input widget; the text-area label doubles as the intro blurb.
st.title("ENHG parsing system (demo)")
text = st.text_area("""This is a simple demo of an Early New High German (ENHG) tagging and parsing system based on BERT language models.\n\n
Enter some ENHG text below!""")
# Sample sentences the user can copy into the box above.
st.text("""Example ENHG sentences:
1. Im anfang war das Wort / Vnd das Wort war bey Gott / vnd Gott war das Wort.
2. Darinn ain treffenliche statt, genannt Famagosta, in wölicher stat ain edler purger altz herkommens was geseßsen.""")
def process_text(text):
    """Tokenize raw input into one-token-per-line text.

    Punctuation is split off from neighbouring characters, a newline is
    inserted at the end of every input line (so consecutive input lines end
    up separated by an empty line), and each run of spaces becomes a newline.

    Args:
        text: Raw text as entered by the user.

    Returns:
        The text with one token per line.
    """
    # Lookarounds (instead of consuming the neighbouring character) make sure
    # adjacent punctuation marks are each separated: re.sub matches cannot
    # overlap, so a consumed neighbour such as the '"' in 'Wort."' would
    # otherwise stay glued to the preceding mark.
    text = re.sub(r'(["(])(?=\S)', r'\1 ', text)
    text = re.sub(r'(?<=\S)([.,;:?!)"])', r' \1', text)
    # Drop trailing spaces and mark the end of every line with a newline.
    text = re.sub(r' *$', '\n', text, flags=re.MULTILINE)
    # Every remaining run of spaces is a token boundary.
    return re.sub(r' +', '\n', text)
if text:
    # Tag the tokenized input; the three results are zipped row-wise below,
    # so they are expected to be parallel sequences.
    tokens, tags, probs = tag_text(process_text(text))

    # One table row per token: surface form, predicted tag, probability.
    df = pd.DataFrame(
        list(zip(tokens, tags, probs)),
        columns=['Token', 'Tag', 'Prob.'],
    )

    parse_tree = parse(tokens)
    # Rewrite the labels "$(" / "$)" so they contain no literal parentheses
    # (presumably to keep the bracketed tree string unambiguous -- confirm
    # against the parser's label set).
    mod_tree = parse_tree.replace("$(", "$LRB").replace("$)", "$RRB")

    # NOTE: graphical tree rendering is disabled. The previous code built an
    # SVG from a variable `t` that was never assigned (its Tree.fromstring
    # lines were commented out), which raised NameError on every run and
    # prevented any result from being shown. To re-enable it, build
    # `t = Tree.fromstring(mod_tree)` first and render
    # `TreePrettyPrinter(t).svg(...)`.

    col1 = st.columns(1)[0]
    col1.header("POS tagging result:")
    col1.table(df)

    col2 = st.columns(1)[0]
    col2.header("Parsing result:")
    # Backslash-escape characters that Markdown would otherwise interpret.
    markdown_escapes = str.maketrans({'_': '\\_', '$': '\\$', '*': '\\*'})
    col2.write(mod_tree.translate(markdown_escapes))