Spaces:

nielklug
/

enhg-parsing

Sleeping

App Files Files Community

nielklug committed on Jun 17, 2024

Commit

920b22f

1 Parent(s): 9b6ca8f

add tag

Browse files

Files changed (5) hide show

__pycache__/annotate.cpython-38.pyc +0 -0
__pycache__/parse.cpython-38.pyc +0 -0
annotate.py +41 -0
app.py +49 -0
requirements.txt +9 -0

__pycache__/annotate.cpython-38.pyc ADDED Viewed

Binary file (1.34 kB). View file

__pycache__/parse.cpython-38.pyc ADDED Viewed

Binary file (817 Bytes). View file

annotate.py ADDED Viewed

	@@ -0,0 +1,41 @@

+import sys
+import argparse
+import torch
+from transformers import AutoTokenizer
+from transformers import AutoModelForTokenClassification
+def print_sentence(sentences, inputs, logits, model):
+    """Collect the best tag for every word of every sentence in a batch.
+
+    Despite its name, this function prints nothing; it returns its results.
+
+    Args:
+        sentences: Batch of sentences, each an indexable sequence of words.
+        inputs: Tokenizer output exposing ``word_ids(batch_index=i)``, which
+            maps every token position of sentence ``i`` to a word index, or
+            to ``None`` when the position belongs to no word.
+        logits: Tensor indexed as ``[sentence, token, tag]``.
+        model: Object whose ``config.id2label`` maps tag ids to label strings.
+
+    Returns:
+        A tuple ``(words, tags, prob_out)`` of three parallel flat lists
+        covering all sentences in order: the word, its highest-scoring label,
+        and that label's softmax probability as a Python float.
+    """
+    words, tags, prob_out = [], [], []
+    # Only the top-scoring tag is needed, so take the maximum over the tag
+    # axis once instead of sorting every token's full distribution.
+    best_probs, best_ids = logits.softmax(dim=2).max(dim=2)
+    id2label = model.config.id2label
+    for i, sentence in enumerate(sentences):
+        previous_word_idx = None
+        for k, word_idx in enumerate(inputs.word_ids(batch_index=i)):
+            # Skip positions without a word and every sub-token after the
+            # first one of a word: a word is labelled by its first token only.
+            if word_idx is None or word_idx == previous_word_idx:
+                continue
+            previous_word_idx = word_idx
+            words.append(sentence[word_idx])
+            tags.append(id2label[best_ids[i][k].item()])
+            prob_out.append(best_probs[i][k].item())
+    return words, tags, prob_out
+_TAGGER_NAME = "nielklug/enhg_tagger"
+# Loaded (tokenizer, model) pairs keyed by device string, so that repeated
+# calls in the same process do not reload the checkpoint every time.
+_TAGGER_CACHE = {}
+def _load_tagger(device):
+    """Return the (tokenizer, model) pair for ``device``, loading it at most once."""
+    key = str(device)
+    if key not in _TAGGER_CACHE:
+        tokenizer = AutoTokenizer.from_pretrained(_TAGGER_NAME)
+        model = AutoModelForTokenClassification.from_pretrained(_TAGGER_NAME)
+        _TAGGER_CACHE[key] = (tokenizer, model.to(device).eval())
+    return _TAGGER_CACHE[key]
+def tag_text(text):
+    """Tag a pre-tokenized text that holds one token per line.
+
+    Args:
+        text: String with tokens separated by ``'\n'``. Empty and
+            whitespace-only lines are ignored, so blank lines and a trailing
+            newline are harmless.
+
+    Returns:
+        The ``(words, tags, probabilities)`` tuple produced by
+        ``print_sentence`` for the whole text treated as a single sentence,
+        or three empty lists when the text contains no token.
+    """
+    words = [word for word in text.split('\n') if word.strip()]
+    if not words:
+        # Nothing to tag; avoid loading the model and calling the tokenizer
+        # with an empty word list.
+        return [], [], []
+    # Inference is pinned to the CPU. To use a GPU when available:
+    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    device = torch.device("cpu")
+    tokenizer, model = _load_tagger(device)
+    with torch.no_grad():
+        inputs = tokenizer(words, is_split_into_words=True, return_tensors="pt")
+        logits = model(**inputs.to(device)).logits
+    return print_sentence([words], inputs, logits, model)

app.py ADDED Viewed

	@@ -0,0 +1,49 @@

+import streamlit as st
+from parse import parse_text
+from nltk import Tree
+import pandas as pd
+import re
+from nltk.tree.prettyprinter import TreePrettyPrinter
+from annotate import tag_text
+# Page header: title, input box and two sample sentences.
+st.title("ENHG parsing system (demo)")
+# The label is assembled from adjacent literals so that the indentation of
+# this source file does not leak into the text shown to the user.
+text = st.text_area(
+    "This is a simple demo of an Early New High German (ENHG) tagging and "
+    "parsing system based on neural network models.\n\n"
+    "Enter some ENHG text below!"
+)
+st.text("""Example ENHG sentences:
+1. Im anfang war das Wort / Vnd das Wort war bey Gott / vnd Gott war das Wort.
+2. Darinn ain treffenliche statt, genannt Famagosta, in wölicher stat ain edler purger altz herkommens was geseßsen.""")
+def process_text(text):
+    """Split raw text into one token per line.
+
+    Punctuation marks, quotes and round brackets are detached from the
+    neighbouring characters, every input line is terminated by a newline,
+    and each remaining run of spaces becomes a newline.
+
+    Args:
+        text: Raw input text.
+
+    Returns:
+        The text with tokens separated by ``'\n'``. The result ends with a
+        newline, and blank lines can occur between input lines.
+    """
+    # Put a space after every opening quote/bracket that is directly followed
+    # by another character. The lookahead leaves that character unconsumed,
+    # so adjacent marks such as `("` are each handled.
+    text = re.sub(r'(["(])(?=\S)', r'\1 ', text)
+    # Put a space before every punctuation mark / closing quote or bracket
+    # that directly follows another character. The lookbehind keeps runs
+    # such as `."` or `?!` from being skipped.
+    text = re.sub(r'(?<=\S)([.,;:?!)"])', r' \1', text)
+    # Replace trailing spaces at each line end with a newline.
+    text = re.sub(r' *$', '\n', text, flags=re.MULTILINE)
+    # Every remaining run of spaces is a token boundary.
+    text = re.sub(r' +', '\n', text)
+    return text
+if text:
+    # Normalise the input to one token per line and tag it; the three
+    # returned lists line up token by token.
+    tokens, tags, probs = tag_text(process_text(text))
+    # One table row per token: the token, its tag and the tag's probability.
+    result_table = pd.DataFrame(
+        list(zip(tokens, tags, probs)),
+        columns=['Token', 'Tag', 'Prob.'],
+    )
+    (result_col,) = st.columns(1)
+    result_col.header("POS tagging result:")
+    result_col.table(result_table)
+    # The parse-tree display below is currently disabled.
+    # # Convert the bracket parse tree into an NLTK Tree
+    # t = Tree.fromstring(re.sub(r'(\.[^ )]+)+', '', parse_tree))
+    # tree_svg = TreePrettyPrinter(t).svg(nodecolor='black', leafcolor='black', funccolor='black')
+    # col2 = st.columns(1)[0]
+    # col2.header("Parsing result:")
+    # col2.write(parse_tree.replace('_', '\_').replace('$', '\$').replace('*', '\*'))
+    # # Display the graph in the Streamlit app
+    # col2.image(tree_svg, use_column_width=True)

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+accelerate==0.26.1
+huggingface-hub==0.20.2
+numpy==1.23.4
+scipy==1.10.1
+tokenizers==0.15.0
+torch==2.1.2
+transformers==4.36.2
+nltk
+torch-struct