Spaces:
Sleeping
Sleeping
add tag
Browse files- __pycache__/annotate.cpython-38.pyc +0 -0
- __pycache__/parse.cpython-38.pyc +0 -0
- annotate.py +41 -0
- app.py +49 -0
- requirements.txt +9 -0
__pycache__/annotate.cpython-38.pyc
ADDED
Binary file (1.34 kB). View file
|
|
__pycache__/parse.cpython-38.pyc
ADDED
Binary file (817 Bytes). View file
|
|
annotate.py
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
import argparse
|
3 |
+
import torch
|
4 |
+
from transformers import AutoTokenizer
|
5 |
+
from transformers import AutoModelForTokenClassification
|
6 |
+
|
7 |
+
|
8 |
+
def print_sentence(sentences, inputs, logits, model):
    """Collect the best tag and its probability for each word.

    Despite the name, nothing is printed: the results for all sentences are
    returned as three flat, parallel lists.

    Args:
        sentences: Sequence of sentences, each an indexable sequence of
            words. The sentence at position ``i`` is treated as batch
            entry ``i``.
        inputs: Object exposing ``word_ids(batch_index=i)``, which yields
            for every token position the index of the word it belongs to,
            or ``None`` for positions that belong to no word.
        logits: Tensor indexed as ``[sentence, token, label]``.
        model: Object whose ``config.id2label`` maps a label id to its name.

    Returns:
        A tuple ``(words, tags, probs)`` of equally long lists: the word,
        the name of its highest-scoring label, and that label's softmax
        probability as a Python float.
    """
    words, tags, prob_out = [], [], []
    # A single batched reduction yields the winning label and its
    # probability for every token; sorting the full label distribution per
    # token is unnecessary because only the top entry is ever used.
    best_probs, best_ids = logits.softmax(dim=2).max(dim=2)
    id2label = model.config.id2label
    for i, sentence in enumerate(sentences):
        # Map tokens to their respective word.
        previous_word_idx = None
        for k, word_idx in enumerate(inputs.word_ids(batch_index=i)):
            # Only label the first token of a given word.
            if word_idx is not None and word_idx != previous_word_idx:
                words.append(sentence[word_idx])
                tags.append(id2label[best_ids[i, k].item()])
                prob_out.append(best_probs[i, k].item())
            previous_word_idx = word_idx

    return words, tags, prob_out
|
28 |
+
|
29 |
+
# Loaded (tokenizer, model) pairs keyed by (model name, device string), so
# repeated calls do not reload the weights every time.
_TAGGER_CACHE = {}


def _load_tagger(model_name, device):
    """Return the ``(tokenizer, model)`` pair for *model_name*, loading it on first use."""
    key = (model_name, str(device))
    if key not in _TAGGER_CACHE:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForTokenClassification.from_pretrained(model_name)
        _TAGGER_CACHE[key] = (tokenizer, model.to(device).eval())
    return _TAGGER_CACHE[key]


def tag_text(text, model_name="nielklug/enhg_tagger"):
    """Tag pre-tokenised text with a token-classification model.

    Args:
        text: Input with one word per line (words separated by ``'\\n'``).
            Empty or whitespace-only lines are ignored.
        model_name: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model. Defaults to the tagger this module was
            written for.

    Returns:
        The ``(words, tags, probs)`` tuple produced by ``print_sentence``,
        treating all words of *text* as a single sentence. If *text*
        contains no words, three empty lists are returned without loading
        the model.
    """
    # Drop empty entries (e.g. from trailing or doubled newlines) so the
    # tokenizer only ever sees real words; indices into ``words`` stay
    # consistent because the same filtered list is passed on below.
    words = [word for word in text.split('\n') if word.strip()]
    if not words:
        return [], [], []

    # Inference is pinned to the CPU, as in the original implementation.
    device = torch.device("cpu")
    tokenizer, model = _load_tagger(model_name, device)

    with torch.no_grad():
        inputs = tokenizer(words, is_split_into_words=True, return_tensors="pt")
        logits = model(**inputs.to(device)).logits
    return print_sentence([words], inputs, logits, model)
|
app.py
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from parse import parse_text
|
3 |
+
from nltk import Tree
|
4 |
+
import pandas as pd
|
5 |
+
import re
|
6 |
+
from nltk.tree.prettyprinter import TreePrettyPrinter
|
7 |
+
from annotate import tag_text
|
8 |
+
|
9 |
+
|
10 |
+
# Page header and input widget. ``text`` holds whatever the user typed into
# the text area and is empty until something is entered.
st.title("ENHG parsing system (demo)")
text = st.text_area("""This is a simple demo of an Early New High German (ENHG) tagging and parsing system based on neural network models.\n\n
Enter some ENHG text below!""")

# Sample inputs shown to the user. The heading previously said "MHG",
# which contradicted the rest of the page (an ENHG demo).
st.text("""Example ENHG sentences:
1. Im anfang war das Wort / Vnd das Wort war bey Gott / vnd Gott war das Wort.
2. Darinn ain treffenliche statt, genannt Famagosta, in wölicher stat ain edler purger altz herkommens was geseßsen.""")
|
17 |
+
|
18 |
+
# Ordered (pattern, replacement) steps applied by ``process_text``.
_TOKENIZE_STEPS = (
    # Put a space after an opening quote or parenthesis.
    (re.compile(r'(["(])(\S)'), r'\1 \2'),
    # Put a space before punctuation, a closing parenthesis or a quote.
    (re.compile(r'(\S)([.,;:?!)"])'), r'\1 \2'),
    # Replace trailing spaces at the end of every line with a newline.
    (re.compile(r' *$', flags=re.MULTILINE), '\n'),
    # Turn each remaining run of spaces into a newline.
    (re.compile(r' +'), '\n'),
)


def process_text(text):
    """Split raw text into one token per line.

    Punctuation, quotes and parentheses are separated from adjacent
    characters with spaces, each line end gets an extra newline, and every
    run of spaces becomes a newline.

    Args:
        text: The raw input string.

    Returns:
        The tokenised string with tokens separated by ``'\\n'``.
    """
    result = text
    for pattern, replacement in _TOKENIZE_STEPS:
        result = pattern.sub(replacement, result)
    return result
|
24 |
+
|
25 |
+
|
26 |
+
if text:
    # Tokenise the raw input, then run the tagger over the tokens.
    tokens, tags, probs = tag_text(process_text(text))

    # One table row per token: the token, its tag and the tag's probability.
    rows = list(zip(tokens, tags, probs))
    df = pd.DataFrame(rows, columns=['Token', 'Tag', 'Prob.'])

    # Parsing output is currently disabled; the code is kept for reference.
    # # Convert the bracket parse tree into an NLTK Tree
    # t = Tree.fromstring(re.sub(r'(\.[^ )]+)+', '', parse_tree))
    # tree_svg = TreePrettyPrinter(t).svg(nodecolor='black', leafcolor='black', funccolor='black')

    # Render the tagging result in a single full-width column.
    (tagging_col,) = st.columns(1)
    tagging_col.header("POS tagging result:")
    tagging_col.table(df)

    # col2 = st.columns(1)[0]
    # col2.header("Parsing result:")
    # col2.write(parse_tree.replace('_', '\_').replace('$', '\$').replace('*', '\*'))

    # # Display the graph in the Streamlit app
    # col2.image(tree_svg, use_column_width=True)
|
49 |
+
|
requirements.txt
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accelerate==0.26.1
|
2 |
+
huggingface-hub==0.20.2
|
3 |
+
numpy==1.23.4
|
4 |
+
scipy==1.10.1
|
5 |
+
tokenizers==0.15.0
|
6 |
+
torch==2.1.2
|
7 |
+
transformers==4.36.2
|
8 |
+
nltk
|
9 |
+
torch-struct
|