nielklug committed on
Commit
920b22f
·
1 Parent(s): 9b6ca8f
__pycache__/annotate.cpython-38.pyc ADDED
Binary file (1.34 kB). View file
 
__pycache__/parse.cpython-38.pyc ADDED
Binary file (817 Bytes). View file
 
annotate.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import argparse
3
+ import torch
4
+ from transformers import AutoTokenizer
5
+ from transformers import AutoModelForTokenClassification
6
+
7
+
8
def print_sentence(sentences, inputs, logits, model):
    """Collect the most probable tag for every word of every sentence.

    Despite its name, this function prints nothing; it returns three
    parallel lists.

    Args:
        sentences: Sequence of sentences, each an indexable sequence of words.
        inputs: Tokenizer output exposing ``word_ids(batch_index=i)``, which
            yields, per token position, the index of the word the token
            belongs to, or ``None``.
        logits: Tensor indexed as ``[sentence, token, label]``.
        model: Object whose ``config.id2label`` maps a label id to its name.

    Returns:
        A tuple ``(words, tags, prob_out)``: the words, their predicted tag
        names and the probabilities of those tags, flattened over all
        sentences in order.
    """
    words, tags, prob_out = [], [], []

    # One batched reduction replaces a full sort of the label distribution
    # per word; only the best label and its probability are ever used.
    top_probs, top_ids = logits.softmax(dim=2).max(dim=2)
    top_probs = top_probs.tolist()
    top_ids = top_ids.tolist()

    for i, sentence in enumerate(sentences):
        # Map tokens to their respective word
        word_ids = inputs.word_ids(batch_index=i)
        previous_word_idx = None
        for k, word_idx in enumerate(word_ids):
            # Positions without a word (presumably special tokens) are
            # skipped, and only the first token of a given word is labelled.
            if word_idx is not None and word_idx != previous_word_idx:
                words.append(sentence[word_idx])
                tags.append(model.config.id2label[top_ids[i][k]])
                prob_out.append(top_probs[i][k])
            previous_word_idx = word_idx

    return words, tags, prob_out
28
+
29
_TAGGER_CHECKPOINT = "nielklug/enhg_tagger"
# Loaded (tokenizer, model) pairs keyed by device name, so the checkpoint is
# read once per process instead of on every call.
_TAGGER_CACHE = {}


def _load_tagger(device):
    """Return the ``(tokenizer, model)`` pair for *device*, loading it once."""
    key = str(device)
    if key not in _TAGGER_CACHE:
        tokenizer = AutoTokenizer.from_pretrained(_TAGGER_CHECKPOINT)
        model = AutoModelForTokenClassification.from_pretrained(_TAGGER_CHECKPOINT)
        _TAGGER_CACHE[key] = (tokenizer, model.to(device).eval())
    return _TAGGER_CACHE[key]


def tag_text(text):
    """Tag newline-separated words with the ``nielklug/enhg_tagger`` model.

    Args:
        text: String holding one word per line. Blank lines are ignored.

    Returns:
        A tuple ``(words, tags, probs)`` of parallel lists as produced by
        ``print_sentence``; three empty lists when *text* contains no words.
    """
    # Blank entries carry nothing to tag; dropping them up front also lets
    # blank input return without loading or running the model.
    words = [word for word in text.split('\n') if word.strip()]
    if not words:
        return [], [], []

    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device = torch.device("cpu")
    tokenizer, model = _load_tagger(device)

    # NOTE(review): the input is neither truncated nor chunked; very long
    # texts presumably exceed the model's maximum sequence length - confirm.
    with torch.no_grad():
        inputs = tokenizer(words, is_split_into_words=True, return_tensors="pt")
        logits = model(**inputs.to(device)).logits
    return print_sentence([words], inputs, logits, model)
app.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from parse import parse_text
3
+ from nltk import Tree
4
+ import pandas as pd
5
+ import re
6
+ from nltk.tree.prettyprinter import TreePrettyPrinter
7
+ from annotate import tag_text
8
+
9
+
10
# Page header and input widget; `text` holds whatever the user typed.
st.title("ENHG parsing system (demo)")
text = st.text_area("""This is a simple demo of an Early New High German (ENHG) tagging and parsing system based on neural network models.\n\n
Enter some ENHG text below!""")

# Sample sentences the user can paste into the text area.
st.text("""Example ENHG sentences:
1. Im anfang war das Wort / Vnd das Wort war bey Gott / vnd Gott war das Wort.
2. Darinn ain treffenliche statt, genannt Famagosta, in wölicher stat ain edler purger altz herkommens was geseßsen.""")
17
+
18
def process_text(text):
    """Rewrite raw input so that tokens are separated by newlines.

    The substitutions run in order:

    1. Insert a space after ``"`` or ``(`` when a non-whitespace character
       follows.
    2. Insert a space before ``. , ; : ? ! ) "`` when a non-whitespace
       character precedes. Matches do not overlap, so in a run such as
       ``?!`` only the first mark is split off.
    3. Replace trailing spaces at the end of every line with a newline;
       the line's own newline is kept, so input lines end up separated by
       an empty line.
    4. Replace every run of spaces with a single newline.

    Args:
        text: The raw text.

    Returns:
        The rewritten text.
    """
    substitutions = (
        (r'(["(])(\S)', r'\1 \2', 0),
        (r'(\S)([.,;:?!)"])', r'\1 \2', 0),
        (r' *$', '\n', re.MULTILINE),
        (r' +', '\n', 0),
    )
    for pattern, replacement, flags in substitutions:
        text = re.sub(pattern, replacement, text, flags=flags)
    return text
24
+
25
+
26
if text:
    # Normalise the input and tag it; the three results are zipped row-wise
    # into the table below.
    tokens, tags, probs = tag_text(process_text(text))

    # One table row per tagged token.
    df = pd.DataFrame(
        list(zip(tokens, tags, probs)),
        columns=['Token', 'Tag', 'Prob.'],
    )

    # Parse-tree rendering is currently disabled:
    # # Convert the bracket parse tree into an NLTK Tree
    # t = Tree.fromstring(re.sub(r'(\.[^ )]+)+', '', parse_tree))
    # tree_svg = TreePrettyPrinter(t).svg(nodecolor='black', leafcolor='black', funccolor='black')

    (result_column,) = st.columns(1)
    result_column.header("POS tagging result:")
    result_column.table(df)

    # col2 = st.columns(1)[0]
    # col2.header("Parsing result:")
    # col2.write(parse_tree.replace('_', '\_').replace('$', '\$').replace('*', '\*'))

    # # Display the graph in the Streamlit app
    # col2.image(tree_svg, use_column_width=True)
49
+
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ accelerate==0.26.1
2
+ huggingface-hub==0.20.2
3
+ numpy==1.23.4
4
+ scipy==1.10.1
5
+ tokenizers==0.15.0
6
+ torch==2.1.2
7
+ transformers==4.36.2
8
+ nltk
9
+ torch-struct