Add initial attempt at a code framework.
Files changed:
- README.md (+5 -5)
- app.py (+342 -0)
- models.py (+112 -0)
- requirements.txt (+8 -0)
- scrollbar.css (+30 -0)
- utils.py (+161 -0)
README.md
CHANGED

```diff
@@ -1,10 +1,10 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: Athena's Lens
+emoji: 🦉
+colorFrom: red
+colorTo: gray
 sdk: gradio
-sdk_version: 3.
+sdk_version: 3.3.1
 app_file: app.py
 pinned: false
 license: apache-2.0
```
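For context: these front-matter fields are the Space's configuration card, setting the display title, the emoji, the theme gradient colors, and the Gradio SDK version that Hugging Face uses to run `app.py`.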
app.py
ADDED
@@ -0,0 +1,342 @@
````python
from typing import Tuple, List, Dict
import base64
import os

from bs4 import BeautifulSoup
import gradio as gr
from spacy import displacy
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    BatchEncoding,
    AutoModelForSeq2SeqLM,
    DataCollatorForTokenClassification,
)
import torch

from utils import get_dependencies, preprocess_text
from models import (
    DependencyRobertaForTokenClassification,
    LabelRobertaForTokenClassification,
)


DEFAULT_TEXT = "τίω δέ μιν ἐν καρὸς αἴσῃ."
BUTTON_CSS = "float: right; --tw-border-opacity: 1; border-color: rgb(229 231 235 / var(--tw-border-opacity)); --tw-gradient-from: rgb(243 244 246 / 0.7); --tw-gradient-stops: var(--tw-gradient-from), var(--tw-gradient-to, rgb(243 244 246 / 0)); --tw-gradient-to: rgb(229 231 235 / 0.8); --tw-text-opacity: 1; color: rgb(55 65 81 / var(--tw-text-opacity)); border-width: 1px; --tw-bg-opacity: 1; background-color: rgb(255 255 255 / var(--tw-bg-opacity)); background-image: linear-gradient(to bottom right, var(--tw-gradient-stops)); display: inline-flex; flex: 1 1 0%; align-items: center; justify-content: center; --tw-shadow: 0 1px 2px 0 rgb(0 0 0 / 0.05); --tw-shadow-colored: 0 1px 2px 0 var(--tw-shadow-color); box-shadow: var(--tw-ring-offset-shadow, 0 0 #0000), var(--tw-ring-shadow, 0 0 #0000), var(--tw-shadow); -webkit-appearance: button; border-radius: 0.5rem; padding-top: 0.5rem; padding-bottom: 0.5rem; padding-left: 1rem; padding-right: 1rem; font-size: 1rem; line-height: 1.5rem; font-weight: 600;"
DEFAULT_COLOR = "white"

MODEL_PATHS = {
    "POS": "bowphs/testid",
    "LEMMATIZATION": "bowphs/lemmatization-demo",
    "DEPENDENCY": "bowphs/depenBERTa_perseus",
    "LABELS": "bowphs/depenBERTa_labler_perseus",
}
MODEL_MAX_LENGTH = 512

# Use the TOKEN secret if it is set; otherwise fall back to the cached login.
AUTH_TOKEN = os.environ.get("TOKEN") or True

# PoS
pos_tokenizer = AutoTokenizer.from_pretrained(
    MODEL_PATHS["POS"], model_max_length=MODEL_MAX_LENGTH, use_auth_token=AUTH_TOKEN
)
pos_model = AutoModelForTokenClassification.from_pretrained(
    MODEL_PATHS["POS"], use_auth_token=AUTH_TOKEN
)

# Lemmatization
lemmatizer_tokenizer = AutoTokenizer.from_pretrained(
    MODEL_PATHS["LEMMATIZATION"],
    model_max_length=MODEL_MAX_LENGTH,
    use_auth_token=AUTH_TOKEN,
)
lemmatizer_model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_PATHS["LEMMATIZATION"], use_auth_token=AUTH_TOKEN
)

# Dependency Parsing
dependency_tokenizer = AutoTokenizer.from_pretrained(
    MODEL_PATHS["DEPENDENCY"],
    model_max_length=MODEL_MAX_LENGTH,
    use_auth_token=AUTH_TOKEN,
)
arcs_model = DependencyRobertaForTokenClassification.from_pretrained(
    MODEL_PATHS["DEPENDENCY"], use_auth_token=AUTH_TOKEN
)
labels_model = LabelRobertaForTokenClassification.from_pretrained(
    MODEL_PATHS["LABELS"], use_auth_token=AUTH_TOKEN
)

data_collator = DataCollatorForTokenClassification(dependency_tokenizer)


def is_valid_selection(col_arcs: bool, col_labels: bool) -> bool:
    """Dependency labels can only be drawn on top of dependency arcs."""
    if not col_arcs and col_labels:
        return False
    return True


def get_pos_predictions(inputs) -> torch.Tensor:
    """Get part-of-speech predictions."""
    return pos_model(inputs["input_ids"]).logits.argmax(-1)  # type: ignore


def execute_parse(
    text_input: str,
    col_pos: bool,
    col_arcs: bool,
    col_labels: bool,
    col_lemmata: bool,
    compact: bool,
    bg: str,
    text: str,
) -> Tuple[str, str]:
    if is_valid_selection(col_arcs, col_labels):
        return parse(
            text_input, col_pos, col_arcs, col_labels, col_lemmata, compact, bg, text
        )
    return "Please check 'Dependency Arcs' before checking 'Dependency Labels'", ""


def lemmatize(tokens: List[str]) -> List[str]:
    """Lemmatize each token with the seq2seq model, one generation per word."""

    def construct_task(word_idx: int) -> str:
        # Sentinel-separated prompt: left context, the target word, the word
        # spelled out character by character, and the right context.
        return f"lemmatize: {' '.join(tokens[:word_idx])} <extra_id_0> {tokens[word_idx]} <extra_id_1> {' '.join(list(tokens[word_idx]))} <extra_id_2> {' '.join(tokens[word_idx+1:])}"

    predictions = [
        lemmatizer_tokenizer.decode(
            lemmatizer_model.generate(
                lemmatizer_tokenizer(construct_task(word_idx), return_tensors="pt")[
                    "input_ids"
                ],
                max_length=20,
                num_beams=5,
                num_return_sequences=1,
                early_stopping=True,
            )[0],
            skip_special_tokens=True,
        )
        for word_idx in range(len(tokens))
    ]

    return predictions


def add_lemma_visualization(soup, lemmata: List[str], col_arcs: bool) -> str:
    # When arcs are drawn, the first displacy token is the artificial ROOT;
    # the boolean col_arcs (0 or 1) skips it when slicing.
    for token, lemma in zip(soup.find_all(class_="displacy-token")[col_arcs:], lemmata):
        pos_tag = token.find(class_="displacy-tag")
        lemma_tag = soup.new_tag(
            "tspan",
            class_="displacy-lemma",
            dy="2em",
            fill="currentColor",
            x=pos_tag.attrs["x"],
        )
        lemma_tag.string = lemma
        pos_tag.insert_after(lemma_tag)
    return str(soup)


def download_svg(svg):
    encode = base64.b64encode(bytes(svg, "utf-8"))
    img = "data:image/svg+xml;base64," + encode.decode("utf-8")
    html = f'<a download="displacy.svg" href="{img}" style="{BUTTON_CSS}">Download as SVG</a>'
    return html


def prepare_doc(
    tokens: List[str],
    col_pos: bool,
    pos_outputs: torch.Tensor,
    inputs: BatchEncoding,
) -> Dict[str, List[Dict[str, str]]]:
    doc: Dict[str, List[Dict[str, str]]] = {
        "words": [],
        "arcs": [],
    }
    word_ids = inputs.word_ids()
    previous_word_idx = None

    # Keep one entry per word: only the first subtoken of each word counts.
    for idx, word_idx in enumerate(word_ids):
        if word_idx != previous_word_idx and word_idx is not None:
            tag_repr = (
                pos_model.config.id2label[pos_outputs[0][idx].item()] if col_pos else ""
            )
            doc["words"].append({"text": tokens[word_idx], "tag": tag_repr})
        previous_word_idx = word_idx

    return doc


def parse(
    text_input: str,
    col_pos: bool,
    col_arcs: bool,
    col_labels: bool,
    col_lemmata: bool,
    compact: bool,
    bg: str,
    text: str,
) -> Tuple[str, str]:
    tokens = preprocess_text(text_input)
    inputs = pos_tokenizer(
        tokens,
        return_tensors="pt",
        truncation=True,
        padding=True,
        is_split_into_words=True,
    )
    pos_outputs = get_pos_predictions(inputs)

    doc = prepare_doc(tokens, col_pos, pos_outputs, inputs)

    if col_arcs:
        doc["words"].insert(0, {"text": "ROOT", "tag": ""})
        doc["arcs"] = get_dependencies(
            arcs_model,
            labels_model,
            dependency_tokenizer,
            data_collator,
            col_labels,
            tokens,
        )["arcs"]

    options = {"compact": compact, "bg": bg, "color": text}
    svg = displacy.render(doc, manual=True, style="dep", options=options)

    if col_lemmata:
        soup = BeautifulSoup(svg, "lxml-xml")
        lemmata = lemmatize(tokens)
        svg = add_lemma_visualization(soup, lemmata, col_arcs)

    download_link = download_svg(svg)

    return svg, download_link


def setup_parser_ui():
    demo = gr.Blocks(css="scrollbar.css")
    with demo:
        with gr.Box():
            with gr.Row():
                with gr.Column():
                    gr.Markdown("# Athena's Lens")
                    gr.Markdown(
                        "### From Ἀlkaios to Ὠrigen: A Modern Lens on Timeless Texts"
                    )
        with gr.Box():
            with gr.Column():
                gr.Markdown("## Enter some text")
                with gr.Row():
                    with gr.Column(scale=0.5):
                        text_input = gr.Textbox(
                            value=DEFAULT_TEXT, interactive=True, label="Input Text"
                        )
                with gr.Row():
                    with gr.Column(scale=0.25):
                        button = gr.Button("Update", variant="primary").style(
                            full_width=False
                        )
        with gr.Box():
            with gr.Column():
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("## Parser")
                with gr.Row():
                    with gr.Column():
                        col_pos = gr.Checkbox(label="PoS Labels", value=True)
                        col_arcs = gr.Checkbox(label="Dependency Arcs", value=False)
                        col_labels = gr.Checkbox(label="Dependency Labels", value=False)
                        col_lemmata = gr.Checkbox(label="Lemmata", value=False)
                        compact = gr.Checkbox(label="Compact", value=False)
                    with gr.Column():
                        bg = gr.Textbox(label="Background Color", value=DEFAULT_COLOR)
                    with gr.Column():
                        text = gr.Textbox(label="Text Color", value="black")
                with gr.Row():
                    dep_output = gr.HTML(
                        value=parse(
                            DEFAULT_TEXT,
                            True,
                            False,
                            False,
                            False,
                            False,
                            DEFAULT_COLOR,
                            "black",
                        )[0]
                    )
                with gr.Row():
                    with gr.Column(scale=0.25):
                        dep_button = gr.Button(
                            "Update Parser", variant="primary"
                        ).style(full_width=False)
                    with gr.Column():
                        dep_download_button = gr.HTML(
                            value=download_svg(dep_output.value)
                        )

        with gr.Box():
            with gr.Column():
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("## Contact")
                        gr.Markdown(
                            "If you have any questions, suggestions, comments, or problems, feel free to [reach out](mailto:[email protected])."
                        )
                        gr.Markdown("## Citation")
                        gr.Markdown(
                            "This space uses models from [this](https://aclanthology.org/2023.acl-long.846.pdf) paper."
                        )
                        gr.Markdown(
                            """```bibtex
@incollection{riemenschneider-frank-2023-exploring,
    title = "Exploring Large Language Models for Classical Philology",
    author = "Riemenschneider, Frederick and Frank, Anette",
    booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
    month = jul,
    year = "2023",
    address = "Toronto, Canada",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.acl-long.846",
    doi = "10.18653/v1/2023.acl-long.846",
    pages = "15181--15199",
}
```
"""
                        )

        button.click(
            execute_parse,
            inputs=[
                text_input,
                col_pos,
                col_arcs,
                col_labels,
                col_lemmata,
                compact,
                bg,
                text,
            ],
            outputs=[dep_output, dep_download_button],
        )

        dep_button.click(
            execute_parse,
            inputs=[
                text_input,
                col_pos,
                col_arcs,
                col_labels,
                col_lemmata,
                compact,
                bg,
                text,
            ],
            outputs=[dep_output, dep_download_button],
        )

    return demo


def main():
    demo = setup_parser_ui()
    demo.launch()


if __name__ == "__main__":
    main()
````

Note: the original `setup_parser_ui` ended with `demo.launch()` and returned `None`, so `main()` crashed on `demo.launch()`; returning the `Blocks` object fixes that.
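For reference, the pipeline can be exercised outside the Gradio UI by calling `parse` directly. The sketch below is hypothetical and not part of the commit; importing `app` loads all four models at module scope (slow, and it requires access to the `bowphs/*` repositories), and the keyword names simply mirror `parse`'s positional parameters:

```python
# Hypothetical smoke test: render the default sentence with PoS tags,
# dependency arcs, and arc labels.
from app import DEFAULT_TEXT, parse

svg, download_link = parse(
    DEFAULT_TEXT,
    col_pos=True,       # part-of-speech tags above each token
    col_arcs=True,      # dependency arcs (adds the ROOT pseudo-token)
    col_labels=True,    # relation labels on the arcs
    col_lemmata=False,  # skip the per-word seq2seq lemmatizer
    compact=False,
    bg="white",
    text="black",
)
print(svg[:60])  # "<svg ...": the SVG produced by displacy.render
```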
models.py
ADDED
@@ -0,0 +1,112 @@
```python
import torch
from torch import nn
from transformers import RobertaPreTrainedModel
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaConfig, RobertaModel

from utils import batched_index_select


class DependencyRobertaForTokenClassification(RobertaPreTrainedModel):
    """Predicts, for every token, the index of its syntactic head."""

    config_class = RobertaConfig  # type: ignore

    def __init__(self, config):
        super().__init__(config)
        self.roberta = RobertaModel(config, add_pooling_layer=False)
        self.u_a = nn.Linear(768, 768)
        self.w_a = nn.Linear(768, 768)
        self.v_a_inv = nn.Linear(768, 1, bias=False)
        self.criterion = nn.NLLLoss()
        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        labels=None,
        **kwargs,
    ):
        loss = 0.0
        output = self.roberta(
            input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids
        )[0]
        batch_size, seq_len, _ = output.size()

        # Score every position as a candidate head of token i, one column at a time.
        parent_prob_table = []
        for i in range(0, seq_len):
            target = output[:, i, :].expand(seq_len, batch_size, -1).transpose(0, 1)
            mask = output.eq(target)[:, :, 0].unsqueeze(2)  # mask the token itself
            p_head = self.attention(output, target, mask)
            if labels is not None:
                current_loss = self.criterion(p_head.squeeze(-1), labels[:, i])
                if not torch.all(labels[:, i] == -100):
                    loss += current_loss
            parent_prob_table.append(torch.exp(p_head))

        parent_prob_table = torch.cat(parent_prob_table, dim=2).data.transpose(1, 2)
        prob, topi = parent_prob_table.topk(k=1, dim=2)
        preds = topi.squeeze(-1)
        loss = loss / seq_len
        output = TokenClassifierOutput(loss=loss, logits=preds)

        if labels is not None:
            return output, preds, parent_prob_table, labels
        return output, preds, parent_prob_table

    def attention(self, source, target, mask=None):
        # Additive attention, v^T tanh(U source + W target), normalized over the
        # sequence dimension into a log-probability per candidate head.
        function_g = self.v_a_inv(torch.tanh(self.u_a(source) + self.w_a(target)))
        if mask is not None:
            function_g.masked_fill_(mask, -1e4)
        return nn.functional.log_softmax(function_g, dim=1)


class LabelRobertaForTokenClassification(RobertaPreTrainedModel):
    """Classifies the dependency relation between a token and its head."""

    config_class = RobertaConfig  # type: ignore

    def __init__(self, config):
        super().__init__(config)
        self.roberta = RobertaModel(config, add_pooling_layer=False)
        self.num_labels = 33
        self.hidden = nn.Linear(768 * 2, 768)
        self.relu = nn.ReLU()
        self.out = nn.Linear(768, self.num_labels)
        self.loss_fct = nn.CrossEntropyLoss()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        labels=None,
        **kwargs,
    ):
        loss = 0.0
        output = self.roberta(
            input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids
        )[0]
        batch_size, seq_len, _ = output.size()
        logits = []
        for i in range(seq_len):
            # Concatenate each token's embedding with its head's embedding;
            # head indices arrive via the "head_labels" keyword argument.
            current_token = output[:, i, :]
            connected_with_index = kwargs["head_labels"][:, i]
            connected_with_index[connected_with_index == -100] = 0
            connected_with_embedding = batched_index_select(
                output.clone(), 1, connected_with_index.clone()
            )
            combined_embeddings = torch.cat(
                (current_token, connected_with_embedding.squeeze(1)), -1
            )
            pred = self.out(self.relu(self.hidden(combined_embeddings)))
            pred = pred.view(-1, self.num_labels)
            logits.append(pred)
            if labels is not None:
                current_loss = self.loss_fct(pred, labels[:, i].view(-1))
                if not torch.all(labels[:, i] == -100):
                    loss += current_loss

        loss = loss / seq_len
        logits = torch.stack(logits, dim=1)
        return TokenClassifierOutput(loss=loss, logits=logits)
```
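To make the tensor shapes concrete, here is a small shape check with randomly initialized weights. It is an illustrative sketch, not part of the commit, and it assumes a roberta-base-sized config, which matches the hard-coded 768-dimensional projections:

```python
import torch
from transformers import RobertaConfig

from models import DependencyRobertaForTokenClassification

config = RobertaConfig()  # defaults are roberta-base sized (hidden_size=768)
model = DependencyRobertaForTokenClassification(config)
model.eval()

input_ids = torch.randint(0, config.vocab_size, (1, 6))  # batch of 1, 6 subtokens
with torch.no_grad():
    _, preds, parent_prob_table = model(input_ids=input_ids)

print(preds.shape)              # torch.Size([1, 6]): predicted head per position
print(parent_prob_table.shape)  # torch.Size([1, 6, 6]): P(head = j | token i)
```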
requirements.txt
ADDED
@@ -0,0 +1,8 @@
```
pandas==1.4.2
gradio==3.3.1
beautifulsoup4
lxml
ufal.chu-liu-edmonds
spacy
transformers
torch
```
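To run the Space locally, `pip install -r requirements.txt` and then `python app.py`. Since the models are loaded with `use_auth_token`, a Hugging Face token (the `TOKEN` environment variable or a cached `huggingface-cli login`) may be needed if the `bowphs/*` repositories are gated.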
scrollbar.css
ADDED
@@ -0,0 +1,30 @@
```css
.output-html {
    overflow-x: auto;
}

.output-html::-webkit-scrollbar {
    -webkit-appearance: none;
}

.output-html::-webkit-scrollbar:vertical {
    width: 0px;
}

.output-html::-webkit-scrollbar:horizontal {
    height: 11px;
}

.output-html::-webkit-scrollbar-thumb {
    border-radius: 8px;
    border: 2px solid white;
    background-color: rgba(0, 0, 0, .5);
}

.output-html::-webkit-scrollbar-track {
    background-color: #fff;
    border-radius: 8px;
}

.spans {
    min-height: 75px;
}
```
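The stylesheet targets the `.output-html` container holding the rendered parse: wide trees scroll horizontally, the vertical scrollbar is hidden, and the horizontal one gets a slim rounded thumb; `.spans` reserves a minimum height for span visualizations.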
utils.py
ADDED
@@ -0,0 +1,161 @@
```python
import re
from typing import List, Dict, Set

import numpy as np
import torch
from ufal.chu_liu_edmonds import chu_liu_edmonds

DEPENDENCY_RELATIONS = [
    "acl",
    "advcl",
    "advmod",
    "amod",
    "appos",
    "aux",
    "case",
    "cc",
    "ccomp",
    "conj",
    "cop",
    "csubj",
    "det",
    "iobj",
    "mark",
    "nmod",
    "nsubj",
    "nummod",
    "obj",
    "obl",
    "parataxis",
    "punct",
    "root",
    "vocative",
    "xcomp",
]
INDEX2TAG = {idx: tag for idx, tag in enumerate(DEPENDENCY_RELATIONS)}
TAG2INDEX = {tag: idx for idx, tag in enumerate(DEPENDENCY_RELATIONS)}


def preprocess_text(text: str) -> List[str]:
    """Pad punctuation with spaces, then tokenize on whitespace."""
    text = text.strip()
    text = re.sub("(?<! )(?=[.,!?()·;:])|(?<=[.,!?()·;:])(?! )", r" ", text)
    return text.split()


def batched_index_select(
    input: torch.Tensor, dim: int, index: torch.Tensor
) -> torch.Tensor:
    """torch.gather along `dim` with a separate index tensor per batch element."""
    views = [input.shape[0]] + [
        1 if i != dim else -1 for i in range(1, len(input.shape))
    ]
    expanse = list(input.shape)
    expanse[0] = -1
    expanse[dim] = -1
    index = index.view(views).expand(expanse)
    return torch.gather(input, dim, index)


def get_relevant_tokens(tokenized: torch.Tensor, start_ids: Set[int]) -> List[int]:
    """Keep only the predictions at word-initial subtoken positions."""
    return [tokenized[idx].item() for idx in range(len(tokenized)) if idx in start_ids]


def resolve(
    edmonds_head: List[int], word_ids: List[int], parent_probs_table: torch.Tensor
) -> torch.Tensor:
    """If decoding produced several roots, keep the one that heads the most
    tokens and zero out the ROOT-attachment probability of the others."""
    multiple_roots = [i for i, x in enumerate(edmonds_head) if x == 0]
    if len(multiple_roots) > 1:
        main_root = max(multiple_roots, key=edmonds_head.count)
        secondary_roots = set(multiple_roots) - {main_root}
        for root in secondary_roots:
            parent_probs_table[0][word_ids.index(root)][0] = 0
    return parent_probs_table


def apply_chu_liu_edmonds(
    parent_probs_table: torch.Tensor, tokenized_input: Dict, start_ids: Set[int]
) -> List[int]:
    """Decode a maximum spanning tree over the head probabilities."""
    parent_probs_table = (
        parent_probs_table
        if parent_probs_table.shape[1] == parent_probs_table.shape[2]
        else parent_probs_table[:, :, 1:]
    )
    edmonds_heads, _ = chu_liu_edmonds(
        parent_probs_table.squeeze(0).cpu().numpy().astype("double")
    )
    edmonds_heads = torch.tensor(edmonds_heads).unsqueeze(0)
    edmonds_heads[edmonds_heads == -1] = 0
    tokenized_input["head_labels"] = edmonds_heads
    return get_relevant_tokens(edmonds_heads[0], start_ids)


def get_word_endings(tokenized_input):
    """Map each subtoken position to (last subtoken, 1-based word index)."""
    word_ids = tokenized_input.word_ids(batch_index=0)
    start_ids = set()
    word_endings = {0: (1, 0)}  # position 0 is the ROOT pseudo-token
    for word_id in word_ids:
        if word_id is not None:
            start, end = tokenized_input.word_to_tokens(
                batch_or_word_index=0, word_index=word_id
            )
            start_ids.add(start)
            word_endings[start] = (end, word_id + 1)
            for a in range(start + 1, end + 1):
                word_endings[a] = (end, word_id + 1)
    return word_endings, start_ids, word_ids


def get_dependencies(
    dependency_parser,
    label_parser,
    tokenizer,
    collator,
    labels: bool,
    sentence: List[str],
) -> Dict:
    """Run both parsers and convert their predictions into a displacy-style dict."""
    tokenized_input = tokenizer(
        sentence, truncation=True, is_split_into_words=True, add_special_tokens=True
    )
    dep_dict: Dict[str, List[Dict[str, str]]] = {
        "words": [{"text": "ROOT", "tag": ""}],
        "arcs": [],
    }

    word_endings, start_ids, word_ids = get_word_endings(tokenized_input)
    tokenized_input = collator([tokenized_input])
    _, _, parent_probs_table = dependency_parser(**tokenized_input)

    # Blank out non-word-initial subtoken positions so the spanning tree is
    # built over whole words only.
    irrelevant = torch.tensor(
        [
            idx.item()
            for idx in torch.arange(parent_probs_table.size(1))
            if idx.item() not in start_ids and idx.item() != 0
        ]
    )
    if irrelevant.nelement() > 0:
        parent_probs_table.index_fill_(1, irrelevant, torch.nan)
        parent_probs_table.index_fill_(2, irrelevant, torch.nan)

    # Decode, enforce a single root, then decode again.
    edmonds_head = apply_chu_liu_edmonds(parent_probs_table, tokenized_input, start_ids)
    parent_probs_table = resolve(edmonds_head, word_ids, parent_probs_table)
    edmonds_head = apply_chu_liu_edmonds(parent_probs_table, tokenized_input, start_ids)

    if labels:
        predictions_labels = np.argmax(
            label_parser(**tokenized_input).logits.detach().cpu().numpy(), axis=-1
        )
        predicted_relations = get_relevant_tokens(predictions_labels[0], start_ids)
        predicted_relations = [
            INDEX2TAG[predicted_relations[idx]] for idx in range(len(sentence))
        ]
    else:
        predicted_relations = [""] * len(sentence)

    for idx, head in enumerate(edmonds_head):
        arc = {
            "start": min(idx + 1, word_endings[head][1]),
            "end": max(idx + 1, word_endings[head][1]),
            "label": predicted_relations[idx],
            "dir": "left" if idx + 1 < word_endings[head][1] else "right",
        }
        dep_dict["arcs"].append(arc)

    return dep_dict
```
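Two of these helpers are easy to sanity-check in isolation. The following is an illustrative snippet, not part of the commit; the expected outputs are worked out from the definitions above:

```python
import torch

from utils import batched_index_select, preprocess_text

# preprocess_text pads punctuation with spaces before splitting on whitespace:
print(preprocess_text("τίω δέ μιν ἐν καρὸς αἴσῃ."))
# ['τίω', 'δέ', 'μιν', 'ἐν', 'καρὸς', 'αἴσῃ', '.']

# batched_index_select gathers along `dim` with a separate index per batch row,
# e.g. picking one hidden vector (the head's) for every token:
hidden = torch.arange(2 * 4 * 3, dtype=torch.float).reshape(2, 4, 3)
heads = torch.tensor([[1, 1, 0, 3], [2, 0, 0, 1]])  # a head index per token
print(batched_index_select(hidden, 1, heads).shape)  # torch.Size([2, 4, 3])
```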