Spaces:

nehalelkaref
/

flat-arabic-entity-classification

Build error

App Files Files Community

nehalelkaref committed on Nov 18, 2023

Commit

6b35cc5

1 Parent(s): 62240fd

Upload 3 files

Browse files

Files changed (3) hide show

network.py +333 -0
utils.py +420 -0
validate.py +168 -0

network.py ADDED Viewed

	@@ -0,0 +1,333 @@

+import copy
+import torch
+import torch.nn as nn
+from torch.nn.utils.rnn import pad_sequence
+from torch.nn.functional import cross_entropy, binary_cross_entropy
+from tqdm.auto import tqdm
+from .utils import Config, extract_spans, generate_targets
+from .representation import TransformerRepresentation
+from .layers import SpanEnumerationLayer
# Device used when the caller does not pass one explicitly.
DEFAULT_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class SpanNet(nn.Module):
    """Span detector built on a transformer encoder.

    ``repr_type`` selects the head: ``'token_classification'`` emits one
    B/I/O decision per token, ``'span_enumeration'`` emits a single score for
    every span returned by the enumeration layer.
    """

    def __init__(self, **kwargs):
        """Build the encoder and the scoring head.

        Recognised keyword options (defaults in parentheses): ``pos`` (None),
        ``dp`` (0.3), ``transformer_model_name`` ('bert-base-uncased'),
        ``token_pooling`` ('sum'), ``device`` (DEFAULT_DEVICE) and
        ``repr_type`` ('token_classification').
        """
        super(SpanNet, self).__init__()
        self.config = Config()
        self.config.pos = kwargs.get('pos', None)
        self.config.dp = kwargs.get('dp', 0.3)
        self.config.transformer_model_name = kwargs.get('transformer_model_name', 'bert-base-uncased')
        self.config.token_pooling = kwargs.get('token_pooling', 'sum')
        self.device = kwargs.get('device', DEFAULT_DEVICE)
        self.config.repr_type = kwargs.get('repr_type', 'token_classification')
        assert self.config.repr_type in ['token_classification',
                                        'span_enumeration'], 'Invalid representaton type'
        self.transformer = TransformerRepresentation(
            model_name=self.config.transformer_model_name,
            device=self.device).to(self.device)
        self.transformer_dim = self.transformer.embedding_dim
        if self.config.pos:
            # every POS label becomes a dedicated special token "[LABEL]"
            self.transformer.add_special_tokens([f'[{p}]' for p in self.config.pos])
        self.span_tags = ['B', 'I', 'O']
        self.enumeration_layer = SpanEnumerationLayer()
        output_size = {'token_classification': len(self.span_tags),
                       'span_enumeration': 1}
        self.span_output_layer = nn.Sequential(
            nn.Linear(self.transformer_dim, self.transformer_dim),
            nn.ReLU(), nn.Dropout(p=self.config.dp),
            nn.Linear(self.transformer_dim, output_size[self.config.repr_type]))

    def to_dict(self):
        """Return the config attributes and the state dict as one dictionary."""
        return {
            'model_config': self.config.__dict__,
            'model_state_dict': self.state_dict()
        }

    @classmethod
    def load_model(cls, model_path, device=DEFAULT_DEVICE):
        """Rebuild a model saved with ``save_model`` and put it in eval mode.

        ``device`` is used both as ``map_location`` for ``torch.load`` and as
        the device the model is constructed on (it was previously ignored for
        construction).
        """
        res = torch.load(model_path, device)
        model_kwargs = dict(res['model_config'])
        model_kwargs.setdefault('device', device)
        model = cls(**model_kwargs)
        model.load_state_dict(res['model_state_dict'])
        model.eval()
        return model

    @classmethod
    def preds_to_sequences(cls, predictions, enumerations, length):
        """Turn the span scores of ONE sentence into a B/I/O tag list.

        :param predictions: indexable of scores exposing ``.item()``
        :param enumerations: list of ``[start, end]`` spans aligned with predictions
        :param length: number of tags to produce
        :return: list of ``length`` tags out of 'B', 'I', 'O'
        """
        # Keyed by score: spans with an identical score overwrite each other.
        enum_preds = {predictions[idx].item(): enumerations[idx]
                      for idx in range(len(enumerations))}
        # NOTE(review): the ordering key is the span itself (val[1]), not the
        # score; kept as-is, but confirm whether score ordering was intended.
        ranked = sorted(enum_preds.items(), key=lambda val: val[1], reverse=True)
        kept = [span for _, span in ranked]

        # Walk the ranking and drop every later span that clashes with the
        # current one.
        i = 0
        while i != len(kept):
            s, e = kept[i]
            clashing = []
            for sj, ej in kept[i + 1:]:
                if (sj < s <= ej < e) or (sj < s <= ej <= e) or ((s < sj) & (e < ej)):
                    clashing.append([sj, ej])
            i += 1
            kept = [span for span in kept if span not in clashing]

        # assign BIO to the surviving spans
        tagged_seq = ['O'] * length
        for s, e in kept:
            tagged_seq[s] = 'B'
            if (e - s) > 0:
                tagged_seq[s + 1:e + 1] = ['I'] * (e - s)
        return tagged_seq

    def save_model(self, output_path):
        """Serialise ``to_dict()`` to ``output_path`` with ``torch.save``."""
        torch.save(self.to_dict(), output_path)

    def _extract_sentence_vectors(self, sentences, pos=None):
        """Encode pre-tokenised sentences and return the pooled token vectors.

        When both ``pos`` and ``config.pos`` are set, each token is prefixed
        with its POS marker ``[TAG]`` before encoding.
        """
        if pos and self.config.pos:
            sentences = [[f'[{tag}] {tok}' for tok, tag in zip(sent, sent_pos)]
                         for sent, sent_pos in zip(sentences, pos)]
        outs = self.transformer(sentences, is_pretokenized=True,
                                token_pooling=self.config.token_pooling)
        return outs.pooled_tokens

    def forward(self, sentences, pos=None, tags=None, **kwargs):
        """Predict span tags and, when ``tags`` is given, compute the loss.

        :param sentences: batch of pre-tokenised sentences
        :param pos: optional POS tags aligned with ``sentences``
        :param tags: optional gold tag sequences; enables ``'loss'``
        :param kwargs: ``output_word_vecs`` / ``output_span_scores`` add
            ``'word_vecs'`` / ``'span_scores'`` to the result
        :return: dict with ``'pred_tags'`` plus the optional entries above
        """
        out_dict = {}
        embs = self._extract_sentence_vectors(sentences, pos)
        if kwargs.get('output_word_vecs', False):
            # was `embeddings`, an undefined name
            out_dict['word_vecs'] = embs
        lens = [len(s) for s in embs]
        is_enumeration = self.config.repr_type == 'span_enumeration'
        if is_enumeration:
            embs, enumerations = self.enumeration_layer(embs, lens)
            lens = [len(e) for e in enumerations]
        input_layer = pad_sequence(embs, batch_first=True)
        # strip the padding again: one tuple of score tensors per sentence
        span_scores = [torch.unbind(f)[:l]
                       for f, l in zip(self.span_output_layer(input_layer), lens)]
        if kwargs.get('output_span_scores', False):
            out_dict['span_scores'] = span_scores

        if is_enumeration:
            # span_scores is already grouped per sentence, so no flat offset
            # bookkeeping is needed to pair scores with their enumeration.
            out_dict['pred_tags'] = [
                self.preds_to_sequences(scores, enum, len(sent))
                for scores, enum, sent in zip(span_scores, enumerations, sentences)]
        else:
            out_dict['pred_tags'] = self.from_span_scores(span_scores)

        if tags is None:
            return out_dict

        if is_enumeration:
            targets = generate_targets(enumerations, tags)
            targets = torch.tensor([t for st in targets for t in st],
                                   dtype=torch.float, device=self.device)
            # torch.cat keeps the autograd graph and the device; rebuilding the
            # scores with torch.Tensor([...]) detached them from the model.
            flat_scores = torch.cat([t for score in span_scores for t in score])
            span_loss = binary_cross_entropy(flat_scores.sigmoid(), targets)
        else:
            # limit the targets of each sentence to the words not truncated during tokenization
            targets = torch.cat(
                [torch.tensor([self.span_tags.index(t[0]) for t, _ in zip(tg, sc)])
                 for tg, sc in zip(tags, span_scores)]).to(self.device)
            flat_scores = torch.stack(
                [s for tg, sc in zip(tags, span_scores) for _, s in zip(tg, sc)])
            span_loss = cross_entropy(flat_scores, targets)
        out_dict['loss'] = span_loss
        return out_dict

    def from_span_scores(self, span_scores):
        """Map per-token score vectors to their arg-max tag from ``span_tags``."""
        pred_span_ids = [[torch.argmax(s) for s in sc] for sc in span_scores]
        return [[self.span_tags[idx] for idx in sequence]
                for sequence in pred_span_ids]
class EntNet(nn.Module):
    """Entity-type classifier applied to spans proposed by a span network.

    Span boundaries come either from ``span_net`` or from ``pred_tags`` passed
    to ``forward``; each span is classified into one of ``ent_tags``.
    """

    def __init__(self, **kwargs):
        """Build the encoder and the entity head.

        Recognised keyword options (defaults in parentheses): ``span_net``
        (None), ``tune_span_net`` (False), ``use_span_emb`` (False),
        ``use_ent_markers`` (False), ``ent_tags``, ``pos`` (None), ``dp``
        (0.3), ``transformer_model_name`` ('bert-base-uncased'),
        ``token_pooling`` ('first') and ``device`` (DEFAULT_DEVICE).
        """
        super(EntNet, self).__init__()
        self.config = Config()
        self.span_net = kwargs.get('span_net')
        self.config.tune_span_net = kwargs.get('tune_span_net', False)
        self.config.use_span_emb = kwargs.get('use_span_emb', False)
        self.config.use_ent_markers = kwargs.get('use_ent_markers', False)
        # it is possible to tune span_net without using its embeddings
        if self.span_net and not self.config.tune_span_net:
            for p in self.span_net.parameters():
                p.requires_grad = False
        self.config.ent_tags = self.ent_tags = kwargs.get('ent_tags')
        self.config.pos = kwargs.get('pos', None)
        self.config.dp = kwargs.get('dp', 0.3)
        self.config.transformer_model_name = kwargs.get('transformer_model_name', 'bert-base-uncased')
        self.config.token_pooling = kwargs.get('token_pooling', 'first')
        self.device = kwargs.get('device', DEFAULT_DEVICE)
        self.transformer = TransformerRepresentation(
            model_name=self.config.transformer_model_name,
            device=self.device).to(self.device)
        self.transformer_dim = self.transformer.embedding_dim
        self.transformer.add_special_tokens(['[ENT]', '[/ENT]'])
        self.transformer.add_special_tokens(['[INFO]', '[/INFO]'])
        if self.config.pos:
            self.transformer.add_special_tokens(
                ['[' + p + ']' for p in self.config.pos])
        # input is the span vector concatenated with its appended-copy vector
        self.ent_output_layer = nn.Sequential(
            nn.Linear(2 * self.transformer_dim, 2 * self.transformer_dim),
            nn.ReLU(), nn.Dropout(p=self.config.dp),
            nn.Linear(2 * self.transformer_dim, len(self.config.ent_tags)))

    def to_dict(self):
        """Return own config, the span network's config (or None) and the state dict."""
        return {
            'model_config': self.config.__dict__,
            'span_net_config': self.span_net.config.__dict__ if self.span_net is not None else None,
            'model_state_dict': self.state_dict()
        }

    @classmethod
    def load_model(cls, model_path, device=DEFAULT_DEVICE):
        """Rebuild a model saved with ``save_model`` and put it in eval mode.

        ``device`` is used as ``map_location`` and is also forwarded to the
        constructors (it was previously ignored for construction).
        """
        res = torch.load(model_path, device)
        span_net = None
        if res['span_net_config'] is not None:
            span_kwargs = dict(res['span_net_config'])
            span_kwargs.setdefault('device', device)
            span_net = SpanNet(**span_kwargs)
        model_kwargs = dict(res['model_config'])
        model_kwargs.setdefault('device', device)
        model = cls(span_net=span_net, **model_kwargs)
        model.load_state_dict(res['model_state_dict'])
        model.eval()
        return model

    def save_model(self, output_path):
        """Serialise ``to_dict()`` to ``output_path`` with ``torch.save``."""
        torch.save(self.to_dict(), output_path)

    def _extract_sentence_vectors(self, sentences, pos=None, ent_bounds=None):
        """Encode pre-tokenised sentences and return the pooled token vectors.

        Optionally prefixes tokens with POS markers and wraps each span given
        in ``ent_bounds`` with ``[ENT]`` / ``[/ENT]``.
        """
        if pos and self.config.pos:
            # NOTE(review): zip() stops at the shorter sequence, so tokens
            # without a POS entry are dropped here -- confirm callers pass
            # POS lists covering every token.
            sentences = [[f'[{tag}] {tok}' for tok, tag in zip(sent, sent_pos)]
                         for sent, sent_pos in zip(sentences, pos)]
        if ent_bounds and self.config.use_ent_markers:
            # work on copies so the caller's token lists are left untouched
            sentences = [list(sent) for sent in sentences]
            for sent, sent_ents in zip(sentences, ent_bounds):
                for ent in sent_ents:
                    sent[ent[0]] = f'[ENT] {sent[ent[0]]}'
                    sent[ent[1]] = f'{sent[ent[1]]} [/ENT]'
        outs = self.transformer(sentences, is_pretokenized=True,
                                token_pooling=self.config.token_pooling)
        return outs.pooled_tokens

    def forward(self, sentences, pos=None, tags=None, **kwargs):
        """Classify spans; return predictions (no ``tags``) or a loss.

        :param sentences: batch of pre-tokenised sentences
        :param pos: optional POS tags aligned with ``sentences``
        :param tags: optional gold tag sequences; when given the result
            carries ``'loss'`` instead of ``'pred_tags'``
        :param kwargs: ``pred_tags`` supplies span tags and bypasses
            ``span_net``; ``output_word_vecs`` / ``output_ent_scores`` add
            extra entries to the result
        :return: dict of outputs
        """
        out_dict = {}
        span_out = None  # stays None when span tags are supplied by the caller
        pred_span_seqs = kwargs.get('pred_tags', None)
        if pred_span_seqs is None:
            span_out = self.span_net(sentences, pos=pos,
                                     output_word_vecs=self.config.use_span_emb,
                                     tags=tags if self.config.tune_span_net else None)
            pred_span_seqs = span_out['pred_tags']
        bounds = [[e[1] for e in extract_spans(t, tagless=True)[3]]
                  for t in pred_span_seqs]
        if tags is not None:
            gold_spans = [list(extract_spans(t, tagless=True)[3]) for t in tags]
            # gold labels whose boundaries equal a predicted span exactly
            matches = [[[g[0]
                         for g in golds if p[0] == g[1][0] and p[1] == g[1][1]]
                        for p in preds]
                       for preds, golds in zip(bounds, gold_spans)]
            targets = [[span_matches[0] if len(span_matches) == 1 else 'O'
                        for span_matches in sent_matches]
                       for sent_matches in matches]

        # append "<sep> span tokens" for every span, plus a closing separator
        sep_token = self.transformer.tokenizer.sep_token
        sentences = [sent + [t for bd in sent_bounds
                             for t in [sep_token] + sent[bd[0]:bd[1] + 1]]
                     + [sep_token]
                     for sent, sent_bounds in zip(sentences, bounds)]
        sep_ids = [[i for i, s in enumerate(sent) if s == sep_token]
                   for sent in sentences]
        embs = self._extract_sentence_vectors(sentences, pos, bounds)
        if kwargs.get('output_word_vecs', False):
            out_dict['word_vecs'] = embs
        span_vecs = [
            torch.stack([torch.cat((torch.sum(e[b[0]:b[1] + 1], dim=0),
                                    torch.sum(e[spi[i]:spi[i + 1] + 1], dim=0)))
                         for i, b in enumerate(bd)])
            if bd else torch.zeros((0)).to(self.device)
            for e, bd, spi in zip(embs, bounds, sep_ids)]
        ent_scores = [self.ent_output_layer(sv) if len(sv) else sv
                      for sv in span_vecs]
        if kwargs.get('output_ent_scores', False):
            out_dict['ent_scores'] = ent_scores
            out_dict['bounds'] = bounds

        if tags is None:
            # same reconstruction as from_ent_scores, over the extended sentences
            out_dict['pred_tags'] = self.from_ent_scores(ent_scores, sentences, bounds)
            return out_dict

        ent_targs = torch.tensor([self.ent_tags.index(t)
                                  for targ in targets for t in targ],
                                 dtype=torch.long).to(self.device)
        ent_preds = torch.cat(ent_scores)
        if not len(ent_preds):
            out_dict['loss'] = None
            return out_dict
        out_dict['loss'] = cross_entropy(ent_preds, ent_targs)
        # span_out is None when pred_tags was passed in; there is then no span
        # loss to add (this used to raise NameError).
        if self.config.tune_span_net and span_out is not None:
            out_dict['loss'] = out_dict['loss'] + span_out['loss']
        return out_dict

    def from_ent_scores(self, ent_scores, sentences, bounds):
        """Rebuild per-sentence ``B-x`` / ``I-x`` / ``O`` tags from span scores.

        :param ent_scores: per sentence, one score vector per span
        :param sentences: sentences; only their lengths are used
        :param bounds: per sentence, ``[start, end]`` of every scored span
        :return: one tag list per sentence
        """
        max_tags = [[self.ent_tags[torch.argmax(e)] for e in es]
                    for es in ent_scores]
        combined_sequences = []
        for sent_tags, sent_bounds, sent in zip(max_tags, bounds, sentences):
            seq = ['O'] * len(sent)
            for tag, (start, end) in zip(sent_tags, sent_bounds):
                if tag == 'O':
                    prefix_b = prefix_i = 'O'
                else:
                    prefix_b, prefix_i = f'B-{tag}', f'I-{tag}'
                seq[start] = prefix_b
                for i in range(start + 1, end + 1):
                    seq[i] = prefix_i
            combined_sequences.append(seq)
        return combined_sequences

utils.py ADDED Viewed

	@@ -0,0 +1,420 @@

+import pickle
+import pyarabic.araby as araby
+# import stanza
+import numpy as np
class Config:
    """Empty attribute container; callers attach settings as plain attributes."""

    def __init__(self):
        super().__init__()
def read_conll_ner(path):
    """Read a whitespace-separated CoNLL file.

    Sentences are separated by blank lines. Lines starting with ``#`` are
    skipped when they appear before the first token of a sentence.

    :param path: path of the CoNLL file (read as UTF-8)
    :return: ``[sentences, uniq_col_1, uniq_col_2, ...]`` where ``sentences``
        is a list of sentences (each a list of column lists) and
        ``uniq_col_k`` lists the distinct values of column ``k`` in order of
        first appearance
    """
    sentences = []
    unique_entries = []
    curr_sentence = []
    # explicit encoding: the locale default is not reliable for non-ASCII corpora
    with open(path, encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                if curr_sentence:
                    sentences.append(curr_sentence)
                curr_sentence = []
                continue
            if line.startswith('#') and not curr_sentence:
                continue
            entry = line.split()
            curr_sentence.append(entry)
            if not unique_entries:
                unique_entries = [[] for _ in entry[1:]]
            for value, seen in zip(entry[1:], unique_entries):
                if value not in seen:
                    seen.append(value)
    # a file without a trailing blank line still ends with a sentence
    if curr_sentence:
        sentences.append(curr_sentence)
    return [sentences] + unique_entries
def read_pickled_conll(path):
    """Load and return the object pickled in the file at ``path``."""
    with open(path, "rb") as handle:
        return pickle.load(handle)
def split_conll_docs(conll_sents, skip_docstart=True):
    """Group sentences into documents delimited by ``-DOCSTART-`` sentences.

    :param conll_sents: list of sentences (each a list of column lists)
    :param skip_docstart: drop the ``-DOCSTART-`` sentence itself when True
    :return: list of documents, each a non-empty list of sentences
    """
    docs = []
    curr_doc = []
    for sent in conll_sents:
        if sent and sent[0][0] == '-DOCSTART-':
            if curr_doc:
                docs.append(curr_doc)
                curr_doc = []
            if skip_docstart:
                continue
        curr_doc.append(sent)
    # only flush a real document: empty input or a trailing -DOCSTART- used
    # to produce a spurious empty document
    if curr_doc:
        docs.append(curr_doc)
    return docs
def create_context_data(docs, pos_col_id=1, tag_col_id=3, context_length=1, **kwargs):
    """Attach context to documents according to ``ctx_type``.

    :param docs: documents; dicts for the ``cand_*`` / ``raw_text`` /
        ``infobox`` types, lists of CoNLL rows otherwise
    :param pos_col_id: column holding POS tags (``<= 0`` disables POS output)
    :param tag_col_id: column holding NER tags
    :param context_length: number of sentences per sample in the default
        mode; ``<= 0`` means the whole document
    :param kwargs: ``ctx_type`` (default ``"other"``), ``sep_token`` (default
        ``"[SEP]"``) and, depending on the type, ``matched_spans``,
        ``pickled_data`` or ``infobox_keys_path``
    :return: ``docs`` with a ``ctx_sent`` entry added to each dict, or a list
        of samples for the list-based types
    """
    ctx_type = kwargs.get("ctx_type", "other")
    sep_token = kwargs.get("sep_token", "[SEP]")

    def sep_joined(items):
        # Interleave items with the separator; the join/split round trip is
        # kept so that edge cases (e.g. an empty list) behave as before.
        return f"<split>{sep_token}<split>".join(items).split("<split>")

    def candidate_titles(doc):
        return sep_joined([cand["doc_title"] for cand in doc["BM25_cands"]])

    def linked_titles(doc):
        return sep_joined([linked for cand in doc["BM25_cands"]
                           for linked in cand["linked_titles"]])

    def tokens_with_spans(sent, spans):
        # sentence tokens followed by "<sep> span tokens" for every matched span
        return [row[0] for row in sent] + [t for span in spans
                                           for t in [sep_token] + span[1]]

    if ctx_type == "cand_titles":
        # create context for candidate titles scenario
        for doc in docs:
            doc["ctx_sent"] = doc["query"] + [sep_token] + candidate_titles(doc)
        return docs

    if ctx_type == "cand_links":
        for doc in docs:
            doc["ctx_sent"] = (doc["query"] + [sep_token] + candidate_titles(doc)
                               + [sep_token] + linked_titles(doc))
        return docs

    if ctx_type == "raw_text":
        # create context for candidate raw text
        for doc in docs:
            doc["ctx_sent"] = [doc["query"] + [sep_token] + [cand["processed_text"]]
                               for cand in doc["BM25_cands"]]
        return docs

    if ctx_type == 'matched_spans':
        matched_spans = kwargs.get('matched_spans')
        return [
            [tokens_with_spans(d, spans),                                    # sentence tokens + spans
             None,                                                           # pos tags
             [row[tag_col_id] for row in d] if tag_col_id > 0 else None,     # ner tags
             [len(d)]                                                        # sentence length
             ]
            for d, spans in zip(docs, matched_spans)]

    if ctx_type == 'bm25_matched_spans':
        matched_spans = kwargs.get('matched_spans')
        pickled_data = kwargs.get('pickled_data')
        docs = [[tokens_with_spans(d, spans),      # sentence tokens + spans
                 None,                             # pos tags
                 [row[tag_col_id] for row in d],   # ner tags
                 [len(d)]                          # sentence length
                 ]
                for d, spans in zip(docs, matched_spans)]
        for sample, doc in zip(docs, pickled_data):
            sample[0] = (sample[0] + [sep_token] + candidate_titles(doc)
                         + [sep_token] + linked_titles(doc))
        return docs

    if ctx_type == "infobox":
        infobox_keys_path = kwargs.get("infobox_keys_path")
        infobox_keys = read_pickled_conll(infobox_keys_path)
        if docs and 'pred_spans' in docs[0]:
            docs = get_pred_ent_bounds(docs)
        for doc in docs:
            bounds_key = 'pred_ent_bounds' if 'pred_spans' in doc else 'ent_bounds'
            ents = [' '.join(doc['query'][bd[0]:bd[1] + 1]) for bd in doc[bounds_key]]
            ents_wo_space = [''.join(doc['query'][bd[0]:bd[1] + 1]) for bd in doc[bounds_key]]
            # de-duplicate while keeping a deterministic order
            ents = list(dict.fromkeys(ents + ents_wo_space))
            # copy each entry: adding the markers in place would modify
            # infobox_keys, so an entity seen in several documents would
            # collect one more "[INFO]" / "[/INFO]" each time
            infobox = [list(infobox_keys[en]) for en in ents
                       if en in infobox_keys and infobox_keys[en]]
            for ibs in infobox:
                ibs[0] = '[INFO] ' + ibs[0]
                ibs[-1] = ibs[-1] + ' [/INFO]'
            infobox = [i for j in infobox for i in j]
            doc["ctx_sent"] = doc["query"] + [sep_token] + infobox
        return docs

    # create context type for other scenarios
    res = []
    for doc in docs:
        ctx_len = context_length if context_length > 0 else len(doc)
        # for the last sentences loop around to the beginning for context
        padded_doc = doc + doc[:ctx_len]
        for i in range(len(doc)):
            window = padded_doc[i:i + ctx_len]
            res.append((
                [row[0] for sent in window for row in sent],
                [row[pos_col_id] for sent in window for row in sent] if pos_col_id > 0 else None,
                [row[tag_col_id] for sent in window for row in sent],
                [len(sent) for sent in window],
                {}  # dictionary for extra context
            ))
    return res
def calc_correct(sentence):
    """Count gold chunks, predicted chunks and exact matches in one sentence.

    :param sentence: rows of five columns; the last two are the gold tag and
        the predicted tag
    :return: dict with ``correct`` (gold chunks whose aligned predictions are
        identical), ``correct_tagless`` (same, comparing only the first
        character of each tag), ``gold_count`` and ``pred_count``
    """
    gold_chunks, aligned_chunks, pred_chunks = [], [], []
    open_gold, open_aligned, open_pred = [], [], []
    last_pred_type = None

    for row in sentence:
        _, _, _, gold, pred = row
        pred_type = pred.split('-')[1] if '-' in pred else None

        # --- gold side: grow or close the gold chunk and its aligned predictions
        if gold.startswith('B'):
            if open_gold:
                gold_chunks.append(open_gold)
                aligned_chunks.append(open_aligned)
            open_gold, open_aligned = [gold], [pred]
        elif gold.startswith('I') or (pred.startswith('I')
                                      and pred_type == last_pred_type
                                      and open_gold):
            open_gold.append(gold)
            open_aligned.append(pred)
        elif gold.startswith('O') and pred.startswith('O'):
            if open_gold:
                gold_chunks.append(open_gold)
                aligned_chunks.append(open_aligned)
                open_gold, open_aligned = [], []

        # --- prediction side: grow or close the predicted chunk
        if pred.startswith('O'):
            if open_pred:
                pred_chunks.append(open_pred)
                open_pred = []
        elif pred.startswith('B'):
            if open_pred:
                pred_chunks.append(open_pred)
            open_pred = [pred]
            last_pred_type = pred_type
        else:
            # a type change inside a chunk closes it and starts a new one
            if last_pred_type is not None and pred_type != last_pred_type:
                last_pred_type = pred_type
                if open_pred:
                    pred_chunks.append(open_pred)
                    open_pred = []
            open_pred.append(pred)

    # flush whatever is still open at the end of the sentence
    if open_gold:
        gold_chunks.append(open_gold)
        aligned_chunks.append(open_aligned)
    if open_pred:
        pred_chunks.append(open_pred)

    correct = 0
    correct_tagless = 0
    for gold_chunk, aligned in zip(gold_chunks, aligned_chunks):
        pairs = list(zip(gold_chunk, aligned))
        if not [1 for g, p in pairs if g != p]:
            correct += 1
        if not [1 for g, p in pairs if g[0] != p[0]]:
            correct_tagless += 1

    return {'correct': correct,
            'correct_tagless': correct_tagless,
            'gold_count': len(gold_chunks),
            'pred_count': len(pred_chunks)}
def tag_sentences(sentences, lang='en'):
    """POS-tag raw sentences with a stanza pipeline.

    :param sentences: iterable of raw sentence strings
    :param lang: stanza language code for the pipeline (default ``'en'``)
    :return: one list per input sentence, holding ``[text, upos]`` pairs
    :raises ImportError: if the optional ``stanza`` package is not installed
    """
    if not sentences:
        # nothing to tag: avoid loading the pipeline at all
        return []
    # Imported here because the module-level import is commented out, which
    # left the name `stanza` undefined when this function was called.
    import stanza
    nlp = stanza.Pipeline(lang=lang, processors='tokenize,pos', logging_level='WARNING')
    tagged_sents = []
    for sentence in sentences:
        parsed = nlp(sentence)
        tagged_sents.append([[w.text, w.upos]
                             for s in parsed.sentences
                             for w in s.words])
    return tagged_sents
def extract_spans(sentence, tagless=False):
    """Extract entity spans from a sequence of BIO tags.

    :param sentence: list of tags (``B``/``I``/``O``, optionally ``B-x``/``I-x``;
        tags starting with ``-`` are treated like ``O``)
    :param tagless: when True the entity type after ``-`` is ignored, so a
        type change inside a span does not split it
    :return: tuple ``(spans_positions, span_bounds, all_bounds, tagged_bounds)``:
        ``spans_positions`` holds ``[span_tags, index_in_all_bounds]``,
        ``span_bounds`` holds ``[start, end]`` (inclusive),
        ``all_bounds`` holds ``[[start, end], 'E', idx]`` for spans and
        ``[[i], 'W', idx]`` for outside tokens, and ``tagged_bounds`` holds
        ``[type_or_first_tag, [start, end]]``
    """
    spans_positions = []
    span_bounds = []
    all_bounds = []
    curr_tag = None
    curr_span = []
    curr_span_start = -1

    def close_span(end):
        # record the open span as ending (inclusive) at `end`
        spans_positions.append([curr_span, len(all_bounds)])
        span_bounds.append([curr_span_start, end])
        all_bounds.append([[curr_span_start, end], 'E', len(all_bounds)])

    for i, token in enumerate(sentence):
        if token.startswith('B'):
            if curr_span:
                close_span(i - 1)
                curr_span = []
            curr_span.append(token)
            curr_tag = None if tagless else token.split('-')[1]
            curr_span_start = i
        elif token.startswith('I'):
            if not tagless:
                tag = token.split('-')[1]
                if tag != curr_tag and curr_tag is not None:
                    # entity type changed mid-span: close and start a new span
                    close_span(i - 1)
                    curr_span = []
                    curr_tag = tag
                    curr_span_start = i
                elif curr_tag is None:
                    # "I" without a preceding "B" opens a span
                    curr_span = []
                    curr_tag = tag
                    curr_span_start = i
            elif not curr_span:
                curr_span_start = i
            curr_span.append(token)
        elif token.startswith('O') or token.startswith('-'):
            if curr_span:
                close_span(i - 1)
                curr_span = []
            curr_tag = None
            all_bounds.append([[i], 'W', len(all_bounds)])

    # check if sentence ended with a span
    if curr_span:
        close_span(len(sentence) - 1)

    tagged_bounds = [[loc[0][0].split('-')[1] if '-' in loc[0][0] else loc[0][0], bound]
                     for loc, bound in zip(spans_positions, span_bounds)]
    return spans_positions, span_bounds, all_bounds, tagged_bounds
def ner_corpus_stats(corpus_path):
    """Compute entity statistics for a CoNLL corpus (NER tags in column 3).

    :param corpus_path: path of the CoNLL file
    :return: tuple ``(len_stats, tag_len_stats, tag_counts, pos_stats,
        tag_pos_stats)``: the distribution of span lengths, the same per
        entity type, the relative frequency of each type (ascending), the
        relative frequency of each POS pattern (descending) and the POS
        pattern distribution per type (descending). All parts are empty when
        the corpus has no entity span.
    """
    onto_train_cols = read_conll_ner(corpus_path)
    sentences = onto_train_cols[0]
    tag_column = onto_train_cols[3] if len(onto_train_cols) > 3 else []
    tags = list(set([t.split('-')[1] for t in tag_column if '-' in t]))
    # tagged bounds: [entity_type, [start, end]] for every span
    onto_train_spans = [extract_spans([t[3] for t in sent])[3] for sent in sentences]
    flat_spans = [span for sent in onto_train_spans for span in sent]
    span_lens = [bound[1] - bound[0] + 1 for _, bound in flat_spans]
    if not span_lens:
        # no entity at all: nothing to normalise by
        return [], {}, [], [], {}

    len_stats = [span_lens.count(i + 1) / len(span_lens)
                 for i in range(max(span_lens))]

    tag_lens_dict = {k: [] for k in tags}
    tag_counts_dict = {k: 0 for k in tags}
    for span_tag, bound in flat_spans:
        # span_tag is already the bare entity type; splitting its first
        # character on '-' (as before) raised IndexError
        tag_lens_dict[span_tag].append(bound[1] - bound[0] + 1)
        tag_counts_dict[span_tag] += 1

    tag_counts = [[tag, count / len(span_lens)]
                  for tag, count in sorted(tag_counts_dict.items(), key=lambda l: l[1])]
    tag_len_stats = {k: [v.count(i + 1) / len(v) for i in range(max(v))] if v else []
                     for k, v in tag_lens_dict.items()}

    span_texts = [sent[span[1][0]:span[1][1] + 1]
                  for sent, spans in zip(sentences, onto_train_spans)
                  for span in spans]
    # [entity type taken from the last column of the first row, POS pattern]
    span_pos = [[span[0][-1].split('-')[1], '_'.join(t[1] for t in span)]
                for span in span_texts]
    unique_pos = list(set([span[1] for span in span_pos]))
    pos_dict = {k: 0 for k in unique_pos}
    for span in span_pos:
        pos_dict[span[1]] += 1
    unique_pos.sort(key=lambda l: pos_dict[l], reverse=True)
    pos_stats = [[p, pos_dict[p] / len(span_pos)] for p in unique_pos]

    tag_pos_dict = {kt: {kp: 0 for kp in unique_pos} for kt in tags}
    for span in span_pos:
        tag_pos_dict[span[0]][span[1]] += 1
    tag_pos_stats = {kt: [[p, tag_pos_dict[kt][p] / tag_counts_dict[kt]]
                          for p in unique_pos] for kt in tags}
    for kt in tags:
        tag_pos_stats[kt].sort(key=lambda l: l[1], reverse=True)
    return len_stats, tag_len_stats, tag_counts, pos_stats, tag_pos_stats
def filter_by_max_ents(sentences, max_ent_length):
    """
    Keep only the sentences whose named entities all span at most
    max_ent_length tokens; sentences without entities are kept.
    :param sentences: sentences in conll format as extracted by read_conll_ner
    :param max_ent_length: The maximum number of tokens in an entity
    :return: a list of sentences
    """
    def longest_entity(sent):
        # length of the longest entity, or None when the sentence has none
        bounds = extract_spans([t[3] for t in sent])[1]
        return max((b[1] - b[0] + 1 for b in bounds), default=None)

    kept = []
    for sent in sentences:
        longest = longest_entity(sent)
        if longest is None or longest <= max_ent_length:
            kept.append(sent)
    return kept
def get_pred_ent_bounds(docs):
    """Add ``pred_ent_bounds`` to every doc from its ``pred_spans`` tags.

    :param docs: list of dicts with a ``pred_spans`` list of 'B'/'I'/other tags
    :return: the same ``docs`` list; each doc gains ``pred_ent_bounds``, a
        list of inclusive ``[start, end]`` pairs
    """
    for doc in docs:
        bounds = []
        for pos, tag in enumerate(doc['pred_spans']):
            if tag == 'B':
                bounds.append([pos, pos])
            elif tag == 'I' and bounds and bounds[-1][1] == pos - 1:
                # extend only a directly adjacent entity; an "I" separated
                # from it by other tags used to stretch the entity across
                # the gap and is now ignored like a leading "I"
                bounds[-1][1] = pos
        doc['pred_ent_bounds'] = bounds
    return docs
def enumerate_spans(batch):
    """List every ``[start, end]`` pair with ``start <= end`` for each sentence.

    :param batch: list of sentences (only their lengths are used)
    :return: one list of spans per sentence, ordered by start then end
    """
    return [[[start, end]
             for start in range(len(sentence))
             for end in range(start, len(sentence))]
            for sentence in batch]
def compact_span_enumeration(batch):
    """List all ``[x, y]`` index pairs of each sentence, ``y`` varying slowest.

    :param batch: list of sentences (only their lengths are used)
    :return: one list of ``[x, y]`` pairs per sentence
    """
    # NOTE(review): unlike enumerate_spans this also yields pairs with x > y;
    # confirm whether that is intended.
    enumerated = []
    for sentence in batch:
        size = len(sentence)
        pairs = []
        for y in range(0, size):
            for x in range(size):
                pairs.append([x, y])
        enumerated.append(pairs)
    return enumerated
def preprocess_data(data):
    """Strip tashkeel and tatweel from the tokens of every sample.

    :param data: samples whose first element is the token list
    :return: new list of samples; the remaining elements are carried over
    """
    def clean(token):
        return araby.strip_tatweel(araby.strip_tashkeel(token))

    return [[[clean(token) for token in sample[0]], *sample[1:]]
            for sample in data]
def generate_targets(enumerated_spans, sentences):
    """Build 0/1 targets marking which enumerated spans are gold entities.

    :param enumerated_spans: per sentence, the list of ``[start, end]`` spans
    :param sentences: per sentence, the gold BIO tag sequence
    :return: per sentence, a list of ints aligned with its enumerated spans
        (1 for a span whose bounds equal a gold entity, 0 otherwise)
    """
    gold_bounds = [[tagged[1] for tagged in extract_spans(sentence, True)[3]]
                   for sentence in sentences]
    targets = []
    for spans, gold in zip(enumerated_spans, gold_bounds):
        # first position of every span, for O(1) lookup
        position = {}
        for idx, span in enumerate(spans):
            position.setdefault(tuple(span), idx)
        labels = [0] * len(spans)
        for bound in gold:
            idx = position.get(tuple(bound))
            # a gold span missing from the enumeration is skipped rather
            # than raising ValueError
            if idx is not None:
                labels[idx] = 1
        targets.append(labels)
    return targets
def label_tags(tags):
    """Map each tag to 0 when it equals ``"O"`` and to 1 otherwise."""
    return [0 if tag == "O" else 1 for tag in tags]

validate.py ADDED Viewed

	@@ -0,0 +1,168 @@

+import argparse
+import re
+import torch
+from tqdm.auto import tqdm
+from .network import EntNet
+from .utils import read_conll_ner, split_conll_docs, create_context_data, extract_spans
# Decide once, at import time, whether CUDA can be used, and expose the
# matching torch.device as a module-level default.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
def classify(model, sents, pos, batch_size):
    """Tag every sentence with ``model``, processing ``batch_size`` at a time.

    The model is switched to eval mode and called as
    ``model(sentences=..., pos=...)``; the ``'pred_tags'`` entry of each
    returned mapping is collected.

    Args:
        model: module-like object with ``eval()`` that is callable as above.
        sents: list of sentences (each a sequence of words).
        pos: list aligned with ``sents``; sliced with the same batch bounds.
        batch_size: number of sentences per forward call.

    Returns:
        One list per sentence containing ``[word, tag]`` pairs. Pairing uses
        ``zip``, so it stops at the shorter of words/tags.
    """
    model.eval()
    result = []
    # Pure inference: disable autograd so no graph is built or kept alive
    # for each batch.
    with torch.no_grad():
        for i in tqdm(range(0, len(sents), batch_size), desc='classifying... '):
            tag_seqs = model(sentences=sents[i:i + batch_size],
                             pos=pos[i:i + batch_size])
            result.extend(tag_seqs['pred_tags'])
    return [[[w, t] for w, t in zip(s, r)] for s, r in zip(sents, result)]
def entities_from_token_classes(tokens):
    """Group a sequence of B/I/O-style tag strings into entity spans.

    A tag matching ``^B`` opens a new entity (closing any entity that is
    still open). A tag matching ``^I`` extends the open entity; its suffix
    is not compared with the entity's type. Any other tag closes the open
    entity. Tags seen while no entity is open (other than ``B`` tags) are
    ignored.

    Args:
        tokens: iterable of tag strings such as ``"B-PER"``, ``"I-PER"``,
            ``"O"``.

    Returns:
        A list of ``{"type": ..., "index": [start, end]}`` dicts in order of
        appearance, with ``end`` inclusive. ``type`` is the piece between the
        first and second ``-`` of the opening tag (so ``"B-A-B"`` yields
        ``"A"``), or ``''`` when the opening tag contains no ``-``.
    """
    begin_pattern = r"^B"  # -(\w+)"
    inside_pattern = r"^I"  # -(\w+)"

    def make_entity(kind, first, last):
        return {"type": kind, "index": [first, last]}

    spans = []
    open_kind = None  # None means no entity is currently open
    span_start = span_end = 0

    for position, label in enumerate(tokens):
        if re.match(begin_pattern, label) is not None:
            # A new entity starts here; flush the previous one first.
            if open_kind is not None:
                spans.append(make_entity(open_kind, span_start, span_end))
            open_kind = label.split('-')[1] if '-' in label else ''
            span_start = span_end = position
        elif open_kind is None:
            # Outside any entity: nothing to extend or close.
            continue
        elif re.match(inside_pattern, label) is not None:
            span_end = position
        else:
            # First tag after the entity ended.
            spans.append(make_entity(open_kind, span_start, span_end))
            open_kind = None

    # An entity that runs to the end of the sequence is still open here.
    if open_kind is not None:
        spans.append(make_entity(open_kind, span_start, span_end))
    return spans
def calc_f1(targs, preds):
    """Compute span-level precision / recall / F1, overall and per type.

    A predicted entity is an unlabelled hit when its ``index`` start and end
    both equal those of a target entity in the same sentence; it is a
    labelled hit when the ``type`` matches too.

    Args:
        targs: per-sentence lists of target entities, each a dict with
            ``'type'`` and ``'index'`` (``[start, end]``).
        preds: per-sentence lists of predicted entities in the same format,
            aligned with ``targs`` (paired with ``zip``).

    Returns:
        A dict keyed by ``'overall'`` and by every entity type seen. Each
        value holds the counts ``lab_tp``/``targs``/``preds`` and the scores
        ``lab_p``/``lab_r``/``lab_f1``. ``'overall'`` additionally holds
        ``unl_tp``, ``unl_p``, ``unl_r``, ``unl_f1`` and ``macro_lab_f1``
        (mean of the per-type ``lab_f1``). Any ratio whose denominator is
        zero is reported as 0, including ``macro_lab_f1`` when no entity
        type occurs at all.
    """
    def ratio(numerator, denominator):
        return numerator / denominator if denominator else 0

    def f1_score(precision, recall):
        total = precision + recall
        return 2 * precision * recall / total if total else 0

    stat_dict = {
        'overall': {'unl_tp': 0, 'lab_tp': 0, 'targs': 0, 'preds': 0}
    }
    overall = stat_dict['overall']

    def stats_for(entity_type):
        # Create the per-type counters on first sight of a type.
        return stat_dict.setdefault(
            entity_type, {'lab_tp': 0, 'targs': 0, 'preds': 0})

    for sent_targs, sent_preds in zip(targs, preds):
        overall['targs'] += len(sent_targs)
        overall['preds'] += len(sent_preds)
        for pred in sent_preds:
            stats_for(pred['type'])['preds'] += 1
        for targ in sent_targs:
            targ_stats = stats_for(targ['type'])
            targ_stats['targs'] += 1
            # Is there a predicted span with exactly the same boundaries?
            for pred in sent_preds:
                if (pred['index'][0] == targ['index'][0]
                        and pred['index'][1] == targ['index'][1]):
                    overall['unl_tp'] += 1
                    # If so, does the type match as well?
                    if pred['type'] == targ['type']:
                        overall['lab_tp'] += 1
                        targ_stats['lab_tp'] += 1

    for key, stats in stat_dict.items():
        if key == 'overall':
            stats['unl_p'] = ratio(stats['unl_tp'], stats['preds'])
            stats['unl_r'] = ratio(stats['unl_tp'], stats['targs'])
            stats['unl_f1'] = f1_score(stats['unl_p'], stats['unl_r'])
        stats['lab_p'] = ratio(stats['lab_tp'], stats['preds'])
        stats['lab_r'] = ratio(stats['lab_tp'], stats['targs'])
        stats['lab_f1'] = f1_score(stats['lab_p'], stats['lab_r'])

    class_f1s = [v['lab_f1'] for k, v in stat_dict.items() if k != 'overall']
    # With no entity types at all there is nothing to average; report 0
    # instead of dividing by zero.
    overall['macro_lab_f1'] = sum(class_f1s) / len(class_f1s) if class_f1s else 0
    return stat_dict
def main(args):
    """Evaluate a saved model on a CoNLL-style test file and print scores.

    Reads ``args.test_path``, builds context windows of ``args.context_size``,
    tags all sentences in batches of ``args.batch_size`` and prints overall
    unlabelled, overall labelled (micro and macro) and per-type P/R/F1.

    Args:
        args: parsed command-line namespace providing ``test_path``,
            ``context_size``, ``model_path`` (list), ``span_model_path`` and
            ``batch_size``.

    NOTE(review): ``StagedEnsemble`` is not among the imports visible at the
    top of this file; the ensemble branch will raise NameError unless it is
    brought into scope elsewhere.
    """
    global device
    device = torch.device('cuda' if use_cuda else 'cpu')

    # Load the test set; each item exposes words at [0], pos at [1] and the
    # gold tags at [2].
    columns = read_conll_ner(args.test_path)
    documents = split_conll_docs(columns[0])
    test_data = create_context_data(documents, args.context_size)
    sents = [item[0] for item in test_data]
    pos = [item[1] for item in test_data]

    # A single model path with no span models loads one EntNet; anything
    # else goes through the ensemble.
    single_model = len(args.model_path) <= 1 and args.span_model_path is None
    if single_model:
        model = EntNet.load_model(args.model_path[0], device=device)
    else:
        model = StagedEnsemble(model_paths=args.model_path, span_model_paths=args.span_model_path, device=device)
    model.to(device)

    tagged = classify(model, sents, pos, args.batch_size)

    gold_entities = [entities_from_token_classes(item[2]) for item in test_data]
    pred_entities = [
        entities_from_token_classes([pair[1] for pair in sentence])
        for sentence in tagged
    ]
    result = calc_f1(gold_entities, pred_entities)

    overall = result["overall"]
    print(f'Overall unlabelled - F1:{overall["unl_f1"]}, '
          f'P:{overall["unl_p"]}, '
          f'R:{overall["unl_r"]}')
    print(f'Overall labelled - Micro F1:{overall["lab_f1"]}, '
          f'P:{overall["lab_p"]}, '
          f'R:{overall["lab_r"]}')
    print(f'Overall labelled - Macro F1:{overall["macro_lab_f1"]}')
    for entity_type, scores in result.items():
        if entity_type != 'overall':
            print(f'{entity_type} - F1:{scores["lab_f1"]}, P:{scores["lab_p"]}, R:{scores["lab_r"]}')
if __name__ == "__main__":
    # Command-line entry point: collect the options, then run the evaluation.
    cli = argparse.ArgumentParser()

    # One or more saved model paths (mandatory).
    cli.add_argument(
        '--model_path', type=str, nargs='+', default=None, required=True,
        help='',
    )
    # Zero or more span-model paths; stays None when the flag is absent.
    cli.add_argument(
        '--span_model_path', type=str, nargs='*', default=None,
        help='',
    )
    # Disabled options, kept for reference:
    #   --network_type  str, choices ['span', 'entity', 'joint'], required;
    #                   "If entity is chosen, a path to a span model is
    #                   required also"
    #   --cuda_id       int, default 0
    cli.add_argument('--test_path', type=str, default=None, help='')
    cli.add_argument('--context_size', type=int, default=1, help='')
    cli.add_argument('--batch_size', type=int, default=8, help='')

    args = cli.parse_args()
    main(args)