Commit 66e5e13
Parent(s): bdec808
Add application file

Files changed:
- app.py +57 -0
- configuration.py +13 -0
- dataset.py +12 -0
- model.py +46 -0
- models_file/config.pth +3 -0
- models_file/microsoft-deberta-base_0.9449373420387531_8_best.pth +3 -0
- models_file/tokenizer/merges.txt +0 -0
- models_file/tokenizer/special_tokens_map.json +51 -0
- models_file/tokenizer/tokenizer.json +0 -0
- models_file/tokenizer/tokenizer_config.json +66 -0
- models_file/tokenizer/vocab.json +0 -0
- requirements.txt +5 -0
- utils.py +104 -0
app.py
ADDED
@@ -0,0 +1,57 @@
+if __name__ == '__main__':
+    inputs = ['gbjjhbdjhbdgjhdbfjhsdkjrkjf', 'fdjhbjhsbd']
+    from transformers import AutoTokenizer
+    from model import CustomModel
+    import torch
+    from configuration import CFG
+    from dataset import SingleInputDataset
+    from torch.utils.data import DataLoader
+    from utils import inference_fn, get_char_probs, get_results, get_text
+    import numpy as np
+    import gradio as gr
+
+    device = torch.device('cpu')
+    tokenizer = AutoTokenizer.from_pretrained('models_file/tokenizer')
+    # Forward-slash paths: the original backslash paths only resolve on Windows
+    model = CustomModel(CFG, config_path='models_file/config.pth', pretrained=False)
+    state = torch.load('models_file/microsoft-deberta-base_0.9449373420387531_8_best.pth',
+                       map_location=torch.device('cpu'))
+    model.load_state_dict(state['model'])
+
+    def get_answer(context, feature):
+        # Build the model input from the patient history (context) and the feature text
+        inputs_single = tokenizer(context, feature,
+                                  add_special_tokens=True,
+                                  max_length=CFG.max_len,
+                                  padding="max_length",
+                                  return_offsets_mapping=False)
+        for k, v in inputs_single.items():
+            inputs_single[k] = torch.tensor(v, dtype=torch.long)
+
+        # Create a new dataset containing only the input sample,
+        # and a DataLoader for it
+        single_input_dataset = SingleInputDataset(inputs_single)
+        single_input_loader = DataLoader(single_input_dataset,
+                                         batch_size=1,
+                                         shuffle=False,
+                                         num_workers=2)
+
+        # Perform inference on the single input
+        output = inference_fn(single_input_loader, model, device)
+
+        prediction = output.reshape((1, CFG.max_len))
+        char_probs = get_char_probs([context], prediction, tokenizer)
+        predictions = np.mean([char_probs], axis=0)
+        results = get_results(predictions, th=0.5)
+
+        print(results)
+        return get_text(context, results[0])
+
+    # Smoke test with the dummy strings above; this must run before `inputs`
+    # is rebound to Gradio components and before the blocking launch() call
+    # (the original script called it last, with component objects as arguments)
+    print(get_answer(inputs[0], inputs[1]))
+
+    # gr.inputs/gr.outputs were removed in later Gradio releases;
+    # use the top-level components instead
+    inputs = [gr.Textbox(label="Context Para", lines=10), gr.Textbox(label="Question", lines=1)]
+    output = gr.Textbox(label="Answer")
+
+    app = gr.Interface(fn=get_answer, inputs=inputs, outputs=output, allow_flagging='never')
+
+    app.launch()
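Note: the flow inside get_answer is tokenize, wrap in SingleInputDataset/DataLoader, run inference_fn, map token probabilities to characters, threshold into spans, and extract text. A minimal sketch of a call, with purely hypothetical input strings (the commit's own smoke test uses throwaway gibberish):

    # Hypothetical inputs for illustration; runs inside the same __main__ block
    context = "17-year-old male presents with palpitations for the past week."
    feature = "palpitations"
    print(get_answer(context, feature))  # prints the span(s) of `context` the model flags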
configuration.py
ADDED
@@ -0,0 +1,13 @@
+# ====================================================
+# CFG
+# ====================================================
+class CFG:
+    print_freq=100
+    num_workers=0
+    model="microsoft/deberta-base"
+    token="microsoft/deberta-base"
+    fc_dropout=0.2
+    max_len=739
+    weight_decay=0.01
+    project_folder = '/content/drive/MyDrive/Projects/Exigent/POC-V1/'
+    matching_data = 'matching_data.csv'
dataset.py
ADDED
@@ -0,0 +1,12 @@
+from torch.utils.data import Dataset
+
+# A custom dataset class that wraps a single input sample
+class SingleInputDataset(Dataset):
+    def __init__(self, input_single):
+        self.sample = input_single
+
+    def __len__(self):
+        return 1
+
+    def __getitem__(self, index):
+        return self.sample
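For reference, PyTorch's default collate function turns the wrapped dict of 1-D tensors into batched 2-D tensors. A minimal sketch, assuming the tensor keys produced by the tokenizer call in app.py (the sample values here are hypothetical stand-ins):

    import torch
    from torch.utils.data import DataLoader
    from dataset import SingleInputDataset

    # Hypothetical stand-in for one tokenized sample
    sample = {'input_ids': torch.zeros(739, dtype=torch.long),
              'attention_mask': torch.ones(739, dtype=torch.long)}
    batch = next(iter(DataLoader(SingleInputDataset(sample), batch_size=1)))
    print(batch['input_ids'].shape)  # torch.Size([1, 739]) -- collate adds the batch dimension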
model.py
ADDED
@@ -0,0 +1,46 @@
+import torch
+import torch.nn as nn
+from transformers import AutoConfig, AutoModel
+
+# ====================================================
+# Model
+# ====================================================
+class CustomModel(nn.Module):
+    def __init__(self, cfg, config_path=None, pretrained=False):
+        super().__init__()
+        self.cfg = cfg
+
+        if config_path is None:
+            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
+        else:
+            self.config = torch.load(config_path)
+        if pretrained:
+            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
+        else:
+            self.model = AutoModel.from_config(self.config)
+        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
+        self.fc = nn.Linear(self.config.hidden_size, 1)
+        self._init_weights(self.fc)
+
+    def _init_weights(self, module):
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+    def feature(self, inputs):
+        outputs = self.model(**inputs)
+        last_hidden_states = outputs[0]
+        return last_hidden_states
+
+    def forward(self, inputs):
+        feature = self.feature(inputs)
+        output = self.fc(self.fc_dropout(feature))
+        return output
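The linear head maps each token's hidden state to a single logit, so the model scores every token independently rather than predicting a single start/end pair. A sketch of the expected output shape, assuming the committed models_file/config.pth (weights are random here because pretrained=False and no state dict is loaded):

    import torch
    from configuration import CFG
    from model import CustomModel

    model = CustomModel(CFG, config_path='models_file/config.pth', pretrained=False)
    dummy = {'input_ids': torch.zeros(1, CFG.max_len, dtype=torch.long),
             'attention_mask': torch.ones(1, CFG.max_len, dtype=torch.long)}
    with torch.no_grad():
        out = model(dummy)
    print(out.shape)  # torch.Size([1, 739, 1]) -- one logit per token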
models_file/config.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:44242dd46e256e33385a5be4979c8df941af4ae4d8ad5f2feb5315d114da5f98
+size 2541
models_file/microsoft-deberta-base_0.9449373420387531_8_best.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:994ef334eed041e7b0d62f2ad3f97444adcac4696a8027a5b14bf803bb27265f
+size 555618276
models_file/tokenizer/merges.txt
ADDED
The diff for this file is too large to render. See raw diff.
models_file/tokenizer/special_tokens_map.json
ADDED
@@ -0,0 +1,51 @@
+{
+  "bos_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "[MASK]",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}
models_file/tokenizer/tokenizer.json
ADDED
The diff for this file is too large to render. See raw diff.
models_file/tokenizer/tokenizer_config.json
ADDED
@@ -0,0 +1,66 @@
+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "bos_token": {
+    "__type": "AddedToken",
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "clean_up_tokenization_spaces": true,
+  "cls_token": {
+    "__type": "AddedToken",
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "do_lower_case": false,
+  "eos_token": {
+    "__type": "AddedToken",
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "errors": "replace",
+  "mask_token": {
+    "__type": "AddedToken",
+    "content": "[MASK]",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "model_max_length": 512,
+  "pad_token": {
+    "__type": "AddedToken",
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "__type": "AddedToken",
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "tokenizer_class": "DebertaTokenizer",
+  "unk_token": {
+    "__type": "AddedToken",
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "vocab_type": "gpt2"
+}
models_file/tokenizer/vocab.json
ADDED
The diff for this file is too large to render. See raw diff.
requirements.txt
ADDED
@@ -0,0 +1,5 @@
+torch
+transformers
+numpy
+scikit-learn
+gradio
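Because none of these dependencies are pinned, a rebuild of the Space can pull in releases with breaking API changes (the legacy gr.inputs/gr.outputs namespaces, for instance, were removed in later Gradio versions). A pinned variant would make builds reproducible; the version numbers below are hypothetical placeholders, not versions this commit was actually tested against:

    torch==1.13.1
    transformers==4.26.1
    numpy==1.23.5
    scikit-learn==1.2.1
    gradio==3.18.0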
utils.py
ADDED
@@ -0,0 +1,104 @@
+import itertools
+import torch
+import numpy as np
+from tqdm.auto import tqdm
+
+def get_char_probs(texts, predictions, tokenizer):
+    """
+    Map token-level predictions onto the original text using the
+    tokenizer's offset mapping.
+
+    predictions: (batch, sequence_length) array of token probabilities
+    texts: the corresponding raw input strings
+
+    For each text, an array the length of the text is created, and
+    result[start:end] = pred for every token whose offset mapping is
+    (start, end). Special tokens map to (0, 0), so they are no-ops.
+    """
+    results = [np.zeros(len(t)) for t in texts]
+    for i, (text, prediction) in enumerate(zip(texts, predictions)):
+        encoded = tokenizer(text,
+                            add_special_tokens=True,
+                            return_offsets_mapping=True)
+        for (start, end), pred in zip(encoded['offset_mapping'], prediction):
+            results[i][start:end] = pred
+    return results
+
+
+def get_results(char_probs, th=0.5):
+    """
+    Turn per-character probabilities into 'start end' span strings.
+
+    Example (length-12 input):
+        char_prob = [0.1, 0.1, 0.9, 0.9, 0.9, 0.9, 0.2, 0.2, 0.2, 0.7, 0.7, 0.7]
+        indices where prob >= 0.5: [2, 3, 4, 5, 9, 10, 11]
+        grouped into consecutive runs: [[2, 3, 4, 5], [9, 10, 11]]
+        output: min/max of each run, joined as "2 5;9 11"
+        (the code shifts indices by +1 before grouping, so the actual
+        output for this input is "3 6;10 12")
+    """
+    results = []
+    for char_prob in char_probs:
+        result = np.where(char_prob >= th)[0] + 1
+        # Group consecutive indices: n - next(c) is constant within a run
+        result = [list(g) for _, g in itertools.groupby(result, key=lambda n, c=itertools.count(): n - next(c))]
+        result = [f"{min(r)} {max(r)}" for r in result]
+        result = ";".join(result)
+        results.append(result)
+    return results
+
+
+def get_predictions(results):
+    """
+    Parse location strings such as '2 5;9 11' into [start, end] pairs,
+    the same location format used in the training dataframe.
+    """
+    predictions = []
+    for result in results:
+        prediction = []
+        if result != "":
+            for loc in [s.split() for s in result.split(';')]:
+                start, end = int(loc[0]), int(loc[1])
+                prediction.append([start, end])
+        predictions.append(prediction)
+    return predictions
+
+
+def inference_fn(test_loader, model, device):
+    preds = []
+    model.eval()
+    model.to(device)
+    tk0 = tqdm(test_loader, total=len(test_loader))
+    for inputs in tk0:
+        for k, v in inputs.items():
+            inputs[k] = v.to(device)
+        with torch.no_grad():
+            y_preds = model(inputs)
+        # Move to CPU before converting to numpy so this also works on GPU
+        preds.append(y_preds.sigmoid().cpu().numpy())
+    predictions = np.concatenate(preds)
+    return predictions
+
+
+def get_text(context, indexes):
+    if indexes:
+        if ';' in indexes:
+            list_indexes = indexes.split(';')
+            answer = ''
+            for idx in list_indexes:
+                start_index = int(idx.split(' ')[0])
+                end_index = int(idx.split(' ')[1])
+                answer += ' '
+                answer += context[start_index:end_index]
+            return answer
+        else:
+            start_index = int(indexes.split(' ')[0])
+            end_index = int(indexes.split(' ')[1])
+            return context[start_index:end_index]
+    else:
+        return 'Not found in this Context'
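A small worked example tying the helpers together; the probability values are made up for illustration:

    import numpy as np
    from utils import get_results, get_text

    char_prob = np.array([0.1, 0.1, 0.9, 0.9, 0.9, 0.9, 0.2, 0.2, 0.2, 0.7, 0.7, 0.7])
    spans = get_results([char_prob], th=0.5)
    print(spans)  # ['3 6;10 12'] -- the +1 shift moves each span one character right
    print(get_text("abcdefghijkl", spans[0]))  # ' def kl'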