first commit
- .dockerignore +20 -0
- .gitignore +160 -0
- README.md +5 -5
- app.py +70 -0
- configuration.py +13 -0
- dataset.py +12 -0
- model.py +46 -0
- models_file/config.pth +3 -0
- models_file/microsoft-deberta-base_0.9449373420387531_8_best.pth +3 -0
- models_file/tokenizer/merges.txt +0 -0
- models_file/tokenizer/special_tokens_map.json +51 -0
- models_file/tokenizer/tokenizer.json +0 -0
- models_file/tokenizer/tokenizer_config.json +66 -0
- models_file/tokenizer/vocab.json +0 -0
- requirements.txt +5 -0
- utils.py +104 -0
.dockerignore
ADDED
@@ -0,0 +1,20 @@
+__pycache__
+*.pyc
+*.pyo
+*.pyd
+.Python
+env
+pip-log.txt
+pip-delete-this-directory.txt
+.tox
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.log
+.git
+.mypy_cache
+.pytest_cache
+.hypothesis
.gitignore
ADDED
@@ -0,0 +1,160 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
README.md
CHANGED
@@ -1,10 +1,10 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: Entity Extraction
+emoji: 📚
+colorFrom: red
+colorTo: yellow
 sdk: gradio
-sdk_version: 3.
+sdk_version: 3.28.3
 app_file: app.py
 pinned: false
 ---
app.py
ADDED
@@ -0,0 +1,70 @@
+if __name__ == '__main__':
+    inputs = ['gbjjhbdjhbdgjhdbfjhsdkjrkjf', 'fdjhbjhsbd']
+    from transformers import AutoTokenizer
+    from model import CustomModel
+    import torch
+    from configuration import CFG
+    from dataset import SingleInputDataset
+    from torch.utils.data import DataLoader
+    from utils import inference_fn, get_char_probs, get_results, get_text
+    import numpy as np
+    import gradio as gr
+    import os
+
+    device = torch.device('cpu')
+    config_path = os.path.join('models_file', 'config.pth')
+    model_path = os.path.join('models_file', 'microsoft-deberta-base_0.9449373420387531_8_best.pth')
+    tokenizer = AutoTokenizer.from_pretrained('models_file/tokenizer')
+    model = CustomModel(CFG, config_path=config_path, pretrained=False)
+    state = torch.load(model_path,
+                       map_location=torch.device('cpu'))
+    model.load_state_dict(state['model'])
+
+    def get_answer(context, feature):
+
+        ## Input to the model using patient-history and feature-text
+        inputs_single = tokenizer(context, feature,
+                                  add_special_tokens=True,
+                                  max_length=CFG.max_len,
+                                  padding="max_length",
+                                  return_offsets_mapping=False)
+
+        for k, v in inputs_single.items():
+            inputs_single[k] = torch.tensor(v, dtype=torch.long)
+
+        # Create a new dataset containing only the input sample
+        single_input_dataset = SingleInputDataset(inputs_single)
+        # Create a DataLoader for the new dataset
+        single_input_loader = DataLoader(
+            single_input_dataset,
+            batch_size=1,
+            shuffle=False,
+            num_workers=2
+        )
+
+        # Perform inference on the single input
+        output = inference_fn(single_input_loader, model, device)
+
+        prediction = output.reshape((1, CFG.max_len))
+        char_probs = get_char_probs([context], prediction, tokenizer)
+        predictions = np.mean([char_probs], axis=0)
+        results = get_results(predictions, th=0.5)
+
+        print(results)
+        return get_text(context, results[0])
+
+    inputs = [gr.inputs.Textbox(label="Context Para", lines=10), gr.inputs.Textbox(label="Question", lines=1)]
+    output = gr.outputs.Textbox(label="Answer")
+    article = "<p style='text-align: center'><a href='https://www.xelpmoc.in/' target='_blank'>Made by Xelpmoc</a></p>"
+
+    app = gr.Interface(
+        fn=get_answer,
+        inputs=inputs,
+        outputs=output,
+        allow_flagging='never',
+        title="Entity Extraction from Text",
+        article=article,
+        enable_queue=True,
+        cache_examples=False)
+
+    app.launch()
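
Note on the Gradio API used above: `gr.inputs.Textbox` and `gr.outputs.Textbox` were deprecated during the Gradio 3.x series and removed in 4.x, and `enable_queue` later moved off `Interface` in favor of `queue()`. A minimal sketch of the same interface against the non-deprecated component API (an assumption about how one might port it, not part of this commit):

    # Hypothetical port of the Interface above to the current component classes.
    import gradio as gr

    app = gr.Interface(
        fn=get_answer,  # the function defined in app.py above
        inputs=[
            gr.Textbox(label="Context Para", lines=10),
            gr.Textbox(label="Question", lines=1),
        ],
        outputs=gr.Textbox(label="Answer"),
        allow_flagging="never",
        title="Entity Extraction from Text",
    )
    app.queue().launch()  # queue() replaces enable_queue=True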
configuration.py
ADDED
@@ -0,0 +1,13 @@
+# ====================================================
+# CFG
+# ====================================================
+class CFG:
+    print_freq=100
+    num_workers=0
+    model="microsoft/deberta-base"
+    token="microsoft/deberta-base"
+    fc_dropout=0.2
+    max_len=739
+    weight_decay=0.01
+    project_folder = '/content/drive/MyDrive/Projects/Exigent/POC-V1/'
+    matching_data = 'matching_data.csv'
dataset.py
ADDED
@@ -0,0 +1,12 @@
+from torch.utils.data import Dataset
+
+# Create a custom dataset class that takes a single input sample
+class SingleInputDataset(Dataset):
+    def __init__(self, input_single):
+        self.sample = input_single
+
+    def __len__(self):
+        return 1
+
+    def __getitem__(self, index):
+        return self.sample
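
For context, `__getitem__` returns the whole dict of tensors, and a `DataLoader` with `batch_size=1` collates it by prepending a batch dimension to every tensor. A small sketch with made-up values (the length 739 matches `CFG.max_len`, but the contents here are illustrative only):

    # Illustrative only: feed one tokenized sample through the dataset/loader pair.
    import torch
    from torch.utils.data import DataLoader
    from dataset import SingleInputDataset

    sample = {'input_ids': torch.ones(739, dtype=torch.long),
              'attention_mask': torch.ones(739, dtype=torch.long)}
    loader = DataLoader(SingleInputDataset(sample), batch_size=1, shuffle=False)

    batch = next(iter(loader))
    print(batch['input_ids'].shape)  # torch.Size([1, 739]) -- collation adds the batch dim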
model.py
ADDED
@@ -0,0 +1,46 @@
+import torch
+import torch.nn as nn
+from transformers import AutoConfig, AutoModel
+
+# ====================================================
+# Model
+# ====================================================
+class CustomModel(nn.Module):
+    def __init__(self, cfg, config_path=None, pretrained=False):
+        super().__init__()
+        self.cfg = cfg
+
+        if config_path is None:
+            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
+        else:
+            self.config = torch.load(config_path)
+        if pretrained:
+            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
+        else:
+            self.model = AutoModel.from_config(self.config)
+        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
+        self.fc = nn.Linear(self.config.hidden_size, 1)
+        self._init_weights(self.fc)
+
+    def _init_weights(self, module):
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+    def feature(self, inputs):
+        outputs = self.model(**inputs)
+        last_hidden_states = outputs[0]
+        return last_hidden_states
+
+    def forward(self, inputs):
+        feature = self.feature(inputs)
+        output = self.fc(self.fc_dropout(feature))
+        return output
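
The head is token-level: `forward` returns one logit per token, shaped [batch, seq_len, 1], which `inference_fn` in utils.py later squashes through a sigmoid into per-token span probabilities. A hypothetical smoke-test sketch (assumes network access so `AutoConfig.from_pretrained` can fetch the microsoft/deberta-base config; the random weights make the logits meaningless):

    # Hypothetical smoke test of the output shape, not part of the commit.
    import torch
    from configuration import CFG
    from model import CustomModel

    model = CustomModel(CFG, config_path=None, pretrained=False)
    inputs = {'input_ids': torch.randint(0, 100, (1, 16)),
              'attention_mask': torch.ones(1, 16, dtype=torch.long)}
    with torch.no_grad():
        logits = model(inputs)
    print(logits.shape)  # torch.Size([1, 16, 1]) -- one span logit per token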
models_file/config.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:44242dd46e256e33385a5be4979c8df941af4ae4d8ad5f2feb5315d114da5f98
+size 2541
models_file/microsoft-deberta-base_0.9449373420387531_8_best.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:994ef334eed041e7b0d62f2ad3f97444adcac4696a8027a5b14bf803bb27265f
+size 555618276
models_file/tokenizer/merges.txt
ADDED
The diff for this file is too large to render.
See raw diff
models_file/tokenizer/special_tokens_map.json
ADDED
@@ -0,0 +1,51 @@
+{
+  "bos_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "[MASK]",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}
models_file/tokenizer/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
models_file/tokenizer/tokenizer_config.json
ADDED
@@ -0,0 +1,66 @@
+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "bos_token": {
+    "__type": "AddedToken",
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "clean_up_tokenization_spaces": true,
+  "cls_token": {
+    "__type": "AddedToken",
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "do_lower_case": false,
+  "eos_token": {
+    "__type": "AddedToken",
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "errors": "replace",
+  "mask_token": {
+    "__type": "AddedToken",
+    "content": "[MASK]",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "model_max_length": 512,
+  "pad_token": {
+    "__type": "AddedToken",
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "__type": "AddedToken",
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "tokenizer_class": "DebertaTokenizer",
+  "unk_token": {
+    "__type": "AddedToken",
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "vocab_type": "gpt2"
+}
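
These tokenizer files (vocab.json, merges.txt, tokenizer.json, tokenizer_config.json, special_tokens_map.json) are what `AutoTokenizer.from_pretrained('models_file/tokenizer')` in app.py resolves locally, so the Space never downloads the tokenizer. A quick check sketch (the sample strings are placeholders, not from the repo):

    # Illustrative check that the local tokenizer directory loads and pairs texts.
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained('models_file/tokenizer')
    enc = tok("patient history text", "feature text", add_special_tokens=True)
    print(tok.convert_ids_to_tokens(enc['input_ids'])[:1])  # ['[CLS]'] per special_tokens_map.json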
models_file/tokenizer/vocab.json
ADDED
The diff for this file is too large to render.
See raw diff
requirements.txt
ADDED
@@ -0,0 +1,5 @@
+torch
+transformers
+numpy
+scikit-learn
+gradio
utils.py
ADDED
@@ -0,0 +1,104 @@
+import itertools
+import torch
+import numpy as np
+from tqdm.auto import tqdm
+
+def get_char_probs(texts, predictions, tokenizer):
+    """
+    Maps predictions from the encoded offset mapping back onto the text.
+
+    Prediction = 466 sequence length * batch
+    text = 768 * batch
+    Using offset mapping [(0, 4), ] -- 466
+
+    Creates results that are the size of the texts.
+
+    For each text result[i]:
+    result[0:4] = pred[0], and likewise for all offsets.
+    """
+    results = [np.zeros(len(t)) for t in texts]
+    for i, (text, prediction) in enumerate(zip(texts, predictions)):
+        encoded = tokenizer(text,
+                            add_special_tokens=True,
+                            return_offsets_mapping=True)
+        for idx, (offset_mapping, pred) in enumerate(zip(encoded['offset_mapping'], prediction)):
+            start = offset_mapping[0]
+            end = offset_mapping[1]
+            results[i][start:end] = pred
+    return results
+
+
+def get_results(char_probs, th=0.5):
+    """
+    Takes the list of probabilities, one per character of the text,
+    and gets the indices of the characters whose probability exceeds th.
+    example:
+        char_prob = [0.1, 0.1, 0.9, 0.9, 0.9, 0.9, 0.2, 0.2, 0.2, 0.7, 0.7, 0.7]  ## length == 766
+        where > 0.5 index  ## [2, 3, 4, 5, 9, 10, 11]
+
+    Groups consecutive indices -- [[2, 3, 4, 5], [9, 10, 11]] --
+    then outputs the min and max of each group.
+    """
+    results = []
+    for char_prob in char_probs:
+        result = np.where(char_prob >= th)[0] + 1
+        result = [list(g) for _, g in itertools.groupby(result, key=lambda n, c=itertools.count(): n - next(c))]
+        result = [f"{min(r)} {max(r)}" for r in result]
+        result = ";".join(result)
+        results.append(result)
+    return results
+
+
+def get_predictions(results):
+    """
+    Gets the locations, as strings, just like the location column in the df:
+        results = ['2 5', '9 11']
+
+    Loops through, splits each into start and end, and stores them in an array.
+    """
+    predictions = []
+    for result in results:
+        prediction = []
+        if result != "":
+            for loc in [s.split() for s in result.split(';')]:
+                start, end = int(loc[0]), int(loc[1])
+                prediction.append([start, end])
+        predictions.append(prediction)
+    return predictions
+
+def inference_fn(test_loader, model, device):
+    preds = []
+    model.eval()
+    model.to(device)
+    tk0 = tqdm(test_loader, total=len(test_loader))
+    for inputs in tk0:
+        for k, v in inputs.items():
+            inputs[k] = v.to(device)
+        with torch.no_grad():
+            y_preds = model(inputs)
+        preds.append(y_preds.sigmoid().numpy())
+    predictions = np.concatenate(preds)
+    return predictions
+
+def get_text(context, indexes):
+    if (indexes):
+        if ';' in indexes:
+            list_indexes = indexes.split(';')
+
+            answer = ''
+            for idx in list_indexes:
+                start_index = int(idx.split(' ')[0])
+                end_index = int(idx.split(' ')[1])
+                answer += ' '
+                answer += context[start_index:end_index]
+            return answer
+        else:
+            start_index = int(indexes.split(' ')[0])
+            end_index = int(indexes.split(' ')[1])
+
+            return context[start_index:end_index]
+    else:
+        return 'Not found in this Context'
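
The least obvious step in `get_results` is the `itertools.groupby` key: each index is keyed by `n - next(c)` with a fresh counter, which stays constant exactly within a run of consecutive integers. A standalone sketch of that step on toy probabilities:

    # Illustrative run-detection, mirroring get_results on made-up numbers.
    import itertools
    import numpy as np

    char_prob = np.array([0.1, 0.1, 0.9, 0.9, 0.9, 0.9, 0.2, 0.2, 0.2, 0.7, 0.7, 0.7])
    idx = np.where(char_prob >= 0.5)[0] + 1  # [3 4 5 6 10 11 12]
    runs = [list(g) for _, g in itertools.groupby(idx, key=lambda n, c=itertools.count(): n - next(c))]
    print(runs)  # [[3, 4, 5, 6], [10, 11, 12]]
    print(";".join(f"{min(r)} {max(r)}" for r in runs))  # 3 6;10 12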