Commit 66e5e13
Parent(s): bdec808
Add application file

Files changed:
- app.py +57 -0
- configuration.py +13 -0
- dataset.py +12 -0
- model.py +46 -0
- models_file/config.pth +3 -0
- models_file/microsoft-deberta-base_0.9449373420387531_8_best.pth +3 -0
- models_file/tokenizer/merges.txt +0 -0
- models_file/tokenizer/special_tokens_map.json +51 -0
- models_file/tokenizer/tokenizer.json +0 -0
- models_file/tokenizer/tokenizer_config.json +66 -0
- models_file/tokenizer/vocab.json +0 -0
- requirements.txt +5 -0
- utils.py +104 -0
app.py
ADDED
@@ -0,0 +1,57 @@
+if __name__ == '__main__':
+    inputs = ['gbjjhbdjhbdgjhdbfjhsdkjrkjf', 'fdjhbjhsbd']
+    from transformers import AutoTokenizer
+    from model import CustomModel
+    import torch
+    from configuration import CFG
+    from dataset import SingleInputDataset
+    from torch.utils.data import DataLoader
+    from utils import inference_fn, get_char_probs, get_results, get_text
+    import numpy as np
+    import gradio as gr
+
+    device = torch.device('cpu')
+    tokenizer = AutoTokenizer.from_pretrained('models_file/tokenizer')
+    # Forward-slash paths: the original backslash paths only resolve on Windows
+    model = CustomModel(CFG, config_path='models_file/config.pth', pretrained=False)
+    state = torch.load('models_file/microsoft-deberta-base_0.9449373420387531_8_best.pth',
+                       map_location=torch.device('cpu'))
+    model.load_state_dict(state['model'])
+
+    def get_answer(context, feature):
+        # Build the model input from the patient history (context) and the feature text
+        inputs_single = tokenizer(context, feature,
+                                  add_special_tokens=True,
+                                  max_length=CFG.max_len,
+                                  padding="max_length",
+                                  return_offsets_mapping=False)
+        for k, v in inputs_single.items():
+            inputs_single[k] = torch.tensor(v, dtype=torch.long)
+
+        # Create a new dataset containing only the input sample,
+        # and a DataLoader for it
+        single_input_dataset = SingleInputDataset(inputs_single)
+        single_input_loader = DataLoader(single_input_dataset,
+                                         batch_size=1,
+                                         shuffle=False,
+                                         num_workers=2)
+
+        # Perform inference on the single input
+        output = inference_fn(single_input_loader, model, device)
+
+        prediction = output.reshape((1, CFG.max_len))
+        char_probs = get_char_probs([context], prediction, tokenizer)
+        predictions = np.mean([char_probs], axis=0)
+        results = get_results(predictions, th=0.5)
+
+        print(results)
+        return get_text(context, results[0])
+
+    # Smoke test with the dummy strings above; this must run before `inputs`
+    # is rebound to Gradio components and before the blocking launch() call
+    # (the original script called it last, with component objects as arguments)
+    print(get_answer(inputs[0], inputs[1]))
+
+    # gr.inputs/gr.outputs were removed in later Gradio releases;
+    # use the top-level components instead
+    inputs = [gr.Textbox(label="Context Para", lines=10), gr.Textbox(label="Question", lines=1)]
+    output = gr.Textbox(label="Answer")
+
+    app = gr.Interface(fn=get_answer, inputs=inputs, outputs=output, allow_flagging='never')
+
+    app.launch()
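Note: the flow inside get_answer is tokenize, wrap in SingleInputDataset/DataLoader, run inference_fn, map token probabilities to characters, threshold into spans, and extract text. A minimal sketch of a call, with purely hypothetical input strings (the commit's own smoke test uses throwaway gibberish):

    # Hypothetical inputs for illustration; runs inside the same __main__ block
    context = "17-year-old male presents with palpitations for the past week."
    feature = "palpitations"
    print(get_answer(context, feature))  # prints the span(s) of `context` the model flags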
configuration.py
ADDED
@@ -0,0 +1,13 @@
+# ====================================================
+# CFG
+# ====================================================
+class CFG:
+    print_freq=100
+    num_workers=0
+    model="microsoft/deberta-base"
+    token="microsoft/deberta-base"
+    fc_dropout=0.2
+    max_len=739
+    weight_decay=0.01
+    project_folder = '/content/drive/MyDrive/Projects/Exigent/POC-V1/'
+    matching_data = 'matching_data.csv'
dataset.py
ADDED
@@ -0,0 +1,12 @@
+from torch.utils.data import Dataset
+
+# A custom dataset class that wraps a single input sample
+class SingleInputDataset(Dataset):
+    def __init__(self, input_single):
+        self.sample = input_single
+
+    def __len__(self):
+        return 1
+
+    def __getitem__(self, index):
+        return self.sample
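For reference, PyTorch's default collate function turns the wrapped dict of 1-D tensors into batched 2-D tensors. A minimal sketch, assuming the tensor keys produced by the tokenizer call in app.py (the sample values here are hypothetical stand-ins):

    import torch
    from torch.utils.data import DataLoader
    from dataset import SingleInputDataset

    # Hypothetical stand-in for one tokenized sample
    sample = {'input_ids': torch.zeros(739, dtype=torch.long),
              'attention_mask': torch.ones(739, dtype=torch.long)}
    batch = next(iter(DataLoader(SingleInputDataset(sample), batch_size=1)))
    print(batch['input_ids'].shape)  # torch.Size([1, 739]) -- collate adds the batch dimension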
model.py
ADDED
@@ -0,0 +1,46 @@
+import torch
+import torch.nn as nn
+from transformers import AutoConfig, AutoModel
+
+# ====================================================
+# Model
+# ====================================================
+class CustomModel(nn.Module):
+    def __init__(self, cfg, config_path=None, pretrained=False):
+        super().__init__()
+        self.cfg = cfg
+
+        if config_path is None:
+            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
+        else:
+            self.config = torch.load(config_path)
+        if pretrained:
+            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
+        else:
+            self.model = AutoModel.from_config(self.config)
+        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
+        self.fc = nn.Linear(self.config.hidden_size, 1)
+        self._init_weights(self.fc)
+
+    def _init_weights(self, module):
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+    def feature(self, inputs):
+        outputs = self.model(**inputs)
+        last_hidden_states = outputs[0]
+        return last_hidden_states
+
+    def forward(self, inputs):
+        feature = self.feature(inputs)
+        output = self.fc(self.fc_dropout(feature))
+        return output
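The linear head maps each token's hidden state to a single logit, so the model scores every token independently rather than predicting a single start/end pair. A sketch of the expected output shape, assuming the committed models_file/config.pth (weights are random here because pretrained=False and no state dict is loaded):

    import torch
    from configuration import CFG
    from model import CustomModel

    model = CustomModel(CFG, config_path='models_file/config.pth', pretrained=False)
    dummy = {'input_ids': torch.zeros(1, CFG.max_len, dtype=torch.long),
             'attention_mask': torch.ones(1, CFG.max_len, dtype=torch.long)}
    with torch.no_grad():
        out = model(dummy)
    print(out.shape)  # torch.Size([1, 739, 1]) -- one logit per token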
models_file/config.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:44242dd46e256e33385a5be4979c8df941af4ae4d8ad5f2feb5315d114da5f98
+size 2541
models_file/microsoft-deberta-base_0.9449373420387531_8_best.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:994ef334eed041e7b0d62f2ad3f97444adcac4696a8027a5b14bf803bb27265f
+size 555618276
models_file/tokenizer/merges.txt
ADDED
The diff for this file is too large to render. See raw diff.
models_file/tokenizer/special_tokens_map.json
ADDED
@@ -0,0 +1,51 @@
+{
+  "bos_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "[MASK]",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}
models_file/tokenizer/tokenizer.json
ADDED
The diff for this file is too large to render. See raw diff.
models_file/tokenizer/tokenizer_config.json
ADDED
@@ -0,0 +1,66 @@
+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "bos_token": {
+    "__type": "AddedToken",
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "clean_up_tokenization_spaces": true,
+  "cls_token": {
+    "__type": "AddedToken",
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "do_lower_case": false,
+  "eos_token": {
+    "__type": "AddedToken",
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "errors": "replace",
+  "mask_token": {
+    "__type": "AddedToken",
+    "content": "[MASK]",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "model_max_length": 512,
+  "pad_token": {
+    "__type": "AddedToken",
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "__type": "AddedToken",
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "tokenizer_class": "DebertaTokenizer",
+  "unk_token": {
+    "__type": "AddedToken",
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "vocab_type": "gpt2"
+}
models_file/tokenizer/vocab.json
ADDED
The diff for this file is too large to render. See raw diff.
requirements.txt
ADDED
@@ -0,0 +1,5 @@
+torch
+transformers
+numpy
+scikit-learn
+gradio
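Because none of these dependencies are pinned, a rebuild of the Space can pull in releases with breaking API changes (the legacy gr.inputs/gr.outputs namespaces, for instance, were removed in later Gradio versions). A pinned variant would make builds reproducible; the version numbers below are hypothetical placeholders, not versions this commit was actually tested against:

    torch==1.13.1
    transformers==4.26.1
    numpy==1.23.5
    scikit-learn==1.2.1
    gradio==3.18.0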
utils.py
ADDED
@@ -0,0 +1,104 @@
+import itertools
+import torch
+import numpy as np
+from tqdm.auto import tqdm
+
+def get_char_probs(texts, predictions, tokenizer):
+    """
+    Map token-level predictions onto the original text using the
+    tokenizer's offset mapping.
+
+    predictions: (batch, sequence_length) array of token probabilities
+    texts: the corresponding raw input strings
+
+    For each text, an array the length of the text is created, and
+    result[start:end] = pred for every token whose offset mapping is
+    (start, end). Special tokens map to (0, 0), so they are no-ops.
+    """
+    results = [np.zeros(len(t)) for t in texts]
+    for i, (text, prediction) in enumerate(zip(texts, predictions)):
+        encoded = tokenizer(text,
+                            add_special_tokens=True,
+                            return_offsets_mapping=True)
+        for (start, end), pred in zip(encoded['offset_mapping'], prediction):
+            results[i][start:end] = pred
+    return results
+
+
+def get_results(char_probs, th=0.5):
+    """
+    Turn per-character probabilities into 'start end' span strings.
+
+    Example (length-12 input):
+        char_prob = [0.1, 0.1, 0.9, 0.9, 0.9, 0.9, 0.2, 0.2, 0.2, 0.7, 0.7, 0.7]
+        indices where prob >= 0.5: [2, 3, 4, 5, 9, 10, 11]
+        grouped into consecutive runs: [[2, 3, 4, 5], [9, 10, 11]]
+        output: min/max of each run, joined as "2 5;9 11"
+        (the code shifts indices by +1 before grouping, so the actual
+        output for this input is "3 6;10 12")
+    """
+    results = []
+    for char_prob in char_probs:
+        result = np.where(char_prob >= th)[0] + 1
+        # Group consecutive indices: n - next(c) is constant within a run
+        result = [list(g) for _, g in itertools.groupby(result, key=lambda n, c=itertools.count(): n - next(c))]
+        result = [f"{min(r)} {max(r)}" for r in result]
+        result = ";".join(result)
+        results.append(result)
+    return results
+
+
+def get_predictions(results):
+    """
+    Parse location strings such as '2 5;9 11' into [start, end] pairs,
+    the same location format used in the training dataframe.
+    """
+    predictions = []
+    for result in results:
+        prediction = []
+        if result != "":
+            for loc in [s.split() for s in result.split(';')]:
+                start, end = int(loc[0]), int(loc[1])
+                prediction.append([start, end])
+        predictions.append(prediction)
+    return predictions
+
+
+def inference_fn(test_loader, model, device):
+    preds = []
+    model.eval()
+    model.to(device)
+    tk0 = tqdm(test_loader, total=len(test_loader))
+    for inputs in tk0:
+        for k, v in inputs.items():
+            inputs[k] = v.to(device)
+        with torch.no_grad():
+            y_preds = model(inputs)
+        # Move to CPU before converting to numpy so this also works on GPU
+        preds.append(y_preds.sigmoid().cpu().numpy())
+    predictions = np.concatenate(preds)
+    return predictions
+
+
+def get_text(context, indexes):
+    if indexes:
+        if ';' in indexes:
+            list_indexes = indexes.split(';')
+            answer = ''
+            for idx in list_indexes:
+                start_index = int(idx.split(' ')[0])
+                end_index = int(idx.split(' ')[1])
+                answer += ' '
+                answer += context[start_index:end_index]
+            return answer
+        else:
+            start_index = int(indexes.split(' ')[0])
+            end_index = int(indexes.split(' ')[1])
+            return context[start_index:end_index]
+    else:
+        return 'Not found in this Context'
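A small worked example tying the helpers together; the probability values are made up for illustration:

    import numpy as np
    from utils import get_results, get_text

    char_prob = np.array([0.1, 0.1, 0.9, 0.9, 0.9, 0.9, 0.2, 0.2, 0.2, 0.7, 0.7, 0.7])
    spans = get_results([char_prob], th=0.5)
    print(spans)  # ['3 6;10 12'] -- the +1 shift moves each span one character right
    print(get_text("abcdefghijkl", spans[0]))  # ' def kl'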