Spaces:
Sleeping
Sleeping
Mengmeng Liu
committed on
Commit
·
a5b8812
1
Parent(s):
bbea195
initial commit
Browse files- app.py +48 -0
- load_model.py +13 -0
- models/deepset/tinyroberta-squad/config.json +30 -0
- models/deepset/tinyroberta-squad/merges.txt +0 -0
- models/deepset/tinyroberta-squad/pytorch_model.bin +3 -0
- models/deepset/tinyroberta-squad/special_tokens_map.json +15 -0
- models/deepset/tinyroberta-squad/tokenizer.json +0 -0
- models/deepset/tinyroberta-squad/tokenizer_config.json +15 -0
- models/deepset/tinyroberta-squad/training_args.bin +3 -0
- models/deepset/tinyroberta-squad/vocab.json +0 -0
- test.py +30 -0
- test_text.txt +4 -0
app.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
from io import StringIO
|
5 |
+
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
|
6 |
+
from transformers import Trainer
|
7 |
+
import torch
|
8 |
+
|
9 |
+
# Local directory holding the tokenizer and QA model files loaded by this app.
MODEL_DIR = "./models/deepset/tinyroberta-squad"


@st.cache_resource
def _load_qa_model(model_dir):
    """Load the tokenizer and question-answering model from *model_dir* once.

    Streamlit re-executes this script on every interaction, so without caching
    the weights would be re-read from disk for each question asked.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForQuestionAnswering.from_pretrained(model_dir)
    return tokenizer, model


def _answer_question(question, context, tokenizer, model):
    """Extract the most likely answer span for *question* from *context*.

    Args:
        question: The question text typed by the user.
        context: The document text to search for an answer.
        tokenizer: Tokenizer matching *model*.
        model: A question-answering model exposing start/end logits.

    Returns:
        ``(answer, truncated)``. ``answer`` is an empty string when no usable
        span was predicted. ``truncated`` is True when the encoded input
        reached the tokenizer's length limit, i.e. part of the document may
        not have been searched.
    """
    max_tokens = tokenizer.model_max_length
    # Only the document (second sequence) is shortened, never the question;
    # without this, a long upload overflows the model's maximum input length.
    inputs = tokenizer(
        question,
        context,
        return_tensors="pt",
        truncation="only_second",
        max_length=max_tokens,
    )
    with torch.no_grad():
        outputs = model(**inputs)

    start_index = int(outputs.start_logits.argmax())
    end_index = int(outputs.end_logits.argmax())
    truncated = inputs.input_ids.shape[1] >= max_tokens

    # The two argmaxes are taken independently, so the end may precede the
    # start; such a span is empty and is reported as "no answer".
    if end_index < start_index:
        return "", truncated

    answer_tokens = inputs.input_ids[0, start_index : end_index + 1]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True).strip()
    return answer, truncated


st.title('HRA Document QA')

file_name = st.file_uploader("Upload the document that you want to ask questions")

# `context` stays None until a readable document has been uploaded, so the
# code below never touches an undefined name.
context = None
if file_name is not None:
    try:
        context = file_name.getvalue().decode("utf-8")
    except UnicodeDecodeError:
        st.error("The uploaded file could not be read as UTF-8 text.")

question = st.chat_input("Ask some questions about this document")

with st.chat_message("assistant"):
    st.write("Hello 👋 I am an HRA chatbot~")
    if context is not None:
        st.write("Here's the document that you uploaded:")
        st.write(context)

if question:
    with st.chat_message("user"):
        st.write("You asked a question:")
        st.write(question)

    with st.chat_message("assistant"):
        if context is None or not context.strip():
            st.warning("Please upload a text document before asking a question.")
        else:
            tokenizer, model = _load_qa_model(MODEL_DIR)
            answer, truncated = _answer_question(question, context, tokenizer, model)

            if truncated:
                st.warning(
                    "The document is long; only the part that fits in the "
                    "model's input window may have been searched."
                )

            st.write("Answer:")
            if answer:
                st.write(answer)
            else:
                st.write("I could not find an answer to that question in the document.")
|
45 |
+
|
46 |
+
|
47 |
+
|
48 |
+
|
load_model.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Load model directly
|
2 |
+
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
|
3 |
+
from transformers import Trainer
|
4 |
+
import torch
|
5 |
+
|
6 |
+
# One-off helper script: download the pretrained QA checkpoint from the
# Hugging Face Hub and store it locally so the app can load it from disk.
SOURCE_CHECKPOINT = "deepset/tinyroberta-squad2"
# NOTE: the local directory name omits the trailing "2" of the checkpoint
# name; it is kept as-is because the other scripts load from this exact path.
LOCAL_MODEL_DIR = "./models/deepset/tinyroberta-squad"

tokenizer = AutoTokenizer.from_pretrained(SOURCE_CHECKPOINT)
model = AutoModelForQuestionAnswering.from_pretrained(SOURCE_CHECKPOINT)

# Save the model and tokenizer directly. Building a Trainer only to call
# save_model() pulls in default TrainingArguments and writes an unrelated
# training_args.bin next to the weights.
model.save_pretrained(LOCAL_MODEL_DIR)
tokenizer.save_pretrained(LOCAL_MODEL_DIR)

print(f"Successfully saved model to {LOCAL_MODEL_DIR}")
|
models/deepset/tinyroberta-squad/config.json
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "deepset/tinyroberta-squad2",
|
3 |
+
"architectures": [
|
4 |
+
"RobertaForQuestionAnswering"
|
5 |
+
],
|
6 |
+
"attention_probs_dropout_prob": 0.1,
|
7 |
+
"bos_token_id": 0,
|
8 |
+
"classifier_dropout": null,
|
9 |
+
"eos_token_id": 2,
|
10 |
+
"gradient_checkpointing": false,
|
11 |
+
"hidden_act": "gelu",
|
12 |
+
"hidden_dropout_prob": 0.1,
|
13 |
+
"hidden_size": 768,
|
14 |
+
"initializer_range": 0.02,
|
15 |
+
"intermediate_size": 3072,
|
16 |
+
"language": "english",
|
17 |
+
"layer_norm_eps": 1e-05,
|
18 |
+
"max_position_embeddings": 514,
|
19 |
+
"model_type": "roberta",
|
20 |
+
"name": "Roberta",
|
21 |
+
"num_attention_heads": 12,
|
22 |
+
"num_hidden_layers": 6,
|
23 |
+
"pad_token_id": 1,
|
24 |
+
"position_embedding_type": "absolute",
|
25 |
+
"torch_dtype": "float32",
|
26 |
+
"transformers_version": "4.30.2",
|
27 |
+
"type_vocab_size": 1,
|
28 |
+
"use_cache": true,
|
29 |
+
"vocab_size": 50265
|
30 |
+
}
|
models/deepset/tinyroberta-squad/merges.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
models/deepset/tinyroberta-squad/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3216bcdc78b3c899a482179b996f48da35fee3a654aa55422597315e84f180f3
|
3 |
+
size 326155437
|
models/deepset/tinyroberta-squad/special_tokens_map.json
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bos_token": "<s>",
|
3 |
+
"cls_token": "<s>",
|
4 |
+
"eos_token": "</s>",
|
5 |
+
"mask_token": {
|
6 |
+
"content": "<mask>",
|
7 |
+
"lstrip": true,
|
8 |
+
"normalized": false,
|
9 |
+
"rstrip": false,
|
10 |
+
"single_word": false
|
11 |
+
},
|
12 |
+
"pad_token": "<pad>",
|
13 |
+
"sep_token": "</s>",
|
14 |
+
"unk_token": "<unk>"
|
15 |
+
}
|
models/deepset/tinyroberta-squad/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
models/deepset/tinyroberta-squad/tokenizer_config.json
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"add_prefix_space": false,
|
3 |
+
"bos_token": "<s>",
|
4 |
+
"clean_up_tokenization_spaces": true,
|
5 |
+
"cls_token": "<s>",
|
6 |
+
"eos_token": "</s>",
|
7 |
+
"errors": "replace",
|
8 |
+
"mask_token": "<mask>",
|
9 |
+
"model_max_length": 512,
|
10 |
+
"pad_token": "<pad>",
|
11 |
+
"sep_token": "</s>",
|
12 |
+
"tokenizer_class": "RobertaTokenizer",
|
13 |
+
"trim_offsets": true,
|
14 |
+
"unk_token": "<unk>"
|
15 |
+
}
|
models/deepset/tinyroberta-squad/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:569ee3bdcc22004d6ba63b65a3d195d9f3033a90b386bae47e1edbf619acf483
|
3 |
+
size 3899
|
models/deepset/tinyroberta-squad/vocab.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
test.py
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Load model directly
|
2 |
+
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
|
3 |
+
from transformers import Trainer
|
4 |
+
import torch
|
5 |
+
|
6 |
+
# Smoke check: load the locally saved QA model, ask one fixed question about
# a fixed biography, and print the predicted answer span.
model_dir = "./models/deepset/tinyroberta-squad"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForQuestionAnswering.from_pretrained(model_dir)

question = "Where did robert graduate?"
# Adjacent literals are concatenated with no separator, so paragraphs run
# together exactly as in the single-line original.
text = (
    "Robert A. Kauffman is president of Healthcare Risk Advisors (HRA), leading the expansion of the company’s self-insurance and risk transfer solutions for large medical practices, hospitals, and health systems. Rob previously served as senior vice president, secretary, and general counsel of FOJP Service Corporation (“FOJP”) and Hospitals Insurance Company (“HIC”)."
    "Rob has built a distinguished career in insurance and risk management. Prior to his roles at FOJP and HIC, he was senior vice president, secretary, general counsel, and chief compliance officer at Harleysville Insurance. He was also a partner at Reed Smith, an international law firm specializing in complex litigation, strategic transactions, and regulatory matters."
    "In addition to his private sector experience, Rob served with distinction as an Assistant U.S. Attorney in the Criminal Division of the United States Attorney’s Office for the Eastern District of Pennsylvania."
    "Mr. Kauffman earned his Bachelor of Arts and Juris Doctor degrees from the University of Pennsylvania."
)

inputs = tokenizer(question, text, return_tensors="pt")

# Inference only: no gradients are needed.
with torch.no_grad():
    outputs = model(**inputs)

# Highest-scoring start and end token positions, chosen independently.
answer_start_index = torch.argmax(outputs.start_logits)
answer_end_index = torch.argmax(outputs.end_logits)

# Slice the predicted span (end inclusive) out of the single encoded sequence.
span_stop = answer_end_index + 1
predict_answer_tokens = inputs.input_ids[0, answer_start_index:span_stop]
decoded_answer = tokenizer.decode(predict_answer_tokens, skip_special_tokens=True)
print(decoded_answer)
|
test_text.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Robert A. Kauffman is president of Healthcare Risk Advisors (HRA), leading the expansion of the company’s self-insurance and risk transfer solutions for large medical practices, hospitals, and health systems. Rob previously served as senior vice president, secretary, and general counsel of FOJP Service Corporation (“FOJP”) and Hospitals Insurance Company (“HIC”).
|
2 |
+
Rob has built a distinguished career in insurance and risk management. Prior to his roles at FOJP and HIC, he was senior vice president, secretary, general counsel, and chief compliance officer at Harleysville Insurance. He was also a partner at Reed Smith, an international law firm specializing in complex litigation, strategic transactions, and regulatory matters.
|
3 |
+
In addition to his private sector experience, Rob served with distinction as an Assistant U.S. Attorney in the Criminal Division of the United States Attorney’s Office for the Eastern District of Pennsylvania.
|
4 |
+
Mr. Kauffman earned his Bachelor of Arts and Juris Doctor degrees from the University of Pennsylvania.
|