liamcripwell
committed on
Commit
•
f9b15f3
1
Parent(s):
590025b
Update README.md
Browse files
README.md
CHANGED
@@ -5,12 +5,12 @@ language:
|
|
5 |
---
|
6 |
# Structure Extraction Model by NuMind 🔥
|
7 |
|
8 |
-
NuExtract is a
|
9 |
-
To use the model, provide an input text (less than 2000 tokens) and a JSON
|
10 |
|
11 |
-
Note: This model is purely extractive, so
|
12 |
|
13 |
-
|
14 |
|
15 |
We also provide a tiny (0.5B) and large (7B) version of this model: [NuExtract-tiny](https://huggingface.co/numind/NuExtract-tiny) and [NuExtract-large](https://huggingface.co/numind/NuExtract-large)
|
16 |
|
@@ -44,7 +44,7 @@ import json
|
|
44 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
45 |
|
46 |
|
47 |
-
def predict_NuExtract(model,tokenizer,text, schema,example
|
48 |
schema = json.dumps(json.loads(schema), indent=4)
|
49 |
input_llm = "<|input|>\n### Template:\n" + schema + "\n"
|
50 |
for i in example:
|
@@ -52,13 +52,14 @@ def predict_NuExtract(model,tokenizer,text, schema,example = ["","",""]):
|
|
52 |
input_llm += "### Example:\n"+ json.dumps(json.loads(i), indent=4)+"\n"
|
53 |
|
54 |
input_llm += "### Text:\n"+text +"\n<|output|>\n"
|
55 |
-
input_ids = tokenizer(input_llm, return_tensors="pt",truncation = True, max_length
|
56 |
|
57 |
output = tokenizer.decode(model.generate(**input_ids)[0], skip_special_tokens=True)
|
58 |
return output.split("<|output|>")[1].split("<|end-output|>")[0]
|
59 |
|
60 |
|
61 |
-
|
|
|
62 |
tokenizer = AutoTokenizer.from_pretrained("numind/NuExtract", trust_remote_code=True)
|
63 |
|
64 |
model.to("cuda")
|
@@ -90,7 +91,7 @@ schema = """{
|
|
90 |
}
|
91 |
}"""
|
92 |
|
93 |
-
prediction = predict_NuExtract(model,tokenizer,text, schema,example
|
94 |
print(prediction)
|
95 |
|
96 |
```
|
|
|
5 |
---
|
6 |
# Structure Extraction Model by NuMind 🔥
|
7 |
|
8 |
+
NuExtract is a version of [phi-3-mini](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct), fine-tuned on a private high-quality synthetic dataset for information extraction.
|
9 |
+
To use the model, provide an input text (less than 2000 tokens) and a JSON template describing the information you need to extract.
|
10 |
|
11 |
+
Note: This model is purely extractive, so all text output by the model is present as-is in the original text. You can also provide an example of output formatting to help the model understand your task more precisely.
|
12 |
|
13 |
+
Try it here: https://huggingface.co/spaces/numind/NuExtract
|
14 |
|
15 |
We also provide a tiny (0.5B) and large (7B) version of this model: [NuExtract-tiny](https://huggingface.co/numind/NuExtract-tiny) and [NuExtract-large](https://huggingface.co/numind/NuExtract-large)
|
16 |
|
|
|
44 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
45 |
|
46 |
|
47 |
+
def predict_NuExtract(model, tokenizer, text, schema, example=["", "", ""]):
|
48 |
schema = json.dumps(json.loads(schema), indent=4)
|
49 |
input_llm = "<|input|>\n### Template:\n" + schema + "\n"
|
50 |
for i in example:
|
|
|
52 |
input_llm += "### Example:\n"+ json.dumps(json.loads(i), indent=4)+"\n"
|
53 |
|
54 |
input_llm += "### Text:\n"+text +"\n<|output|>\n"
|
55 |
+
input_ids = tokenizer(input_llm, return_tensors="pt",truncation = True, max_length=4000).to("cuda")
|
56 |
|
57 |
output = tokenizer.decode(model.generate(**input_ids)[0], skip_special_tokens=True)
|
58 |
return output.split("<|output|>")[1].split("<|end-output|>")[0]
|
59 |
|
60 |
|
61 |
+
# We recommend using bf16 as it results in negligible performance loss
|
62 |
+
model = AutoModelForCausalLM.from_pretrained("numind/NuExtract", torch_dtype=torch.bfloat16, trust_remote_code=True)
|
63 |
tokenizer = AutoTokenizer.from_pretrained("numind/NuExtract", trust_remote_code=True)
|
64 |
|
65 |
model.to("cuda")
|
|
|
91 |
}
|
92 |
}"""
|
93 |
|
94 |
+
prediction = predict_NuExtract(model, tokenizer, text, schema, example=["","",""])
|
95 |
print(prediction)
|
96 |
|
97 |
```
|