Spaces:
Runtime error
Runtime error
Commit
·
08842a1
1
Parent(s):
97c21a9
chore: Update Hugging Face model and add text anonymization endpoint
Browse files
- app.py +27 -1
- requirements.txt +2 -0
app.py
CHANGED
@@ -16,6 +16,7 @@ from transformers import pipeline
|
|
16 |
from fastapi.middleware.cors import CORSMiddleware
|
17 |
from presidio_analyzer import AnalyzerEngine
|
18 |
from presidio_anonymizer import AnonymizerEngine
|
|
|
19 |
|
20 |
from pydantic import BaseModel
|
21 |
from fastapi import FastAPI, Request, UploadFile, File
|
@@ -27,7 +28,9 @@ app = FastAPI(root_path=os.environ.get("ROOT_PATH"))
|
|
27 |
# genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))
|
28 |
# model = genai.GenerativeModel('gemini-pro')
|
29 |
HUGGINGFACE_KEY = os.environ.get("HUGGINGFACE_KEY")
|
30 |
-
#
|
|
|
|
|
31 |
|
32 |
app.add_middleware(
|
33 |
CORSMiddleware,
|
@@ -207,3 +210,26 @@ async def presidio_mask(text: TextItem):
|
|
207 |
seen_set.add((rec.start, rec.end))
|
208 |
return return_list
|
209 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
from fastapi.middleware.cors import CORSMiddleware
|
17 |
from presidio_analyzer import AnalyzerEngine
|
18 |
from presidio_anonymizer import AnonymizerEngine
|
19 |
+
from huggingface_hub import login
|
20 |
|
21 |
from pydantic import BaseModel
|
22 |
from fastapi import FastAPI, Request, UploadFile, File
|
|
|
28 |
# genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))
|
29 |
# model = genai.GenerativeModel('gemini-pro')
|
30 |
HUGGINGFACE_KEY = os.environ.get("HUGGINGFACE_KEY")
|
31 |
+
# login(HUGGINGFACE_KEY)
|
32 |
+
|
33 |
+
# Fill-mask pipeline built once as a module-level statement; the model is
# given as the relative path "./ksp-mask-model". The /anonymize_text endpoint
# calls it to propose a replacement token for each masked span.
pipe = pipeline("fill-mask", model="./ksp-mask-model")
|
34 |
|
35 |
app.add_middleware(
|
36 |
CORSMiddleware,
|
|
|
210 |
seen_set.add((rec.start, rec.end))
|
211 |
return return_list
|
212 |
|
213 |
+
|
214 |
+
@app.post("/anonymize_text")
async def anonymize_text(text: TextItem):
    """Replace every span reported by ``presidio_mask`` with a model-suggested token.

    Each span is substituted, one at a time, by ``<mask>`` in the *original*
    text and passed to the module-level fill-mask ``pipe``; the top-ranked
    candidate's ``token_str`` is then spliced into the output text.

    Args:
        text: Request body; its ``text`` attribute holds the string to anonymize.

    Returns:
        A dict with two keys:
        ``"anonymized_text"`` - the text with every span replaced, and
        ``"mask_list"`` - one entry per span holding the replacement's
        ``start``/``end`` offsets in the anonymized text, the ``entity_type``,
        the full ``options`` list returned by the pipeline, and the
        ``original_text`` reported by ``presidio_mask``.
    """
    source_text = text.text
    # Process spans left to right so one running offset is enough to map
    # original coordinates onto the progressively rewritten output.
    masks = sorted(await presidio_mask(text), key=lambda m: m["start"])

    # NOTE(review): overlapping spans would corrupt the splice below; this
    # assumes presidio_mask never returns overlapping ranges - confirm.
    final_text = source_text
    new_mask_list = []
    off_set = 0  # cumulative length difference between output and original
    for mask in masks:
        start, end = mask["start"], mask["end"]

        # The model always sees the original text with only this span masked,
        # so earlier replacements never influence later predictions.
        masked_input = source_text[:start] + "<mask>" + source_text[end:]
        options = pipe(masked_input)
        replacement = options[0]["token_str"]

        new_start = start + off_set
        final_text = (
            final_text[:new_start] + replacement + final_text[end + off_set:]
        )
        new_mask_list.append({
            "start": new_start,
            "end": new_start + len(replacement),
            "entity_type": mask["entity_type"],
            "options": options,
            "original_text": mask["text"],
        })

        # The splice removed exactly (end - start) characters, so derive the
        # shift from the span itself rather than from len(mask["text"]),
        # which is only correct if that field mirrors the span verbatim.
        off_set += len(replacement) - (end - start)

    return {"anonymized_text": final_text, "mask_list": new_mask_list}
|
requirements.txt
CHANGED
@@ -5,6 +5,8 @@ presidio-anonymizer==2.*
|
|
5 |
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889
|
6 |
requests
|
7 |
uvicorn
|
|
|
|
|
8 |
transformers
|
9 |
python-dotenv
|
10 |
python-multipart
|
|
|
5 |
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889
|
6 |
requests
|
7 |
uvicorn
|
8 |
+
tensorflow
|
9 |
+
tf-keras
|
10 |
transformers
|
11 |
python-dotenv
|
12 |
python-multipart
|