pranavraj1103 committed on
Commit
08842a1
·
1 Parent(s): 97c21a9

chore: Update Hugging Face model and add text anonymization endpoint

Browse files
Files changed (2) hide show
  1. app.py +27 -1
  2. requirements.txt +2 -0
app.py CHANGED
@@ -16,6 +16,7 @@ from transformers import pipeline
16
  from fastapi.middleware.cors import CORSMiddleware
17
  from presidio_analyzer import AnalyzerEngine
18
  from presidio_anonymizer import AnonymizerEngine
 
19
 
20
  from pydantic import BaseModel
21
  from fastapi import FastAPI, Request, UploadFile, File
@@ -27,7 +28,9 @@ app = FastAPI(root_path=os.environ.get("ROOT_PATH"))
27
  # genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))
28
  # model = genai.GenerativeModel('gemini-pro')
29
  HUGGINGFACE_KEY = os.environ.get("HUGGINGFACE_KEY")
30
- # pipe = pipeline("fill-mask", model="pranavraj1103/ksp-mask-model")
 
 
31
 
32
  app.add_middleware(
33
  CORSMiddleware,
@@ -207,3 +210,26 @@ async def presidio_mask(text: TextItem):
207
  seen_set.add((rec.start, rec.end))
208
  return return_list
209
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  from fastapi.middleware.cors import CORSMiddleware
17
  from presidio_analyzer import AnalyzerEngine
18
  from presidio_anonymizer import AnonymizerEngine
19
+ from huggingface_hub import login
20
 
21
  from pydantic import BaseModel
22
  from fastapi import FastAPI, Request, UploadFile, File
 
28
  # genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))
29
  # model = genai.GenerativeModel('gemini-pro')
30
  HUGGINGFACE_KEY = os.environ.get("HUGGINGFACE_KEY")
31
+ # login(HUGGINGFACE_KEY)
32
+
33
+ pipe = pipeline("fill-mask", model="./ksp-mask-model")
34
 
35
  app.add_middleware(
36
  CORSMiddleware,
 
210
  seen_set.add((rec.start, rec.end))
211
  return return_list
212
 
213
+
214
@app.post("/anonymize_text")
async def anonymize_text(text: TextItem):
    """Swap every span reported by ``presidio_mask`` for a model-suggested token.

    Each span is masked on its own in the *original* text (so the positions
    reported by ``presidio_mask`` stay valid), the module-level ``pipe`` is
    asked to fill the ``<mask>`` placeholder, and the first suggestion is
    written into the output text.

    Args:
        text: Request body; its ``text`` attribute holds the string to process.

    Returns:
        A dict with two keys:
        ``anonymized_text`` - the text after all replacements;
        ``mask_list`` - one entry per replaced span with ``start``/``end``
        (positions in the anonymized text), ``entity_type``, ``options``
        (everything ``pipe`` returned for that span) and ``original_text``.
    """
    found = await presidio_mask(text)
    source = text.text

    result = source
    shift = 0  # running length difference between ``result`` and ``source``
    replaced = []

    # Walk spans left to right so ``shift`` always covers every earlier edit.
    for span in sorted(found, key=lambda item: item["start"]):
        begin, finish = span["start"], span["end"]

        # Mask only this span; all other text is left as in the original.
        candidates = pipe(source[:begin] + "<mask>" + source[finish:])
        best = candidates[0]["token_str"]

        new_begin = begin + shift
        result = result[:new_begin] + best + result[finish + shift:]
        replaced.append({
            "start": new_begin,
            "end": new_begin + len(best),
            "entity_type": span["entity_type"],
            "options": candidates,
            "original_text": span["text"],
        })

        # Later spans move by however much this replacement grew or shrank.
        shift += len(best) - len(span["text"])

    return {"anonymized_text": result, "mask_list": replaced}
requirements.txt CHANGED
@@ -5,6 +5,8 @@ presidio-anonymizer==2.*
5
  en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889
6
  requests
7
  uvicorn
 
 
8
  transformers
9
  python-dotenv
10
  python-multipart
 
5
  en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889
6
  requests
7
  uvicorn
8
+ tensorflow
9
+ tf-keras
10
  transformers
11
  python-dotenv
12
  python-multipart