cordwainersmith committed
Commit 98a427a
Parent: 5584918
Files changed (2)
  1. app.py +26 -19
  2. requirements.txt +2 -1
app.py CHANGED
@@ -1,4 +1,5 @@
 import streamlit as st
+import torch
 from transformers import AutoTokenizer, AutoModelForTokenClassification
 import time
 import json
@@ -34,10 +35,10 @@ EXAMPLE_SENTENCES = [
 ]
 
 MODEL_DETAILS = {
-    "name": "GolemPII - Hebrew PII Detection Model CordwainerSmith/GolemPII-v7-full",
-    "description": "This on-premise PII model is designed to automatically identify and mask sensitive information (PII) within Hebrew text data. It has been trained to recognize a wide range of PII entities, including names, addresses, phone numbers, financial information, and more.",
-    "base_model": "microsoft/mdeberta-v3-base",
-    "training_data": "Custom Hebrew PII dataset (size not specified)",
+    "name": "GolemPII-xlm-roberta-v1 - Hebrew PII Detection Model",
+    "description": "This model is specifically designed to identify and categorize Personally Identifiable Information (PII) within Hebrew text. It leverages the powerful XLM-RoBERTa base, fine-tuned with a curated Hebrew PII dataset, making it adept at token classification tasks tailored for Hebrew.",
+    "base_model": "xlm-roberta-base",
+    "training_data": "Custom Hebrew PII dataset",
     "detected_pii_entities": [
         "FIRST_NAME",
         "LAST_NAME",
@@ -52,13 +53,16 @@ MODEL_DETAILS = {
         "DATE",
         "POSTAL_CODE",
     ],
+    "performance_metrics": {
+        "Loss": 0.000729,
+        "Precision": 0.9982,
+        "Recall": 0.9982,
+        "F1-Score": 0.9982,
+        "Accuracy": 0.999795,
+    },
     "training_details": {
-        "Training epochs": "5",
-        "Batch size": "32",
-        "Learning rate": "5e-5",
-        "Weight decay": "0.01",
-        "Training speed": "~2.19 it/s",
-        "Total training time": "2:08:26",
+        "Training language": "Hebrew",
+        # Add other relevant training details if available
     },
 }
 
@@ -66,13 +70,16 @@ MODEL_DETAILS = {
 class PIIMaskingModel:
     def __init__(self, model_name: str):
         self.model_name = model_name
-        hf_token = st.secrets["hf_token"]  # Retrieve the token from secrets
+        hf_token = st.secrets["hf_token"]
         self.tokenizer = AutoTokenizer.from_pretrained(
             model_name, use_auth_token=hf_token
         )
         self.model = AutoModelForTokenClassification.from_pretrained(
             model_name, use_auth_token=hf_token
        )
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.model.to(self.device)
+        self.model.eval()
 
     def process_text(
         self, text: str
@@ -83,23 +90,23 @@ class PIIMaskingModel:
             text,
             truncation=True,
             padding=False,
-            return_tensors="np",  # Return NumPy arrays for CPU
+            return_tensors="pt",
             return_offsets_mapping=True,
             add_special_tokens=True,
         )
 
-        input_ids = tokenized_inputs.input_ids
-        attention_mask = tokenized_inputs.attention_mask
+        input_ids = tokenized_inputs.input_ids.to(self.device)
+        attention_mask = tokenized_inputs.attention_mask.to(self.device)
         offset_mapping = tokenized_inputs["offset_mapping"][0].tolist()
 
         # Handle special tokens
         offset_mapping[0] = None  # <s> token
         offset_mapping[-1] = None  # </s> token
 
-        # No need for torch.no_grad() as we are not using gradients
-        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
+        with torch.no_grad():
+            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
 
-        predictions = outputs.logits.argmax(dim=-1)  # No need to move to CPU
+        predictions = outputs.logits.argmax(dim=-1).cpu().numpy()
         predicted_labels = [
             self.model.config.id2label[label_id] for label_id in predictions[0]
         ]
@@ -139,7 +146,7 @@ class PIIMaskingModel:
             next_label = labels[j]
 
             # Stop if we hit a new B- tag (except for non-spaced tokens)
-            if next_label.startswith("B-") and tokens[j].startswith(" "):
+            if next_label.startswith("B-") and tokens[j].startswith("▁"):
                 break
 
             # Stop if we hit a different entity type in I- tags
@@ -151,7 +158,7 @@ class PIIMaskingModel:
                 last_valid_end = offset_mapping[j][1]
                 j += 1
             # Continue if it's a non-spaced B- token
-            elif next_label.startswith("B-") and not tokens[j].startswith(" "):
+            elif next_label.startswith("B-") and not tokens[j].startswith("▁"):
                 last_valid_end = offset_mapping[j][1]
                 j += 1
             else:
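
The boundary checks in the last two hunks rely on SentencePiece's word-marker convention: the XLM-RoBERTa tokenizer prefixes each token that begins a new whitespace-separated word with "▁" (U+2581, a distinct character, not an ASCII space or underscore), while continuation pieces carry no prefix. A B- tag on a token without the marker is therefore treated as glued to the previous entity rather than as the start of a new one. A minimal sketch of that behavior, assuming the stock xlm-roberta-base tokenizer (the sample strings are illustrative only, not from the app):

from transformers import AutoTokenizer

# The commit's checkpoint is fine-tuned from xlm-roberta-base, so the stock
# tokenizer should show the same word-boundary marking.
tok = AutoTokenizer.from_pretrained("xlm-roberta-base")

print(tok.tokenize("שלום עולם"))     # e.g. ['▁שלום', '▁עולם'] - each new word opens with '▁'
print(tok.tokenize("unbelievable"))  # e.g. ['▁un', 'believ', 'able'] - only the first piece has it
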
requirements.txt CHANGED
@@ -1,2 +1,3 @@
 streamlit
-transformers
+transformers
+torch
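
With torch added to the dependencies, inference now runs through PyTorch tensors on whatever device is available, instead of the earlier NumPy-on-CPU path. A minimal end-to-end sketch of the new flow outside Streamlit, assuming a public checkpoint (the hub id and sample sentence below are placeholders; the app itself receives its model name at construction time and pulls hf_token from st.secrets):

import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

MODEL_NAME = "CordwainerSmith/GolemPII-xlm-roberta-v1"  # placeholder hub id

# Pass a token to from_pretrained if the checkpoint is gated or private.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Tokenize with offsets so token-level predictions can be mapped back
# to character spans in the original text.
encoded = tokenizer(
    "טקסט לדוגמה",  # placeholder sentence ("example text")
    return_tensors="pt",
    return_offsets_mapping=True,
    add_special_tokens=True,
)
offsets = encoded.pop("offset_mapping")[0].tolist()

with torch.no_grad():  # inference only, so skip gradient bookkeeping
    logits = model(
        input_ids=encoded["input_ids"].to(device),
        attention_mask=encoded["attention_mask"].to(device),
    ).logits

# Highest-scoring label id per token, moved back to the CPU for decoding.
predictions = logits.argmax(dim=-1).cpu().numpy()
labels = [model.config.id2label[int(i)] for i in predictions[0]]
print(list(zip(offsets, labels)))

Keeping the argmax on the device and transferring only the final label ids back to the CPU keeps the copy small; the masking step can then pair each predicted label with its (start, end) character offset.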