cordwainersmith committed on
Commit
71b342f
1 Parent(s): 277ab09

Add application file

Browse files
Files changed (1) hide show
  1. app.py +11 -16
app.py CHANGED
@@ -1,5 +1,4 @@
1
  import streamlit as st
2
- import torch
3
  from transformers import AutoTokenizer, AutoModelForTokenClassification
4
  import time
5
  import json
@@ -67,13 +66,9 @@ MODEL_DETAILS = {
67
  class PIIMaskingModel:
68
  def __init__(self, model_name: str):
69
  self.model_name = model_name
70
- self.tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)
71
- self.model = AutoModelForTokenClassification.from_pretrained(
72
- model_name, token=HF_TOKEN
73
- )
74
- self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
75
- self.model.to(self.device)
76
- self.model.eval()
77
 
78
  def process_text(
79
  self, text: str
@@ -84,23 +79,23 @@ class PIIMaskingModel:
84
  text,
85
  truncation=True,
86
  padding=False,
87
- return_tensors="pt",
88
  return_offsets_mapping=True,
89
  add_special_tokens=True,
90
  )
91
 
92
- input_ids = tokenized_inputs.input_ids.to(self.device)
93
- attention_mask = tokenized_inputs.attention_mask.to(self.device)
94
  offset_mapping = tokenized_inputs["offset_mapping"][0].tolist()
95
 
96
  # Handle special tokens
97
  offset_mapping[0] = None # <s> token
98
  offset_mapping[-1] = None # </s> token
99
 
100
- with torch.no_grad():
101
- outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
102
 
103
- predictions = outputs.logits.argmax(dim=-1).cpu().numpy()
104
  predicted_labels = [
105
  self.model.config.id2label[label_id] for label_id in predictions[0]
106
  ]
@@ -140,7 +135,7 @@ class PIIMaskingModel:
140
  next_label = labels[j]
141
 
142
  # Stop if we hit a new B- tag (except for non-spaced tokens)
143
- if next_label.startswith("B-") and tokens[j].startswith(""):
144
  break
145
 
146
  # Stop if we hit a different entity type in I- tags
@@ -152,7 +147,7 @@ class PIIMaskingModel:
152
  last_valid_end = offset_mapping[j][1]
153
  j += 1
154
  # Continue if it's a non-spaced B- token
155
- elif next_label.startswith("B-") and not tokens[j].startswith(""):
156
  last_valid_end = offset_mapping[j][1]
157
  j += 1
158
  else:
 
1
  import streamlit as st
 
2
  from transformers import AutoTokenizer, AutoModelForTokenClassification
3
  import time
4
  import json
 
66
  class PIIMaskingModel:
67
  def __init__(self, model_name: str):
68
  self.model_name = model_name
69
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
70
+ self.model = AutoModelForTokenClassification.from_pretrained(model_name)
71
+ # No need to specify device as we are forcing CPU usage
 
 
 
 
72
 
73
  def process_text(
74
  self, text: str
 
79
  text,
80
  truncation=True,
81
  padding=False,
82
+ return_tensors="np", # Return NumPy arrays for CPU
83
  return_offsets_mapping=True,
84
  add_special_tokens=True,
85
  )
86
 
87
+ input_ids = tokenized_inputs.input_ids
88
+ attention_mask = tokenized_inputs.attention_mask
89
  offset_mapping = tokenized_inputs["offset_mapping"][0].tolist()
90
 
91
  # Handle special tokens
92
  offset_mapping[0] = None # <s> token
93
  offset_mapping[-1] = None # </s> token
94
 
95
+ # No need for torch.no_grad() as we are not using gradients
96
+ outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
97
 
98
+ predictions = outputs.logits.argmax(dim=-1) # No need to move to CPU
99
  predicted_labels = [
100
  self.model.config.id2label[label_id] for label_id in predictions[0]
101
  ]
 
135
  next_label = labels[j]
136
 
137
  # Stop if we hit a new B- tag (except for non-spaced tokens)
138
+ if next_label.startswith("B-") and tokens[j].startswith(" "):
139
  break
140
 
141
  # Stop if we hit a different entity type in I- tags
 
147
  last_valid_end = offset_mapping[j][1]
148
  j += 1
149
  # Continue if it's a non-spaced B- token
150
+ elif next_label.startswith("B-") and not tokens[j].startswith(" "):
151
  last_valid_end = offset_mapping[j][1]
152
  j += 1
153
  else: