Spaces:
Running
Running
aliasgerovs
committed on
Commit
·
dd9b08a
1
Parent(s):
ca39c04
Added latest updates related to highlighter fix
Browse files
- app.py +3 -3
- highlighter.py +19 -11
- predictors.py +47 -116
app.py
CHANGED
@@ -6,7 +6,7 @@ from predictors import update,update_main, correct_text, split_text
|
|
6 |
from analysis import depth_analysis
|
7 |
from predictors import predict_quillbot
|
8 |
from plagiarism import plagiarism_check, build_date, html_highlight
|
9 |
-
from highlighter import
|
10 |
from utils import extract_text_from_pdf, len_validator
|
11 |
import yaml
|
12 |
from functools import partial
|
@@ -20,9 +20,9 @@ with open("config.yaml", "r") as file:
|
|
20 |
model_list = params["MC_OUTPUT_LABELS"]
|
21 |
|
22 |
|
23 |
-
analyze_and_highlight_bc = partial(
|
24 |
analyze_and_highlight_quillbot = partial(
|
25 |
-
|
26 |
)
|
27 |
|
28 |
|
|
|
6 |
from analysis import depth_analysis
|
7 |
from predictors import predict_quillbot
|
8 |
from plagiarism import plagiarism_check, build_date, html_highlight
|
9 |
+
from highlighter import segmented_higlighter
|
10 |
from utils import extract_text_from_pdf, len_validator
|
11 |
import yaml
|
12 |
from functools import partial
|
|
|
20 |
model_list = params["MC_OUTPUT_LABELS"]
|
21 |
|
22 |
|
23 |
+
analyze_and_highlight_bc = partial(segmented_higlighter, model_type="bc")
|
24 |
analyze_and_highlight_quillbot = partial(
|
25 |
+
segmented_higlighter, model_type="quillbot"
|
26 |
)
|
27 |
|
28 |
|
highlighter.py
CHANGED
@@ -2,7 +2,7 @@ from lime.lime_text import LimeTextExplainer
|
|
2 |
from nltk.tokenize import sent_tokenize
|
3 |
from predictors import predict_for_explainanility
|
4 |
from predictors import update, correct_text, split_text
|
5 |
-
|
6 |
|
7 |
def explainer(text, model_type):
|
8 |
def predictor_wrapper(text):
|
@@ -15,7 +15,7 @@ def explainer(text, model_type):
|
|
15 |
sentences = [sent for sent in sent_tokenize(text)]
|
16 |
num_sentences = len(sentences)
|
17 |
exp = explainer_.explain_instance(
|
18 |
-
text, predictor_wrapper, num_features=num_sentences, num_samples=
|
19 |
)
|
20 |
weights_mapping = exp.as_map()[1]
|
21 |
sentences_weights = {sentence: 0 for sentence in sentences}
|
@@ -23,15 +23,12 @@ def explainer(text, model_type):
|
|
23 |
if 0 <= idx < len(sentences):
|
24 |
sentences_weights[sentences[idx]] = weight
|
25 |
print(sentences_weights, model_type)
|
26 |
-
return sentences_weights, exp
|
27 |
|
28 |
|
29 |
def analyze_and_highlight(text, bias_buster_selected, model_type):
|
30 |
-
if bias_buster_selected:
|
31 |
-
text = update(text)
|
32 |
-
|
33 |
highlighted_text = ""
|
34 |
-
sentences_weights, _ = explainer(text, model_type)
|
35 |
positive_weights = [weight for weight in sentences_weights.values() if weight >= 0]
|
36 |
negative_weights = [weight for weight in sentences_weights.values() if weight < 0]
|
37 |
|
@@ -44,7 +41,8 @@ def analyze_and_highlight(text, bias_buster_selected, model_type):
|
|
44 |
max_positive_weight += smoothing_factor
|
45 |
min_negative_weight -= smoothing_factor
|
46 |
|
47 |
-
for sentence
|
|
|
48 |
sentence = sentence.strip()
|
49 |
if not sentence:
|
50 |
continue
|
@@ -67,6 +65,17 @@ def analyze_and_highlight(text, bias_buster_selected, model_type):
|
|
67 |
)
|
68 |
highlighted_text += highlighted_sentence
|
69 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
if model_type == "bc":
|
71 |
gradient_labels = ["HUMAN", "AI"]
|
72 |
elif model_type == "quillbot":
|
@@ -76,7 +85,7 @@ def analyze_and_highlight(text, bias_buster_selected, model_type):
|
|
76 |
|
77 |
highlighted_text = (
|
78 |
"<div>"
|
79 |
-
+
|
80 |
+ "<div style='margin-top: 20px; text-align: center;'>"
|
81 |
+ "<div style='position: relative; display: inline-block; width: 60%; height: 20px; background: linear-gradient(to right, #00FF00, #FFFFFF, #FF0000); font-family: \"Segoe UI\", Tahoma, Geneva, Verdana, sans-serif; font-size: 10px; font-weight: 600; color: #222; border-radius: 10px; box-shadow: 0px 2px 5px rgba(0, 0, 0, 0.1);'>"
|
82 |
+ f"<span style='position: absolute; left: 5px; top: 50%; transform: translateY(-50%); color: #000; font-weight: 600;'>{gradient_labels[0]}</span>"
|
@@ -85,5 +94,4 @@ def analyze_and_highlight(text, bias_buster_selected, model_type):
|
|
85 |
+ "</div>"
|
86 |
+ "</div>"
|
87 |
)
|
88 |
-
|
89 |
-
return highlighted_text
|
|
|
2 |
from nltk.tokenize import sent_tokenize
|
3 |
from predictors import predict_for_explainanility
|
4 |
from predictors import update, correct_text, split_text
|
5 |
+
from predictors import split_text_allow_complete_sentences_nltk, get_token_length
|
6 |
|
7 |
def explainer(text, model_type):
|
8 |
def predictor_wrapper(text):
|
|
|
15 |
sentences = [sent for sent in sent_tokenize(text)]
|
16 |
num_sentences = len(sentences)
|
17 |
exp = explainer_.explain_instance(
|
18 |
+
text, predictor_wrapper, num_features=num_sentences, num_samples=100
|
19 |
)
|
20 |
weights_mapping = exp.as_map()[1]
|
21 |
sentences_weights = {sentence: 0 for sentence in sentences}
|
|
|
23 |
if 0 <= idx < len(sentences):
|
24 |
sentences_weights[sentences[idx]] = weight
|
25 |
print(sentences_weights, model_type)
|
26 |
+
return sentences_weights, sentences, exp
|
27 |
|
28 |
|
29 |
def analyze_and_highlight(text, bias_buster_selected, model_type):
|
|
|
|
|
|
|
30 |
highlighted_text = ""
|
31 |
+
sentences_weights, sentences, _ = explainer(text, model_type)
|
32 |
positive_weights = [weight for weight in sentences_weights.values() if weight >= 0]
|
33 |
negative_weights = [weight for weight in sentences_weights.values() if weight < 0]
|
34 |
|
|
|
41 |
max_positive_weight += smoothing_factor
|
42 |
min_negative_weight -= smoothing_factor
|
43 |
|
44 |
+
for sentence in sentences:
|
45 |
+
weight = sentences_weights[sentence]
|
46 |
sentence = sentence.strip()
|
47 |
if not sentence:
|
48 |
continue
|
|
|
65 |
)
|
66 |
highlighted_text += highlighted_sentence
|
67 |
|
68 |
+
return highlighted_text
|
69 |
+
|
70 |
+
def segmented_higlighter(text, bias_buster_selected, model_type):
|
71 |
+
if bias_buster_selected:
|
72 |
+
text = update(text)
|
73 |
+
result = ""
|
74 |
+
segmented_results = split_text_allow_complete_sentences_nltk(text)
|
75 |
+
for segment in segmented_results:
|
76 |
+
chunk = analyze_and_highlight(segment, model_type)
|
77 |
+
result = result + " " + chunk
|
78 |
+
print(result)
|
79 |
if model_type == "bc":
|
80 |
gradient_labels = ["HUMAN", "AI"]
|
81 |
elif model_type == "quillbot":
|
|
|
85 |
|
86 |
highlighted_text = (
|
87 |
"<div>"
|
88 |
+
+ result
|
89 |
+ "<div style='margin-top: 20px; text-align: center;'>"
|
90 |
+ "<div style='position: relative; display: inline-block; width: 60%; height: 20px; background: linear-gradient(to right, #00FF00, #FFFFFF, #FF0000); font-family: \"Segoe UI\", Tahoma, Geneva, Verdana, sans-serif; font-size: 10px; font-weight: 600; color: #222; border-radius: 10px; box-shadow: 0px 2px 5px rgba(0, 0, 0, 0.1);'>"
|
91 |
+ f"<span style='position: absolute; left: 5px; top: 50%; transform: translateY(-50%); color: #000; font-weight: 600;'>{gradient_labels[0]}</span>"
|
|
|
94 |
+ "</div>"
|
95 |
+ "</div>"
|
96 |
)
|
97 |
+
return highlighted_text
|
|
predictors.py
CHANGED
@@ -24,7 +24,6 @@ with open("config.yaml", "r") as file:
|
|
24 |
nltk.download("punkt")
|
25 |
nltk.download("stopwords")
|
26 |
device_needed = "cuda" if torch.cuda.is_available() else "cpu"
|
27 |
-
device = 'cpu'
|
28 |
text_bc_model_path = params["TEXT_BC_MODEL_PATH"]
|
29 |
text_mc_model_path = params["TEXT_MC_MODEL_PATH"]
|
30 |
text_quillbot_model_path = params["TEXT_QUILLBOT_MODEL_PATH"]
|
@@ -50,12 +49,12 @@ quillbot_model = AutoModelForSequenceClassification.from_pretrained(
|
|
50 |
|
51 |
|
52 |
# proxy models for explainability
|
53 |
-
mini_bc_model_name = "polygraf-ai/bc-model
|
54 |
bc_tokenizer_mini = AutoTokenizer.from_pretrained(mini_bc_model_name)
|
55 |
bc_model_mini = AutoModelForSequenceClassification.from_pretrained(
|
56 |
mini_bc_model_name
|
57 |
).to(device_needed)
|
58 |
-
mini_humanizer_model_name =
|
59 |
humanizer_tokenizer_mini = AutoTokenizer.from_pretrained(
|
60 |
mini_humanizer_model_name
|
61 |
)
|
@@ -119,83 +118,58 @@ def update_main(text: str):
|
|
119 |
corrections_display = "\n\n".join([f"Original: {orig}\nCorrected: {corr}" for orig, corr in corrections])
|
120 |
return corrected_text, corrections_display
|
121 |
|
122 |
-
def
|
123 |
-
text
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
|
|
132 |
current_length = 0
|
133 |
if type_det == "bc":
|
134 |
tokenizer = text_bc_tokenizer
|
135 |
-
|
136 |
elif type_det == "mc":
|
137 |
tokenizer = text_mc_tokenizer
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
142 |
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
current_segment = [sentence]
|
156 |
-
current_length = sentence_length
|
157 |
-
|
158 |
-
if current_segment:
|
159 |
-
encoded_segment = tokenizer.encode(
|
160 |
-
" ".join(current_segment),
|
161 |
-
add_special_tokens=True,
|
162 |
-
max_length=max_length + tolerance,
|
163 |
-
truncation=True,
|
164 |
-
)
|
165 |
-
segments.append((current_segment, len(encoded_segment)))
|
166 |
-
|
167 |
-
final_segments = []
|
168 |
-
for i, (seg, length) in enumerate(segments):
|
169 |
-
if i == len(segments) - 1:
|
170 |
-
if length < min_last_segment_length and len(final_segments) > 0:
|
171 |
-
prev_seg, prev_length = final_segments[-1]
|
172 |
-
combined_encoded = tokenizer.encode(
|
173 |
-
" ".join(prev_seg + seg),
|
174 |
-
add_special_tokens=True,
|
175 |
-
max_length=max_length + tolerance,
|
176 |
-
truncation=True,
|
177 |
-
)
|
178 |
-
if len(combined_encoded) <= max_length + tolerance:
|
179 |
-
final_segments[-1] = (prev_seg + seg, len(combined_encoded))
|
180 |
-
else:
|
181 |
-
final_segments.append((seg, length))
|
182 |
else:
|
183 |
-
|
|
|
184 |
else:
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
encoded_segments = []
|
189 |
-
for seg, _ in final_segments:
|
190 |
-
encoded_segment = tokenizer.encode(
|
191 |
-
" ".join(seg),
|
192 |
-
add_special_tokens=True,
|
193 |
-
max_length=max_length + tolerance,
|
194 |
-
truncation=True,
|
195 |
-
)
|
196 |
-
decoded_segment = tokenizer.decode(encoded_segment)
|
197 |
-
decoded_segments.append(decoded_segment)
|
198 |
-
return decoded_segments
|
199 |
|
200 |
|
201 |
def predict_quillbot(text, bias_buster_selected):
|
@@ -227,7 +201,7 @@ def predict_for_explainanility(text, model_type=None):
|
|
227 |
tokenizer = humanizer_tokenizer_mini
|
228 |
elif model_type == "bc":
|
229 |
cleaning = True
|
230 |
-
max_length =
|
231 |
model = bc_model_mini
|
232 |
tokenizer = bc_tokenizer_mini
|
233 |
else:
|
@@ -278,46 +252,6 @@ def predict_mc(model, tokenizer, text):
|
|
278 |
return output_norm
|
279 |
|
280 |
|
281 |
-
def predict_mc_scores(input):
|
282 |
-
bc_scores = []
|
283 |
-
mc_scores = []
|
284 |
-
|
285 |
-
samples_len_bc = len(
|
286 |
-
split_text_allow_complete_sentences_nltk(input, type_det="bc")
|
287 |
-
)
|
288 |
-
segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
|
289 |
-
for i in range(samples_len_bc):
|
290 |
-
cleaned_text_bc = remove_special_characters(segments_bc[i])
|
291 |
-
bc_score = predict_bc(text_bc_model, text_bc_tokenizer, cleaned_text_bc)
|
292 |
-
bc_scores.append(bc_score)
|
293 |
-
bc_scores_array = np.array(bc_scores)
|
294 |
-
average_bc_scores = np.mean(bc_scores_array, axis=0)
|
295 |
-
bc_score_list = average_bc_scores.tolist()
|
296 |
-
bc_score = {"AI": bc_score_list[1], "HUMAN": bc_score_list[0]}
|
297 |
-
segments_mc = split_text_allow_complete_sentences_nltk(input, type_det="mc")
|
298 |
-
samples_len_mc = len(
|
299 |
-
split_text_allow_complete_sentences_nltk(input, type_det="mc")
|
300 |
-
)
|
301 |
-
for i in range(samples_len_mc):
|
302 |
-
cleaned_text_mc = remove_special_characters(segments_mc[i])
|
303 |
-
mc_score = predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text_mc)
|
304 |
-
mc_scores.append(mc_score)
|
305 |
-
mc_scores_array = np.array(mc_scores)
|
306 |
-
average_mc_scores = np.mean(mc_scores_array, axis=0)
|
307 |
-
mc_score_list = average_mc_scores.tolist()
|
308 |
-
mc_score = {}
|
309 |
-
for score, label in zip(mc_score_list, mc_label_map):
|
310 |
-
mc_score[label.upper()] = score
|
311 |
-
|
312 |
-
sum_prob = 1 - bc_score["HUMAN"]
|
313 |
-
for key, value in mc_score.items():
|
314 |
-
mc_score[key] = value * sum_prob
|
315 |
-
if sum_prob < 0.01:
|
316 |
-
mc_score = {}
|
317 |
-
|
318 |
-
return mc_score
|
319 |
-
|
320 |
-
|
321 |
def predict_bc_scores(input):
|
322 |
bc_scores = []
|
323 |
samples_len_bc = len(
|
@@ -385,9 +319,6 @@ def predict_mc_scores(input):
|
|
385 |
for score, label in zip(mc_score_list, mc_label_map):
|
386 |
mc_score[label.upper()] = score
|
387 |
|
388 |
-
total = sum(mc_score.values())
|
389 |
-
# Normalize each value by dividing it by the total
|
390 |
-
mc_score = {key: value / total for key, value in mc_score.items()}
|
391 |
sum_prob = 1 - bc_score["HUMAN"]
|
392 |
for key, value in mc_score.items():
|
393 |
mc_score[key] = value * sum_prob
|
|
|
24 |
nltk.download("punkt")
|
25 |
nltk.download("stopwords")
|
26 |
device_needed = "cuda" if torch.cuda.is_available() else "cpu"
|
|
|
27 |
text_bc_model_path = params["TEXT_BC_MODEL_PATH"]
|
28 |
text_mc_model_path = params["TEXT_MC_MODEL_PATH"]
|
29 |
text_quillbot_model_path = params["TEXT_QUILLBOT_MODEL_PATH"]
|
|
|
49 |
|
50 |
|
51 |
# proxy models for explainability
|
52 |
+
mini_bc_model_name = "polygraf-ai/bc-model"
|
53 |
bc_tokenizer_mini = AutoTokenizer.from_pretrained(mini_bc_model_name)
|
54 |
bc_model_mini = AutoModelForSequenceClassification.from_pretrained(
|
55 |
mini_bc_model_name
|
56 |
).to(device_needed)
|
57 |
+
mini_humanizer_model_name = "polygraf-ai/humanizer-model"
|
58 |
humanizer_tokenizer_mini = AutoTokenizer.from_pretrained(
|
59 |
mini_humanizer_model_name
|
60 |
)
|
|
|
118 |
corrections_display = "\n\n".join([f"Original: {orig}\nCorrected: {corr}" for orig, corr in corrections])
|
119 |
return corrected_text, corrections_display
|
120 |
|
121 |
+
def split_text(text: str) -> list:
|
122 |
+
sentences = sent_tokenize(text)
|
123 |
+
return [[sentence] for sentence in sentences]
|
124 |
+
|
125 |
+
def get_token_length(tokenizer, sentence):
|
126 |
+
return len(tokenizer.tokenize(sentence))
|
127 |
+
|
128 |
+
def split_text_allow_complete_sentences_nltk(text, type_det="bc"):
|
129 |
+
sentences = sent_tokenize(text)
|
130 |
+
chunks = []
|
131 |
+
current_chunk = []
|
132 |
current_length = 0
|
133 |
if type_det == "bc":
|
134 |
tokenizer = text_bc_tokenizer
|
135 |
+
max_tokens = bc_token_size
|
136 |
elif type_det == "mc":
|
137 |
tokenizer = text_mc_tokenizer
|
138 |
+
max_tokens = mc_token_size
|
139 |
+
|
140 |
+
elif type_det == "quillbot":
|
141 |
+
tokenizer = quillbot_tokenizer
|
142 |
+
max_tokens = 256
|
143 |
+
|
144 |
+
def add_sentence_to_chunk(sentence):
|
145 |
+
nonlocal current_chunk, current_length
|
146 |
+
sentence_length = get_token_length(tokenizer, sentence)
|
147 |
+
if current_length + sentence_length > max_tokens:
|
148 |
+
chunks.append((current_chunk, current_length))
|
149 |
+
current_chunk = []
|
150 |
+
current_length = 0
|
151 |
+
current_chunk.append(sentence)
|
152 |
+
current_length += sentence_length
|
153 |
|
154 |
+
for sentence in sentences:
|
155 |
+
add_sentence_to_chunk(sentence)
|
156 |
+
if current_chunk:
|
157 |
+
chunks.append((current_chunk, current_length))
|
158 |
+
adjusted_chunks = []
|
159 |
+
while chunks:
|
160 |
+
chunk = chunks.pop(0)
|
161 |
+
if len(chunks) > 0 and chunk[1] < max_tokens / 2:
|
162 |
+
next_chunk = chunks.pop(0)
|
163 |
+
combined_length = chunk[1] + next_chunk[1]
|
164 |
+
if combined_length <= max_tokens:
|
165 |
+
adjusted_chunks.append((chunk[0] + next_chunk[0], combined_length))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
166 |
else:
|
167 |
+
adjusted_chunks.append(chunk)
|
168 |
+
chunks.insert(0, next_chunk)
|
169 |
else:
|
170 |
+
adjusted_chunks.append(chunk)
|
171 |
+
result_chunks = [" ".join(chunk[0]) for chunk in adjusted_chunks]
|
172 |
+
return result_chunks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
173 |
|
174 |
|
175 |
def predict_quillbot(text, bias_buster_selected):
|
|
|
201 |
tokenizer = humanizer_tokenizer_mini
|
202 |
elif model_type == "bc":
|
203 |
cleaning = True
|
204 |
+
max_length = bc_token_size
|
205 |
model = bc_model_mini
|
206 |
tokenizer = bc_tokenizer_mini
|
207 |
else:
|
|
|
252 |
return output_norm
|
253 |
|
254 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
255 |
def predict_bc_scores(input):
|
256 |
bc_scores = []
|
257 |
samples_len_bc = len(
|
|
|
319 |
for score, label in zip(mc_score_list, mc_label_map):
|
320 |
mc_score[label.upper()] = score
|
321 |
|
|
|
|
|
|
|
322 |
sum_prob = 1 - bc_score["HUMAN"]
|
323 |
for key, value in mc_score.items():
|
324 |
mc_score[key] = value * sum_prob
|