aliasgerovs committed
Commit 79b97e2
1 Parent(s): 1797f70
Files changed (4)
  1. app.py +8 -2
  2. predictors.py +55 -0
  3. requirements.txt +3 -1
  4. utils.py +20 -5
app.py CHANGED
@@ -2,6 +2,7 @@ import gradio as gr
 import numpy as np
 from datetime import date
 from predictors import predict_bc_scores, predict_mc_scores
+from predictors import update, correct_text, split_text
 from analysis import depth_analysis
 from predictors import predict_quillbot
 from plagiarism import plagiarism_check, build_date
@@ -112,6 +113,12 @@ with gr.Blocks() as demo:
     char_count = gr.Textbox(label="Minimum Character Limit Check")
     input_text.change(fn=len_validator, inputs=input_text, outputs=char_count)
 
+    with gr.Row():
+        btn = gr.Button("Bias Buster")
+        out = gr.Textbox(label="Bias Corrected Full Input", interactive=False)
+        corrections_output = gr.Textbox(label="Bias Corrections", interactive=False)
+    btn.click(fn=update, inputs=input_text, outputs=[out, corrections_output])
+
     with gr.Row():
         models = gr.Dropdown(
             model_list,
@@ -310,6 +317,5 @@ with gr.Blocks() as demo:
     date_from = ""
     date_to = ""
 
-
 if __name__ == "__main__":
-    demo.launch(share=True, server_name="0.0.0.0", auth=("polygraf-admin", "test@aisd"))
+    demo.launch(share=True, server_name="0.0.0.0", server_port=80, auth=("polygraf-admin", "test@aisd"))
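A note on the new wiring: Gradio maps update's two return values onto the two output Textboxes in the order listed in outputs, so the handler must return a (corrected_text, corrections_display) pair. A minimal standalone sketch of the same pattern (placeholder_update below is an illustrative stand-in, not part of this commit):

    import gradio as gr

    def placeholder_update(text: str):
        # Stand-in for predictors.update: must return one value per
        # component listed in outputs.
        corrected = text.strip()
        corrections = f"Original: {text}\nCorrected: {corrected}"
        return corrected, corrections

    with gr.Blocks() as demo:
        input_text = gr.Textbox(label="Input")
        with gr.Row():
            btn = gr.Button("Bias Buster")
            out = gr.Textbox(label="Bias Corrected Full Input", interactive=False)
            corrections_output = gr.Textbox(label="Bias Corrections", interactive=False)
        btn.click(fn=placeholder_update, inputs=input_text, outputs=[out, corrections_output])

    demo.launch()

Also note that server_port=80 binds a privileged port: without root or a container port mapping, the launch will typically fail with a permissions error, so the deployment environment is presumably expected to handle this.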
predictors.py CHANGED
@@ -21,6 +21,15 @@ import os
 from utils import *
 import joblib
 from optimum.bettertransformer import BetterTransformer
+import gc
+from cleantext import clean
+import gradio as gr
+from tqdm.auto import tqdm
+from transformers import pipeline
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+import nltk
+from nltk.tokenize import sent_tokenize
+from optimum.pipelines import pipeline
 
 with open("config.yaml", "r") as file:
     params = yaml.safe_load(file)
@@ -37,6 +46,8 @@ mc_label_map = params["MC_OUTPUT_LABELS"]
 text_1on1_label_map = params["1ON1_OUTPUT_LABELS"]
 mc_token_size = int(params["MC_TOKEN_SIZE"])
 bc_token_size = int(params["BC_TOKEN_SIZE"])
+bias_checker_model_name = params["BIAS_CHECKER_MODEL_PATH"]
+bias_corrector_model_name = params["BIAS_CORRECTOR_MODEL_PATH"]
 text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
 text_bc_model = AutoModelForSequenceClassification.from_pretrained(
     text_bc_model_path
@@ -57,6 +68,21 @@ for model_name, model in zip(mc_label_map, text_1on1_models):
         model
     ).to(device)
 
+
+bias_model_checker = AutoModelForSequenceClassification.from_pretrained(bias_checker_model_name)
+tokenizer = AutoTokenizer.from_pretrained(bias_checker_model_name)
+bias_model_checker = BetterTransformer.transform(bias_model_checker, keep_original_model=False)
+bias_checker = pipeline(
+    "text-classification",
+    model=bias_model_checker,
+    tokenizer=tokenizer,
+)
+gc.collect()
+bias_corrector = pipeline(
+    "text2text-generation", model=bias_corrector_model_name, accelerator="ort",
+)
+
 # proxy models for explainability
 mini_bc_model_name = "polygraf-ai/bc-model-bert-mini"
 bc_tokenizer_mini = AutoTokenizer.from_pretrained(mini_bc_model_name)
@@ -79,6 +105,34 @@ quillbot_model = BetterTransformer.transform(quillbot_model)
 iso_reg = joblib.load("isotonic_regression_model.joblib")
 
 
+def split_text(text: str) -> list:
+    sentences = sent_tokenize(text)
+    return [[sentence] for sentence in sentences]
+
+def correct_text(text: str, bias_checker, bias_corrector, separator: str = " ") -> tuple:
+    sentence_batches = split_text(text)
+    corrected_text = []
+    corrections = []
+    for batch in tqdm(sentence_batches, total=len(sentence_batches), desc="correcting text.."):
+        raw_text = " ".join(batch)
+        results = bias_checker(raw_text)
+        if results[0]["label"] != "LABEL_1" or (results[0]["label"] == "LABEL_1" and results[0]["score"] < 0.9):
+            corrected_batch = bias_corrector(raw_text)
+            corrected_version = corrected_batch[0]["generated_text"]
+            corrected_text.append(corrected_version)
+            corrections.append((raw_text, corrected_version))
+        else:
+            corrected_text.append(raw_text)
+    corrected_text = separator.join(corrected_text)
+    return corrected_text, corrections
+
+def update(text: str):
+    text = clean(text, lower=False)
+    corrected_text, corrections = correct_text(text, bias_checker, bias_corrector)
+    corrections_display = "\n\n".join([f"Original: {orig}\nCorrected: {corr}" for orig, corr in corrections])
+    return corrected_text, corrections_display
+
+
 def split_text_allow_complete_sentences_nltk(
     text,
     max_length=256,
@@ -252,6 +306,7 @@ def predict_bc_scores(input):
     human_score = 1 - ai_score
     bc_score = {"AI": ai_score, "HUMAN": human_score}
     print(f"Calibration BC scores: AI: {ai_score}, HUMAN: {human_score}")
+    print(f"Input Text: {cleaned_text_bc}")
     return bc_score
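The gate in correct_text leaves a sentence untouched only when the checker is confident it is already unbiased: any label other than LABEL_1, or a LABEL_1 score below 0.9, routes the sentence through the corrector. The condition simplifies to label != "LABEL_1" or score < 0.9, restated here as a self-contained sketch (needs_correction is a hypothetical helper, not in the commit):

    def needs_correction(result: dict, keep_label: str = "LABEL_1", threshold: float = 0.9) -> bool:
        # Mirrors the branch in correct_text: only a confident
        # keep_label prediction skips the corrector.
        return result["label"] != keep_label or result["score"] < threshold

    # The three cases the branch distinguishes:
    assert needs_correction({"label": "LABEL_0", "score": 0.99})      # other label: correct
    assert needs_correction({"label": "LABEL_1", "score": 0.50})      # low confidence: correct
    assert not needs_correction({"label": "LABEL_1", "score": 0.95})  # confident keep: skip

Note also that corrections records only the sentences that were rewritten, so the "Bias Corrections" box lists just the changed pairs, while the full joined text appears in the other output.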
requirements.txt CHANGED
@@ -26,4 +26,6 @@ Unidecode
 python-dotenv
 lime
 joblib
-optimum
+optimum
+clean-text
+optimum[onnxruntime]
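The two additions back the new imports in predictors.py: clean-text provides cleantext.clean, and optimum[onnxruntime] pulls in the ONNX Runtime backend that optimum.pipelines.pipeline targets via accelerator="ort". A quick import smoke test (illustrative, not part of the commit):

    # Confirm the new dependencies resolve before deploying.
    from cleantext import clean              # from clean-text
    from optimum.pipelines import pipeline   # needs optimum[onnxruntime]

    print(clean("  some   messy   text  ", lower=False))  # whitespace normalized, case preserved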
utils.py CHANGED
@@ -31,13 +31,28 @@ def remove_accents(input_str):
 
 
 def remove_special_characters(text):
-    text = text.replace("<s>", "").replace("</s>", "")
-    text = remove_accents(text)
-    pattern = r'[^\w\s\d.,!?\'"()-;]+'
-    text = re.sub(pattern, "", text)
+    text = re.sub(r'https?://\S+|www\.\S+', '', text)
+    emoji_pattern = re.compile("["
+        u"\U0001F600-\U0001F64F"  # emoticons
+        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
+        u"\U0001F680-\U0001F6FF"  # transport & map symbols
+        u"\U0001F700-\U0001F77F"  # alchemical symbols
+        u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
+        u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
+        u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
+        u"\U0001FA00-\U0001FA6F"  # Chess Symbols
+        u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
+        u"\U00002702-\U000027B0"  # Dingbats
+        u"\U000024C2-\U0001F251"
+        "]+", flags=re.UNICODE)
+    text = emoji_pattern.sub('', text)
+    text = re.sub(r'#\w+', '', text)
+    text = re.sub(r'[^\w\s\d.,!?\'"()-;]', '', text)
+    text = re.sub(r'\s+([.,!?;])', r'\1', text)
+    text = re.sub(r'([.,!?;])(\S)', r'\1 \2', text)
+    text = re.sub(r'\s+', ' ', text).strip()
     return text
 
-
 def remove_special_characters_2(text):
     pattern = r"[^a-zA-Z0-9 ]+"
     text = re.sub(pattern, "", text)
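The rewritten remove_special_characters drops the old <s>/</s> and accent stripping in favor of social-media cleanup: it removes URLs, emoji, and hashtags, then filters stray symbols and normalizes punctuation spacing and whitespace. Traced by hand through the regexes above (illustrative; emoji coverage depends on the listed ranges):

    text = "Check this out https://example.com 😀 #hype ,  really !"
    print(remove_special_characters(text))
    # "Check this out, really!"

One behavioral consequence worth noting: since remove_accents is no longer called, accented characters now pass through unchanged (they match \w under Python 3's default Unicode matching).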