aliasgerovs committed on
Commit
3640695
·
2 Parent(s): a1474e2 9c75413

Merge branch 'main' into demo

Browse files
Files changed (7) hide show
  1. analysis.py +27 -9
  2. app.py +11 -0
  3. isotonic_regression_model.joblib +0 -0
  4. plagiarism.py +165 -190
  5. predictors.py +76 -42
  6. requirements.txt +1 -1
  7. utils.py +2 -22
analysis.py CHANGED
@@ -62,7 +62,10 @@ def depth_analysis(input_text):
62
  "punctuation_diversity": (-0.21875, 0.53125),
63
  "type_token_ratio": (0.33002482852189063, 1.0894414982357028),
64
  "calculate_perplexity": (-25.110544681549072, 82.4620680809021),
65
- "calculate_syntactic_tree_depth": (1.8380681818181812, 10.997159090909092),
 
 
 
66
  "hapax_legomena_ratio": (0.0830971690138207, 1.0302715687215778),
67
  "mtld": (-84.03125000000001, 248.81875000000002),
68
  }
@@ -72,14 +75,17 @@ def depth_analysis(input_text):
72
  determiner_use = determiners_frequency(input_text, nlp)
73
  punctuation_variety = punctuation_diversity(input_text)
74
  sentence_depth = calculate_syntactic_tree_depth(input_text, nlp)
75
- perplexity = calculate_perplexity(input_text, gpt2_model, gpt2_tokenizer, device)
 
 
76
  lexical_diversity = type_token_ratio(input_text)
77
  unique_words = hapax_legomena_ratio(input_text)
78
  vocabulary_stability = mtld(input_text)
79
 
80
  # normalize between 0 and 100
81
  vocabulary_level_norm = normalize(
82
- vocabulary_level, *usual_ranges["estimated_slightly_difficult_words_ratio"]
 
83
  )
84
  entity_ratio_norm = normalize(entity_ratio, *usual_ranges["entity_density"])
85
  determiner_use_norm = normalize(
@@ -91,12 +97,18 @@ def depth_analysis(input_text):
91
  lexical_diversity_norm = normalize(
92
  lexical_diversity, *usual_ranges["type_token_ratio"]
93
  )
94
- unique_words_norm = normalize(unique_words, *usual_ranges["hapax_legomena_ratio"])
95
- vocabulary_stability_norm = normalize(vocabulary_stability, *usual_ranges["mtld"])
 
 
 
 
96
  sentence_depth_norm = normalize(
97
  sentence_depth, *usual_ranges["calculate_syntactic_tree_depth"]
98
  )
99
- perplexity_norm = normalize(perplexity, *usual_ranges["calculate_perplexity"])
 
 
100
 
101
  features = {
102
  "Lexical Diversity": lexical_diversity_norm,
@@ -161,7 +173,8 @@ def depth_analysis(input_text):
161
  path=Path.unit_regular_polygon(num_vars),
162
  )
163
  spine.set_transform(
164
- Affine2D().scale(0.5).translate(0.5, 0.5) + self.transAxes
 
165
  )
166
  return {"polar": spine}
167
 
@@ -172,14 +185,19 @@ def depth_analysis(input_text):
172
  theta = radar_factory(N, frame="polygon")
173
  data = features.values()
174
  labels = features.keys()
175
- fig, ax = plt.subplots(subplot_kw=dict(projection="radar"), figsize=(7.5, 5))
 
 
176
  ax.plot(theta, data)
177
  ax.fill(theta, data, alpha=0.4)
178
  ax.set_varlabels(labels)
179
 
180
  rgrids = np.linspace(0, 100, num=6)
181
  ax.set_rgrids(
182
- rgrids, labels=[f"{round(r)}%" for r in rgrids], fontsize=8, color="black"
 
 
 
183
  )
184
  ax.grid(True, color="black", linestyle="-", linewidth=0.5, alpha=0.5)
185
 
 
62
  "punctuation_diversity": (-0.21875, 0.53125),
63
  "type_token_ratio": (0.33002482852189063, 1.0894414982357028),
64
  "calculate_perplexity": (-25.110544681549072, 82.4620680809021),
65
+ "calculate_syntactic_tree_depth": (
66
+ 1.8380681818181812,
67
+ 10.997159090909092,
68
+ ),
69
  "hapax_legomena_ratio": (0.0830971690138207, 1.0302715687215778),
70
  "mtld": (-84.03125000000001, 248.81875000000002),
71
  }
 
75
  determiner_use = determiners_frequency(input_text, nlp)
76
  punctuation_variety = punctuation_diversity(input_text)
77
  sentence_depth = calculate_syntactic_tree_depth(input_text, nlp)
78
+ perplexity = calculate_perplexity(
79
+ input_text, gpt2_model, gpt2_tokenizer, device
80
+ )
81
  lexical_diversity = type_token_ratio(input_text)
82
  unique_words = hapax_legomena_ratio(input_text)
83
  vocabulary_stability = mtld(input_text)
84
 
85
  # normalize between 0 and 100
86
  vocabulary_level_norm = normalize(
87
+ vocabulary_level,
88
+ *usual_ranges["estimated_slightly_difficult_words_ratio"],
89
  )
90
  entity_ratio_norm = normalize(entity_ratio, *usual_ranges["entity_density"])
91
  determiner_use_norm = normalize(
 
97
  lexical_diversity_norm = normalize(
98
  lexical_diversity, *usual_ranges["type_token_ratio"]
99
  )
100
+ unique_words_norm = normalize(
101
+ unique_words, *usual_ranges["hapax_legomena_ratio"]
102
+ )
103
+ vocabulary_stability_norm = normalize(
104
+ vocabulary_stability, *usual_ranges["mtld"]
105
+ )
106
  sentence_depth_norm = normalize(
107
  sentence_depth, *usual_ranges["calculate_syntactic_tree_depth"]
108
  )
109
+ perplexity_norm = normalize(
110
+ perplexity, *usual_ranges["calculate_perplexity"]
111
+ )
112
 
113
  features = {
114
  "Lexical Diversity": lexical_diversity_norm,
 
173
  path=Path.unit_regular_polygon(num_vars),
174
  )
175
  spine.set_transform(
176
+ Affine2D().scale(0.5).translate(0.5, 0.5)
177
+ + self.transAxes
178
  )
179
  return {"polar": spine}
180
 
 
185
  theta = radar_factory(N, frame="polygon")
186
  data = features.values()
187
  labels = features.keys()
188
+ fig, ax = plt.subplots(
189
+ subplot_kw=dict(projection="radar"), figsize=(7.5, 5)
190
+ )
191
  ax.plot(theta, data)
192
  ax.fill(theta, data, alpha=0.4)
193
  ax.set_varlabels(labels)
194
 
195
  rgrids = np.linspace(0, 100, num=6)
196
  ax.set_rgrids(
197
+ rgrids,
198
+ labels=[f"{round(r)}%" for r in rgrids],
199
+ fontsize=8,
200
+ color="black",
201
  )
202
  ax.grid(True, color="black", linestyle="-", linewidth=0.5, alpha=0.5)
203
 
app.py CHANGED
@@ -47,6 +47,7 @@ def main(
47
  month_to,
48
  day_to,
49
  domains_to_skip,
 
50
  ):
51
 
52
  # formatted_tokens = plagiarism_check(
@@ -70,6 +71,7 @@ def main(
70
  month_to,
71
  day_to,
72
  domains_to_skip,
 
73
  )
74
  depth_analysis_plot = depth_analysis(input)
75
  bc_score = predict_bc_scores(input)
@@ -153,6 +155,13 @@ with gr.Blocks() as demo:
153
  plag_option = gr.Radio(
154
  ["Standard", "Advanced"], label="Choose an option please."
155
  )
 
 
 
 
 
 
 
156
 
157
  with gr.Row():
158
  with gr.Column():
@@ -307,6 +316,7 @@ with gr.Blocks() as demo:
307
  month_to,
308
  day_to,
309
  domains_to_skip,
 
310
  ],
311
  outputs=[
312
  bcLabel,
@@ -347,6 +357,7 @@ with gr.Blocks() as demo:
347
  month_to,
348
  day_to,
349
  domains_to_skip,
 
350
  ],
351
  outputs=[
352
  sentenceBreakdown,
 
47
  month_to,
48
  day_to,
49
  domains_to_skip,
50
+ source_block_size,
51
  ):
52
 
53
  # formatted_tokens = plagiarism_check(
 
71
  month_to,
72
  day_to,
73
  domains_to_skip,
74
+ source_block_size,
75
  )
76
  depth_analysis_plot = depth_analysis(input)
77
  bc_score = predict_bc_scores(input)
 
155
  plag_option = gr.Radio(
156
  ["Standard", "Advanced"], label="Choose an option please."
157
  )
158
+ with gr.Row():
159
+ source_block_size = gr.Dropdown(
160
+ choices=["1", "2", "3", "Paragraph"],
161
+ label="Source Check Granularity",
162
+ value="2",
163
+ interactive=True,
164
+ )
165
 
166
  with gr.Row():
167
  with gr.Column():
 
316
  month_to,
317
  day_to,
318
  domains_to_skip,
319
+ source_block_size,
320
  ],
321
  outputs=[
322
  bcLabel,
 
357
  month_to,
358
  day_to,
359
  domains_to_skip,
360
+ source_block_size,
361
  ],
362
  outputs=[
363
  sentenceBreakdown,
isotonic_regression_model.joblib CHANGED
Binary files a/isotonic_regression_model.joblib and b/isotonic_regression_model.joblib differ
 
plagiarism.py CHANGED
@@ -16,37 +16,36 @@ WORD = re.compile(r"\w+")
16
  model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
17
 
18
 
19
- # returns cosine similarity of two vectors
20
- # input: two vectors
21
- # output: integer between 0 and 1.
22
- def get_cosine(vec1, vec2):
23
- intersection = set(vec1.keys()) & set(vec2.keys())
24
-
25
- # calculating numerator
26
- numerator = sum([vec1[x] * vec2[x] for x in intersection])
27
-
28
- # calculating denominator
29
- sum1 = sum([vec1[x] ** 2 for x in vec1.keys()])
30
- sum2 = sum([vec2[x] ** 2 for x in vec2.keys()])
31
- denominator = math.sqrt(sum1) * math.sqrt(sum2)
 
32
 
33
- # checking for divide by zero
34
- if denominator == 0:
35
- return 0.0
36
- else:
37
- return float(numerator) / denominator
 
 
 
38
 
39
 
40
- # converts given text into a vector
41
  def text_to_vector(text):
42
- # uses the Regular expression above and gets all words
43
  words = WORD.findall(text)
44
- # returns a counter of all the words (count of number of occurences)
45
  return Counter(words)
46
 
47
 
48
- # returns cosine similarity of two words
49
- # uses: text_to_vector(text) and get_cosine(v1,v2)
50
  def cosineSim(text1, text2):
51
  vector1 = text_to_vector(text1)
52
  vector2 = text_to_vector(text2)
@@ -55,132 +54,61 @@ def cosineSim(text1, text2):
55
  return cosine
56
 
57
 
58
- def cos_sim_torch(embedding_1, embedding_2):
59
- return util.pytorch_cos_sim(embedding_1, embedding_2).item()
60
-
61
-
62
- def embed_text(text):
63
- return model.encode(text, convert_to_tensor=True)
64
-
65
-
66
- def sentence_similarity(text1, text2):
67
- embedding_1 = model.encode(text1, convert_to_tensor=True)
68
- embedding_2 = model.encode(text2, convert_to_tensor=True)
69
-
70
- o = util.pytorch_cos_sim(embedding_1, embedding_2)
71
- return o.item()
72
 
73
 
74
- def google_search(
75
- plag_option,
76
- sentences,
77
- url_count,
78
- score_array,
79
- url_list,
80
- sorted_date,
81
- domains_to_skip,
82
- api_key,
83
- cse_id,
84
- **kwargs,
85
- ):
86
- service = build("customsearch", "v1", developerKey=api_key)
87
- for i, sentence in enumerate(sentences):
88
- results = (
89
- service.cse()
90
- .list(q=sentence, cx=cse_id, sort=sorted_date, **kwargs)
91
- .execute()
92
- )
93
- if "items" in results and len(results["items"]) > 0:
94
- for count, link in enumerate(results["items"]):
95
- # stop after 3 pages
96
- if count >= 3:
97
- break
98
- # # skip user selected domains
99
- # if any(
100
- # ("." + domain) in link["link"] for domain in domains_to_skip
101
- # ):
102
- # continue
103
- # clean up snippet of '...'
104
- snippet = link["snippet"]
105
- ind = snippet.find("...")
106
- if ind < 20 and ind > 9:
107
- snippet = snippet[ind + len("... ") :]
108
- ind = snippet.find("...")
109
- if ind > len(snippet) - 5:
110
- snippet = snippet[:ind]
111
-
112
- # update cosine similarity between snippet and given text
113
- url = link["link"]
114
- if url not in url_list:
115
- url_list.append(url)
116
- score_array.append([0] * len(sentences))
117
- url_count[url] = url_count[url] + 1 if url in url_count else 1
118
- if plag_option == "Standard":
119
- score_array[url_list.index(url)][i] = cosineSim(
120
- sentence, snippet
121
- )
122
  else:
123
- score_array[url_list.index(url)][i] = sentence_similarity(
124
- sentence, snippet
125
- )
126
- return url_count, score_array
127
 
128
 
129
- def split_sentence_blocks(text):
130
- two_sents = []
131
- for para in text.split("\n\n"):
132
- sents = sent_tokenize(para)
133
- for i in range(len(sents)):
134
- if (i % 2) == 0:
135
- two_sents.append(sents[i])
136
- else:
137
- two_sents[len(two_sents) - 1] += " " + sents[i]
138
- return two_sents
139
 
140
 
141
- months = {
142
- "January": "01",
143
- "February": "02",
144
- "March": "03",
145
- "April": "04",
146
- "May": "05",
147
- "June": "06",
148
- "July": "07",
149
- "August": "08",
150
- "September": "09",
151
- "October": "10",
152
- "November": "11",
153
- "December": "12",
154
- }
155
 
156
 
157
- def build_date(year=2024, month="March", day=1):
158
- return f"{year}{months[month]}{day}"
 
 
 
159
 
160
 
161
  async def get_url_data(url, client):
162
  try:
163
  r = await client.get(url)
164
- # print(r.status_code)
165
  if r.status_code == 200:
166
- # print("in")
167
  soup = BeautifulSoup(r.content, "html.parser")
168
  return soup
169
  except Exception:
170
  return None
171
 
172
 
173
- def remove_punc(text):
174
- res = re.sub(r"[^\w\s]", "", text)
175
- return res
176
-
177
-
178
- def split_ngrams(text, n):
179
- # return n-grams of size n
180
- words = text.split()
181
- return [words[i : i + n] for i in range(len(words) - n + 1)]
182
-
183
-
184
  async def parallel_scrap(urls):
185
  async with httpx.AsyncClient(timeout=30) as client:
186
  tasks = []
@@ -209,11 +137,6 @@ def process_with_multiprocessing(input_data):
209
  return scores
210
 
211
 
212
- def print2d(array):
213
- for row in array:
214
- print(row)
215
-
216
-
217
  def map_sentence_url(sentences, score_array):
218
  sentenceToMaxURL = [-1] * len(sentences)
219
  for j in range(len(sentences)):
@@ -234,65 +157,59 @@ def map_sentence_url(sentences, score_array):
234
  return sentenceToMaxURL
235
 
236
 
237
- def html_highlight(
238
  plag_option,
239
- input,
240
- year_from,
241
- month_from,
242
- day_from,
243
- year_to,
244
- month_to,
245
- day_to,
246
  domains_to_skip,
 
 
 
247
  ):
248
- sentence_scores, url_scores = plagiarism_check(
249
- plag_option,
250
- input,
251
- year_from,
252
- month_from,
253
- day_from,
254
- year_to,
255
- month_to,
256
- day_to,
257
- domains_to_skip,
258
- )
259
- color_map = [
260
- "#cf2323",
261
- "#eb9d59",
262
- "#c2ad36",
263
- "#e1ed72",
264
- "#c2db76",
265
- "#a2db76",
266
- ]
267
- font = "Roboto"
268
- html_content = "<link href='https://fonts.googleapis.com/css?family=Roboto' rel='stylesheet'>\n<div style='font-family: {font}; border: 2px solid black; background-color: #333333; padding: 10px; color: #FFFFFF;'>"
269
- prev_idx = None
270
- combined_sentence = ""
271
- for sentence, _, _, idx in sentence_scores:
272
- if idx != prev_idx and prev_idx is not None:
273
- color = color_map[prev_idx - 1]
274
- index_part = f'<span style="background-color: {color}; padding: 2px;">[{prev_idx}]</span>'
275
- formatted_sentence = f"<p>{combined_sentence} {index_part}</p>"
276
- html_content += formatted_sentence
277
- combined_sentence = ""
278
- combined_sentence += " " + sentence
279
- prev_idx = idx
280
-
281
- if combined_sentence:
282
- color = color_map[prev_idx - 1]
283
- index_part = f'<span style="background-color: {color}; padding: 2px;">[{prev_idx}]</span>'
284
- formatted_sentence = f"<p>{combined_sentence} {index_part}</p>"
285
- html_content += formatted_sentence
286
-
287
- html_content += "<hr>"
288
- for url, score, idx in url_scores:
289
- color = color_map[idx - 1]
290
- formatted_url = f'<p style="background-color: {color}; padding: 5px;">({idx}) <b>{url}</b></p><p> --- Matching Score: {score}%</p>'
291
- html_content += formatted_url
292
-
293
- html_content += "</div>"
294
 
295
- return html_content
 
 
 
 
 
 
 
 
 
 
 
 
 
 
296
 
297
 
298
  def plagiarism_check(
@@ -305,17 +222,20 @@ def plagiarism_check(
305
  month_to,
306
  day_to,
307
  domains_to_skip,
 
308
  ):
309
  api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
310
  api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
 
 
311
  # api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
312
  # api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
313
- # api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
314
  cse_id = "851813e81162b4ed4"
315
 
316
  url_scores = []
317
  sentence_scores = []
318
- sentences = split_sentence_blocks(input)
319
  url_count = {}
320
  score_array = []
321
  url_list = []
@@ -384,3 +304,58 @@ def plagiarism_check(
384
  )
385
 
386
  return sentence_scores, url_scores
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
17
 
18
 
19
+ months = {
20
+ "January": "01",
21
+ "February": "02",
22
+ "March": "03",
23
+ "April": "04",
24
+ "May": "05",
25
+ "June": "06",
26
+ "July": "07",
27
+ "August": "08",
28
+ "September": "09",
29
+ "October": "10",
30
+ "November": "11",
31
+ "December": "12",
32
+ }
33
 
34
+ color_map = [
35
+ "#cf2323",
36
+ "#eb9d59",
37
+ "#c2ad36",
38
+ "#e1ed72",
39
+ "#c2db76",
40
+ "#a2db76",
41
+ ]
42
 
43
 
 
44
  def text_to_vector(text):
 
45
  words = WORD.findall(text)
 
46
  return Counter(words)
47
 
48
 
 
 
49
  def cosineSim(text1, text2):
50
  vector1 = text_to_vector(text1)
51
  vector2 = text_to_vector(text2)
 
54
  return cosine
55
 
56
 
57
def get_cosine(vec1, vec2):
    """Return the cosine similarity of two sparse vectors.

    Args:
        vec1: Mapping from key to numeric weight (e.g. a ``Counter``).
        vec2: Mapping from key to numeric weight.

    Returns:
        The dot product over the keys the two mappings share, divided by the
        product of their Euclidean norms, as a float. Returns ``0.0`` when
        either norm is zero (for instance when a mapping is empty).
    """
    # Only keys present in both vectors contribute to the dot product.
    shared_keys = vec1.keys() & vec2.keys()
    numerator = sum(vec1[key] * vec2[key] for key in shared_keys)

    norm1 = math.sqrt(sum(weight ** 2 for weight in vec1.values()))
    norm2 = math.sqrt(sum(weight ** 2 for weight in vec2.values()))
    denominator = norm1 * norm2

    # Guard against division by zero for empty / all-zero vectors.
    if denominator == 0:
        return 0.0
    return float(numerator) / denominator
 
 
 
 
67
 
68
 
69
def split_sentence_blocks(text, size):
    """Split ``text`` into the blocks that are checked against sources.

    Args:
        text: The full input text.
        size: Either the string ``"Paragraph"`` or a value convertible to a
            positive ``int`` giving the number of sentences per block.

    Returns:
        A list of strings. In ``"Paragraph"`` mode each non-blank line of
        ``text`` is one block. Otherwise the text is split into paragraphs on
        blank lines (``"\\n\\n"``), each paragraph is sentence-tokenized, and
        consecutive runs of up to ``size`` sentences are joined with a single
        space; blocks never span two paragraphs.

    Raises:
        ValueError: If ``size`` is not ``"Paragraph"`` and is not a positive
            integer.
    """
    if size == "Paragraph":
        # Blank lines between paragraphs would otherwise become empty blocks
        # (and therefore empty search queries).
        return [block for block in text.split("\n") if block.strip()]

    size = int(size)
    if size < 1:
        raise ValueError("size must be 'Paragraph' or a positive integer")

    blocks = []
    for para in text.split("\n\n"):
        sents = sent_tokenize(para)
        # Group within the current paragraph only. Indexing ``blocks`` by the
        # within-paragraph position would write into blocks that belong to
        # earlier paragraphs.
        for start in range(0, len(sents), size):
            blocks.append(" ".join(sents[start : start + size]))
    return blocks
 
 
84
 
85
 
86
def build_date(year=2024, month="March", day=1):
    """Return the given date as a compact ``YYYYMMDD`` string.

    Args:
        year: The year; inserted as-is.
        month: English month name; looked up in the module-level ``months``
            table, which maps names to two-digit strings.
        day: Day of the month, as an int or a numeric string.

    Returns:
        ``year`` + two-digit month + two-digit day, e.g. ``"20240301"``.

    Raises:
        KeyError: If ``month`` is not a key of ``months``.
    """
    # Zero-pad the day so single-digit days produce an 8-character date,
    # matching the already zero-padded month.
    return f"{year}{months[month]}{int(day):02d}"
 
 
 
 
 
 
 
 
88
 
89
 
90
def split_ngrams(text, n):
    """Return all runs of ``n`` consecutive whitespace-separated words.

    Each n-gram is a list of words; the result is a list of those lists in
    order of appearance. When ``text`` has fewer than ``n`` words the result
    is empty.
    """
    tokens = text.split()
    window_count = len(tokens) - n + 1
    return [tokens[start : start + n] for start in range(window_count)]
 
 
 
 
 
 
 
 
 
 
 
93
 
94
 
95
def sentence_similarity(text1, text2):
    """Return the embedding cosine similarity of two texts as a Python number.

    Both texts are encoded to tensors with the module-level ``model`` and
    compared with ``util.pytorch_cos_sim``; the single resulting value is
    unwrapped with ``.item()``.
    """
    first_embedding = model.encode(text1, convert_to_tensor=True)
    second_embedding = model.encode(text2, convert_to_tensor=True)
    return util.pytorch_cos_sim(first_embedding, second_embedding).item()
100
 
101
 
102
  async def get_url_data(url, client):
103
  try:
104
  r = await client.get(url)
 
105
  if r.status_code == 200:
 
106
  soup = BeautifulSoup(r.content, "html.parser")
107
  return soup
108
  except Exception:
109
  return None
110
 
111
 
 
 
 
 
 
 
 
 
 
 
 
112
  async def parallel_scrap(urls):
113
  async with httpx.AsyncClient(timeout=30) as client:
114
  tasks = []
 
137
  return scores
138
 
139
 
 
 
 
 
 
140
  def map_sentence_url(sentences, score_array):
141
  sentenceToMaxURL = [-1] * len(sentences)
142
  for j in range(len(sentences)):
 
157
  return sentenceToMaxURL
158
 
159
 
160
def google_search(
    plag_option,
    sentences,
    url_count,
    score_array,
    url_list,
    sorted_date,
    domains_to_skip,
    api_key,
    cse_id,
    **kwargs,
):
    """Search every text block and score the returned result snippets.

    For each entry of ``sentences`` a Custom Search query is executed and the
    first three results are compared with the block.

    Args:
        plag_option: ``"Standard"`` scores with ``cosineSim``; any other
            value scores with ``sentence_similarity``.
        sentences: List of text blocks to search for.
        url_count: Dict mapping URL -> number of times it was returned;
            updated in place.
        score_array: List with one row per known URL and one column per
            block; updated in place (rows are appended for new URLs).
        url_list: List of known URLs, parallel to ``score_array`` rows;
            updated in place.
        sorted_date: Passed to the search call as its ``sort`` argument.
        domains_to_skip: Iterable of domain strings, or ``None``. A result is
            skipped when ``"." + domain`` occurs anywhere in its URL.
        api_key: Developer key used to build the search service.
        cse_id: Search engine id passed as ``cx``.
        **kwargs: Forwarded unchanged to the search ``list`` call.

    Returns:
        The tuple ``(url_count, score_array)`` (the same objects passed in).
    """
    service = build("customsearch", "v1", developerKey=api_key)
    # The scoring function depends only on the option, so choose it once.
    score_fn = cosineSim if plag_option == "Standard" else sentence_similarity

    for i, sentence in enumerate(sentences):
        results = (
            service.cse()
            .list(q=sentence, cx=cse_id, sort=sorted_date, **kwargs)
            .execute()
        )
        # Only the first three results of each query are considered; skipped
        # domains still count towards that limit.
        for link in (results.get("items") or [])[:3]:
            url = link["link"]

            # Skip user-selected domains.
            # NOTE(review): this is a substring test on the whole URL, so it
            # can also match inside the path -- confirm this is intended.
            if domains_to_skip is not None and any(
                ("." + domain) in url for domain in domains_to_skip
            ):
                continue

            # Some results carry no snippet; treat that as empty text rather
            # than failing the whole check with a KeyError.
            snippet = link.get("snippet", "")

            # Drop a short leading fragment that ends in an ellipsis.
            ind = snippet.find("...")
            if 9 < ind < 20:
                snippet = snippet[ind + len("... ") :]
            # Drop an ellipsis sitting at the very end of the snippet. The
            # explicit -1 check keeps very short snippets without any
            # ellipsis from losing their last character.
            ind = snippet.find("...")
            if ind != -1 and ind > len(snippet) - 5:
                snippet = snippet[:ind]

            # Register unseen URLs with an all-zero score row.
            if url not in url_list:
                url_list.append(url)
                score_array.append([0] * len(sentences))
            url_count[url] = url_count.get(url, 0) + 1

            # Record the similarity between this block and the snippet.
            score_array[url_list.index(url)][i] = score_fn(sentence, snippet)
    return url_count, score_array
213
 
214
 
215
  def plagiarism_check(
 
222
  month_to,
223
  day_to,
224
  domains_to_skip,
225
+ source_block_size,
226
  ):
227
  api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
228
  api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
229
+ # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
230
+ # api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
231
  # api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
232
  # api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
233
+ api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
234
  cse_id = "851813e81162b4ed4"
235
 
236
  url_scores = []
237
  sentence_scores = []
238
+ sentences = split_sentence_blocks(input, source_block_size)
239
  url_count = {}
240
  score_array = []
241
  url_list = []
 
304
  )
305
 
306
  return sentence_scores, url_scores
307
+
308
+
309
def html_highlight(
    plag_option,
    input,
    year_from,
    month_from,
    day_from,
    year_to,
    month_to,
    day_to,
    domains_to_skip,
    source_block_size,
):
    """Run the plagiarism check and render its result as an HTML fragment.

    All arguments are forwarded unchanged to ``plagiarism_check``. The
    returned HTML contains one paragraph per run of consecutive sentences
    that share a source index, each followed by a coloured ``[index]`` tag,
    then a horizontal rule and one coloured entry per source URL with its
    matching score.

    Returns:
        The HTML as a single string.
    """
    # Local import so this block does not rely on a file-level import.
    from html import escape

    sentence_scores, url_scores = plagiarism_check(
        plag_option,
        input,
        year_from,
        month_from,
        day_from,
        year_to,
        month_to,
        day_to,
        domains_to_skip,
        source_block_size,
    )

    def color_for(idx):
        # Wrap around the palette so that more sources than colours cannot
        # raise IndexError; indexes within the palette are unaffected.
        return color_map[(idx - 1) % len(color_map)]

    def paragraph(text, idx):
        # The text originates from user input, so escape it before embedding.
        index_part = (
            f'<span style="background-color: {color_for(idx)}; '
            f'padding: 2px;">[{idx}]</span>'
        )
        return f"<p>{escape(text)} {index_part}</p>"

    # The font name is written out directly: the string is not an f-string,
    # so a ``{font}`` placeholder would be emitted literally into the CSS.
    parts = [
        "<link href='https://fonts.googleapis.com/css?family=Roboto' rel='stylesheet'>\n"
        "<div style='font-family: Roboto; border: 2px solid black; "
        "background-color: #333333; padding: 10px; color: #FFFFFF;'>"
    ]

    prev_idx = None
    combined_sentence = ""
    for sentence, _, _, idx in sentence_scores:
        # A change of source index closes the paragraph collected so far.
        if idx != prev_idx and prev_idx is not None:
            parts.append(paragraph(combined_sentence, prev_idx))
            combined_sentence = ""
        combined_sentence += " " + sentence
        prev_idx = idx

    # Flush the final run of sentences.
    if combined_sentence:
        parts.append(paragraph(combined_sentence, prev_idx))

    parts.append("<hr>")
    for url, score, idx in url_scores:
        parts.append(
            f'<p style="background-color: {color_for(idx)}; padding: 5px;">'
            f"({idx}) <b>{escape(url)}</b></p>"
            f"<p> --- Matching Score: {score}%</p>"
        )

    parts.append("</div>")
    return "".join(parts)
predictors.py CHANGED
@@ -1,23 +1,11 @@
1
- import requests
2
- import httpx
3
  import torch
4
- import re
5
- from bs4 import BeautifulSoup
6
  import numpy as np
7
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
8
- import asyncio
9
- from evaluate import load
10
- from datetime import date
11
  import nltk
12
- from transformers import GPT2LMHeadModel, GPT2TokenizerFast
13
- import plotly.graph_objects as go
14
  import torch.nn.functional as F
15
  import nltk
16
- from unidecode import unidecode
17
- import time
18
  from scipy.special import softmax
19
  import yaml
20
- import os
21
  from utils import *
22
  import joblib
23
  from optimum.bettertransformer import BetterTransformer
@@ -64,24 +52,9 @@ tokenizers_1on1 = {}
64
  models_1on1 = {}
65
  for model_name, model in zip(mc_label_map, text_1on1_models):
66
  tokenizers_1on1[model_name] = AutoTokenizer.from_pretrained(model)
67
- models_1on1[model_name] = AutoModelForSequenceClassification.from_pretrained(
68
- model
69
- ).to(device)
70
-
71
-
72
- bias_model_checker = AutoModelForSequenceClassification.from_pretrained(bias_checker_model_name)
73
- tokenizer = AutoTokenizer.from_pretrained(bias_checker_model_name)
74
- bias_model_checker = BetterTransformer.transform(bias_model_checker, keep_original_model=False)
75
- bias_checker = pipeline(
76
- "text-classification",
77
- model=bias_checker_model_name,
78
- tokenizer=bias_checker_model_name,
79
- )
80
- gc.collect()
81
- bias_corrector = pipeline(
82
- "text2text-generation", model=bias_corrector_model_name, accelerator="ort"
83
-
84
- )
85
 
86
  # proxy models for explainability
87
  mini_bc_model_name = "polygraf-ai/bc-model-bert-mini"
@@ -90,7 +63,9 @@ bc_model_mini = AutoModelForSequenceClassification.from_pretrained(
90
  mini_bc_model_name
91
  ).to(device_needed)
92
  mini_humanizer_model_name = "polygraf-ai/quillbot-detector-bert-mini-9K"
93
- humanizer_tokenizer_mini = AutoTokenizer.from_pretrained(mini_humanizer_model_name)
 
 
94
  humanizer_model_mini = AutoModelForSequenceClassification.from_pretrained(
95
  mini_humanizer_model_name
96
  ).to(device_needed)
@@ -289,9 +264,52 @@ def predict_mc(model, tokenizer, text):
289
  output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
290
  return output_norm
291
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
  def predict_bc_scores(input):
293
  bc_scores = []
294
- samples_len_bc = len(split_text_allow_complete_sentences_nltk(input, type_det="bc"))
 
 
295
  segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
296
  for i in range(samples_len_bc):
297
  cleaned_text_bc = remove_special_characters(segments_bc[i])
@@ -300,7 +318,9 @@ def predict_bc_scores(input):
300
  bc_scores_array = np.array(bc_scores)
301
  average_bc_scores = np.mean(bc_scores_array, axis=0)
302
  bc_score_list = average_bc_scores.tolist()
303
- print(f"Original BC scores: AI: {bc_score_list[1]}, HUMAN: {bc_score_list[0]}")
 
 
304
  # isotonic regression calibration
305
  ai_score = iso_reg.predict([bc_score_list[1]])[0]
306
  human_score = 1 - ai_score
@@ -335,7 +355,9 @@ def predict_1on1_combined(input):
335
 
336
 
337
  def predict_1on1_single(input, model):
338
- predictions = predict_1on1(models_1on1[model], tokenizers_1on1[model], input)[1]
 
 
339
  return predictions
340
 
341
 
@@ -347,7 +369,9 @@ def predict_mc_scores(input, models):
347
  print(f"Models to Test: {models}")
348
  # BC SCORE
349
  bc_scores = []
350
- samples_len_bc = len(split_text_allow_complete_sentences_nltk(input, type_det="bc"))
 
 
351
  segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
352
  for i in range(samples_len_bc):
353
  cleaned_text_bc = remove_special_characters(segments_bc[i])
@@ -356,24 +380,30 @@ def predict_mc_scores(input, models):
356
  bc_scores_array = np.array(bc_scores)
357
  average_bc_scores = np.mean(bc_scores_array, axis=0)
358
  bc_score_list = average_bc_scores.tolist()
359
- print(f"Original BC scores: AI: {bc_score_list[1]}, HUMAN: {bc_score_list[0]}")
 
 
360
  # isotonic regression calibration
361
  ai_score = iso_reg.predict([bc_score_list[1]])[0]
362
  human_score = 1 - ai_score
363
  bc_score = {"AI": ai_score, "HUMAN": human_score}
364
  print(f"Calibration BC scores: AI: {ai_score}, HUMAN: {human_score}")
365
-
366
  # MC SCORE
367
  if len(models) > 1:
368
  print("Starting MC")
369
  mc_scores = []
370
- segments_mc = split_text_allow_complete_sentences_nltk(input, type_det="mc")
 
 
371
  samples_len_mc = len(
372
  split_text_allow_complete_sentences_nltk(input, type_det="mc")
373
  )
374
  for i in range(samples_len_mc):
375
  cleaned_text_mc = remove_special_characters(segments_mc[i])
376
- mc_score = predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text_mc)
 
 
377
  mc_scores.append(mc_score)
378
  mc_scores_array = np.array(mc_scores)
379
  average_mc_scores = np.mean(mc_scores_array, axis=0)
@@ -383,7 +413,9 @@ def predict_mc_scores(input, models):
383
  mc_score[label.upper()] = score
384
 
385
  mc_score = {
386
- key: mc_score[key.upper()] for key in models if key.upper() in mc_score
 
 
387
  }
388
  total = sum(mc_score.values())
389
  # Normalize each value by dividing it by the total
@@ -391,14 +423,16 @@ def predict_mc_scores(input, models):
391
  sum_prob = 1 - bc_score["HUMAN"]
392
  for key, value in mc_score.items():
393
  mc_score[key] = value * sum_prob
394
- print('MC Score:',mc_score)
395
  if sum_prob < 0.01:
396
  mc_score = {}
397
 
398
  elif len(models) == 1:
399
  print("Starting 1on1")
400
  mc_scores = []
401
- segments_mc = split_text_allow_complete_sentences_nltk(input, type_det="mc")
 
 
402
  samples_len_mc = len(
403
  split_text_allow_complete_sentences_nltk(input, type_det="mc")
404
  )
 
 
 
1
  import torch
 
 
2
  import numpy as np
3
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
 
 
 
4
  import nltk
 
 
5
  import torch.nn.functional as F
6
  import nltk
 
 
7
  from scipy.special import softmax
8
  import yaml
 
9
  from utils import *
10
  import joblib
11
  from optimum.bettertransformer import BetterTransformer
 
52
  models_1on1 = {}
53
  for model_name, model in zip(mc_label_map, text_1on1_models):
54
  tokenizers_1on1[model_name] = AutoTokenizer.from_pretrained(model)
55
+ models_1on1[model_name] = (
56
+ AutoModelForSequenceClassification.from_pretrained(model).to(device)
57
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
  # proxy models for explainability
60
  mini_bc_model_name = "polygraf-ai/bc-model-bert-mini"
 
63
  mini_bc_model_name
64
  ).to(device_needed)
65
  mini_humanizer_model_name = "polygraf-ai/quillbot-detector-bert-mini-9K"
66
+ humanizer_tokenizer_mini = AutoTokenizer.from_pretrained(
67
+ mini_humanizer_model_name
68
+ )
69
  humanizer_model_mini = AutoModelForSequenceClassification.from_pretrained(
70
  mini_humanizer_model_name
71
  ).to(device_needed)
 
264
  output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
265
  return output_norm
266
 
267
+
268
def predict_mc_scores(input):
    """Return per-label multi-class scores scaled by the non-human share.

    The text is split into "bc" segments and into "mc" segments with
    ``split_text_allow_complete_sentences_nltk``; each segment is cleaned
    with ``remove_special_characters`` and scored with ``predict_bc`` /
    ``predict_mc`` respectively, and the per-segment scores are averaged.
    Every averaged multi-class score is then multiplied by
    ``1 - HUMAN`` (the averaged, uncalibrated binary score at index 0).

    Returns:
        Dict mapping each upper-cased label of ``mc_label_map`` to its scaled
        score, or an empty dict when ``1 - HUMAN`` is below ``0.01``.

    NOTE(review): a second definition ``predict_mc_scores(input, models)``
    appears later in this file's diff and would replace this one at import
    time -- confirm which definition is meant to survive the merge.
    """
    # Binary (AI vs. human) score: split once and score every segment.
    segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
    bc_scores = [
        predict_bc(
            text_bc_model,
            text_bc_tokenizer,
            remove_special_characters(segment),
        )
        for segment in segments_bc
    ]
    bc_score_list = np.mean(np.array(bc_scores), axis=0).tolist()
    bc_score = {"AI": bc_score_list[1], "HUMAN": bc_score_list[0]}

    # Multi-class score, computed the same way on the "mc" segmentation.
    segments_mc = split_text_allow_complete_sentences_nltk(input, type_det="mc")
    mc_scores = [
        predict_mc(
            text_mc_model,
            text_mc_tokenizer,
            remove_special_characters(segment),
        )
        for segment in segments_mc
    ]
    mc_score_list = np.mean(np.array(mc_scores), axis=0).tolist()

    # Scale the class scores by the share not attributed to a human author.
    sum_prob = 1 - bc_score["HUMAN"]
    if sum_prob < 0.01:
        return {}
    return {
        label.upper(): score * sum_prob
        for score, label in zip(mc_score_list, mc_label_map)
    }
306
+
307
+
308
  def predict_bc_scores(input):
309
  bc_scores = []
310
+ samples_len_bc = len(
311
+ split_text_allow_complete_sentences_nltk(input, type_det="bc")
312
+ )
313
  segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
314
  for i in range(samples_len_bc):
315
  cleaned_text_bc = remove_special_characters(segments_bc[i])
 
318
  bc_scores_array = np.array(bc_scores)
319
  average_bc_scores = np.mean(bc_scores_array, axis=0)
320
  bc_score_list = average_bc_scores.tolist()
321
+ print(
322
+ f"Original BC scores: AI: {bc_score_list[1]}, HUMAN: {bc_score_list[0]}"
323
+ )
324
  # isotonic regression calibration
325
  ai_score = iso_reg.predict([bc_score_list[1]])[0]
326
  human_score = 1 - ai_score
 
355
 
356
 
357
  def predict_1on1_single(input, model):
358
+ predictions = predict_1on1(
359
+ models_1on1[model], tokenizers_1on1[model], input
360
+ )[1]
361
  return predictions
362
 
363
 
 
369
  print(f"Models to Test: {models}")
370
  # BC SCORE
371
  bc_scores = []
372
+ samples_len_bc = len(
373
+ split_text_allow_complete_sentences_nltk(input, type_det="bc")
374
+ )
375
  segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
376
  for i in range(samples_len_bc):
377
  cleaned_text_bc = remove_special_characters(segments_bc[i])
 
380
  bc_scores_array = np.array(bc_scores)
381
  average_bc_scores = np.mean(bc_scores_array, axis=0)
382
  bc_score_list = average_bc_scores.tolist()
383
+ print(
384
+ f"Original BC scores: AI: {bc_score_list[1]}, HUMAN: {bc_score_list[0]}"
385
+ )
386
  # isotonic regression calibration
387
  ai_score = iso_reg.predict([bc_score_list[1]])[0]
388
  human_score = 1 - ai_score
389
  bc_score = {"AI": ai_score, "HUMAN": human_score}
390
  print(f"Calibration BC scores: AI: {ai_score}, HUMAN: {human_score}")
391
+
392
  # MC SCORE
393
  if len(models) > 1:
394
  print("Starting MC")
395
  mc_scores = []
396
+ segments_mc = split_text_allow_complete_sentences_nltk(
397
+ input, type_det="mc"
398
+ )
399
  samples_len_mc = len(
400
  split_text_allow_complete_sentences_nltk(input, type_det="mc")
401
  )
402
  for i in range(samples_len_mc):
403
  cleaned_text_mc = remove_special_characters(segments_mc[i])
404
+ mc_score = predict_mc(
405
+ text_mc_model, text_mc_tokenizer, cleaned_text_mc
406
+ )
407
  mc_scores.append(mc_score)
408
  mc_scores_array = np.array(mc_scores)
409
  average_mc_scores = np.mean(mc_scores_array, axis=0)
 
413
  mc_score[label.upper()] = score
414
 
415
  mc_score = {
416
+ key: mc_score[key.upper()]
417
+ for key in models
418
+ if key.upper() in mc_score
419
  }
420
  total = sum(mc_score.values())
421
  # Normalize each value by dividing it by the total
 
423
  sum_prob = 1 - bc_score["HUMAN"]
424
  for key, value in mc_score.items():
425
  mc_score[key] = value * sum_prob
426
+ print("MC Score:", mc_score)
427
  if sum_prob < 0.01:
428
  mc_score = {}
429
 
430
  elif len(models) == 1:
431
  print("Starting 1on1")
432
  mc_scores = []
433
+ segments_mc = split_text_allow_complete_sentences_nltk(
434
+ input, type_det="mc"
435
+ )
436
  samples_len_mc = len(
437
  split_text_allow_complete_sentences_nltk(input, type_det="mc")
438
  )
requirements.txt CHANGED
@@ -16,7 +16,7 @@ joblib
16
  evaluate
17
  tensorflow
18
  keras
19
- spacy
20
  textstat
21
  plotly
22
  tqdm
 
16
  evaluate
17
  tensorflow
18
  keras
19
+ spacy==3.7.2
20
  textstat
21
  plotly
22
  tqdm
utils.py CHANGED
@@ -1,28 +1,11 @@
1
- from urllib.request import urlopen, Request
2
- from googleapiclient.discovery import build
3
- import requests
4
- import httpx
5
  import re
6
- from bs4 import BeautifulSoup
7
- import re, math
8
- from collections import Counter
9
- import numpy as np
10
- import asyncio
11
- import nltk
12
  from sentence_transformers import SentenceTransformer, util
13
- import threading
14
- import torch
15
  import re
16
- import numpy as np
17
- import asyncio
18
- from datetime import date
19
- import nltk
20
  from unidecode import unidecode
21
- from scipy.special import softmax
22
  from transformers import AutoTokenizer
23
  import yaml
24
  import fitz
25
- import os
26
 
27
 
28
  def remove_accents(input_str):
@@ -63,9 +46,6 @@ def update_character_count(text):
63
  return f"{len(text)} characters"
64
 
65
 
66
- nltk.download("punkt")
67
-
68
-
69
  with open("config.yaml", "r") as file:
70
  params = yaml.safe_load(file)
71
 
@@ -92,4 +72,4 @@ def extract_text_from_pdf(pdf_path):
92
 
93
 
94
  WORD = re.compile(r"\w+")
95
- model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
 
 
 
 
 
1
  import re
2
+ import re
 
 
 
 
 
3
  from sentence_transformers import SentenceTransformer, util
 
 
4
  import re
 
 
 
 
5
  from unidecode import unidecode
 
6
  from transformers import AutoTokenizer
7
  import yaml
8
  import fitz
 
9
 
10
 
11
  def remove_accents(input_str):
 
46
  return f"{len(text)} characters"
47
 
48
 
 
 
 
49
  with open("config.yaml", "r") as file:
50
  params = yaml.safe_load(file)
51
 
 
72
 
73
 
74
  WORD = re.compile(r"\w+")
75
+ model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")