Spaces:
Running
Running
update HTML viewer
Browse files- app.py +0 -10
- plagiarism.py +55 -83
app.py
CHANGED
@@ -224,16 +224,6 @@ with gr.Blocks() as demo:
|
|
224 |
|
225 |
with gr.Row():
|
226 |
with gr.Column():
|
227 |
-
# sentenceBreakdown = gr.HighlightedText(
|
228 |
-
# label="Source Detection Sentence Breakdown",
|
229 |
-
# combine_adjacent=True,
|
230 |
-
# color_map={
|
231 |
-
# "[1]": "red",
|
232 |
-
# "[2]": "orange",
|
233 |
-
# "[3]": "yellow",
|
234 |
-
# "[4]": "green",
|
235 |
-
# },
|
236 |
-
# )
|
237 |
sentenceBreakdown = gr.HTML(
|
238 |
label="Source Detection Sentence Breakdown",
|
239 |
value="Source Detection Sentence Breakdown",
|
|
|
224 |
|
225 |
with gr.Row():
|
226 |
with gr.Column():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
227 |
sentenceBreakdown = gr.HTML(
|
228 |
label="Source Detection Sentence Breakdown",
|
229 |
value="Source Detection Sentence Breakdown",
|
plagiarism.py
CHANGED
@@ -19,8 +19,6 @@ model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
|
|
19 |
# returns cosine similarity of two vectors
|
20 |
# input: two vectors
|
21 |
# output: float between 0 and 1.
|
22 |
-
|
23 |
-
|
24 |
def get_cosine(vec1, vec2):
|
25 |
intersection = set(vec1.keys()) & set(vec2.keys())
|
26 |
|
@@ -129,14 +127,14 @@ def google_search(
|
|
129 |
|
130 |
|
131 |
def split_sentence_blocks(text):
|
132 |
-
|
133 |
-
sents = sent_tokenize(text)
|
134 |
two_sents = []
|
135 |
-
for
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
|
|
|
|
140 |
return two_sents
|
141 |
|
142 |
|
@@ -216,6 +214,26 @@ def print2d(array):
|
|
216 |
print(row)
|
217 |
|
218 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
219 |
def html_highlight(
|
220 |
plag_option,
|
221 |
input,
|
@@ -239,24 +257,38 @@ def html_highlight(
|
|
239 |
domains_to_skip,
|
240 |
)
|
241 |
color_map = [
|
242 |
-
"#
|
243 |
"#eb9d59",
|
244 |
"#c2ad36",
|
245 |
"#e1ed72",
|
246 |
"#c2db76",
|
247 |
"#a2db76",
|
248 |
]
|
249 |
-
|
|
|
|
|
|
|
250 |
for sentence, _, _, idx in sentence_scores:
|
251 |
-
|
252 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
253 |
html_content += formatted_sentence
|
254 |
|
255 |
html_content += "<hr>"
|
256 |
for url, score, idx in url_scores:
|
257 |
color = color_map[idx - 1]
|
258 |
-
|
259 |
-
html_content +=
|
260 |
|
261 |
html_content += "</div>"
|
262 |
|
@@ -278,13 +310,11 @@ def plagiarism_check(
|
|
278 |
api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
|
279 |
api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
|
280 |
# api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
|
281 |
-
api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
|
282 |
cse_id = "851813e81162b4ed4"
|
283 |
|
284 |
url_scores = []
|
285 |
sentence_scores = []
|
286 |
-
# for input in input.split("\n\n"):
|
287 |
-
print(input)
|
288 |
sentences = split_sentence_blocks(input)
|
289 |
url_count = {}
|
290 |
score_array = []
|
@@ -305,21 +335,7 @@ def plagiarism_check(
|
|
305 |
cse_id,
|
306 |
)
|
307 |
# Scrape URLs in list
|
308 |
-
formatted_tokens = []
|
309 |
soups = asyncio.run(parallel_scrap(url_list))
|
310 |
-
|
311 |
-
# # Populate matching scores for scrapped pages
|
312 |
-
# for i, soup in enumerate(soups):
|
313 |
-
# print(f"Analyzing {i+1} of {len(soups)} soups........................")
|
314 |
-
# if soup:
|
315 |
-
# page_content = soup.text
|
316 |
-
|
317 |
-
# for j, sent in enumerate(sentences):
|
318 |
-
# args_list = (sent, page_content)
|
319 |
-
# score = matching_score(args_list)
|
320 |
-
# # score = cos_sim_torch(embed_text(sent), source_embeddings[i])
|
321 |
-
# score_array[i][j] = score
|
322 |
-
|
323 |
input_data = []
|
324 |
for i, soup in enumerate(soups):
|
325 |
if soup:
|
@@ -336,29 +352,7 @@ def plagiarism_check(
|
|
336 |
score_array[i][j] = scores[k]
|
337 |
k += 1
|
338 |
|
339 |
-
|
340 |
-
# for consecutive sentences
|
341 |
-
sentenceToMaxURL = [-1] * len(sentences)
|
342 |
-
for j in range(len(sentences)):
|
343 |
-
if j > 0:
|
344 |
-
maxScore = score_array[sentenceToMaxURL[j - 1]][j]
|
345 |
-
sentenceToMaxURL[j] = sentenceToMaxURL[j - 1]
|
346 |
-
else:
|
347 |
-
maxScore = -1
|
348 |
-
|
349 |
-
for i in range(len(score_array)):
|
350 |
-
margin = (
|
351 |
-
0.05
|
352 |
-
if (j > 0 and sentenceToMaxURL[j] == sentenceToMaxURL[j - 1])
|
353 |
-
else 0
|
354 |
-
)
|
355 |
-
if score_array[i][j] - maxScore > margin:
|
356 |
-
maxScore = score_array[i][j]
|
357 |
-
sentenceToMaxURL[j] = i
|
358 |
-
# if score_array[i][j] > maxScore:
|
359 |
-
# maxScore = score_array[i][j]
|
360 |
-
# sentenceToMaxURL[j] = i
|
361 |
-
|
362 |
index = np.unique(sentenceToMaxURL)
|
363 |
|
364 |
url_source = {}
|
@@ -369,13 +363,12 @@ def plagiarism_check(
|
|
369 |
if sentenceToMaxURL[sen] == url
|
370 |
]
|
371 |
url_source[url] = sum(s) / len(s)
|
372 |
-
|
373 |
index_descending = sorted(url_source, key=url_source.get, reverse=True)
|
374 |
-
|
375 |
urlMap = {}
|
376 |
for count, i in enumerate(index_descending):
|
377 |
urlMap[i] = count + 1
|
378 |
|
|
|
379 |
for i, sent in enumerate(sentences):
|
380 |
ind = sentenceToMaxURL[i]
|
381 |
if url_source[ind] > 0.1:
|
@@ -383,32 +376,11 @@ def plagiarism_check(
|
|
383 |
[sent, url_source[ind], url_list[ind], urlMap[ind]]
|
384 |
)
|
385 |
else:
|
386 |
-
sentence_scores.append([sent, None, url_list[ind],
|
387 |
for ind in index_descending:
|
388 |
-
|
389 |
-
|
390 |
-
|
|
|
391 |
|
392 |
return sentence_scores, url_scores
|
393 |
-
|
394 |
-
# for i, sent in enumerate(sentences):
|
395 |
-
# formatted_tokens.append(
|
396 |
-
# (sent, "[" + str(urlMap[sentenceToMaxURL[i]]) + "]")
|
397 |
-
# )
|
398 |
-
|
399 |
-
# formatted_tokens.append(("\n", None))
|
400 |
-
# formatted_tokens.append(("\n", None))
|
401 |
-
# formatted_tokens.append(("\n", None))
|
402 |
-
|
403 |
-
# for ind in index_descending:
|
404 |
-
# formatted_tokens.append(
|
405 |
-
# (
|
406 |
-
# url_list[ind]
|
407 |
-
# + " --- Matching Score: "
|
408 |
-
# + f"{str(round(url_source[ind] * 100, 2))}%",
|
409 |
-
# "[" + str(urlMap[ind]) + "]",
|
410 |
-
# )
|
411 |
-
# )
|
412 |
-
# formatted_tokens.append(("\n", None))
|
413 |
-
|
414 |
-
# return formatted_tokens
|
|
|
19 |
# returns cosine similarity of two vectors
|
20 |
# input: two vectors
|
21 |
# output: float between 0 and 1.
|
|
|
|
|
22 |
def get_cosine(vec1, vec2):
|
23 |
intersection = set(vec1.keys()) & set(vec2.keys())
|
24 |
|
|
|
127 |
|
128 |
|
129 |
def split_sentence_blocks(text):
    """Split *text* into blocks of at most two consecutive sentences.

    The text is divided into paragraphs wherever two newline characters
    appear in a row. Each paragraph is tokenized with ``sent_tokenize`` and
    its sentences are joined pairwise with a single space. Pairs never span
    a paragraph boundary, so a paragraph with an odd number of sentences
    ends with a one-sentence block.

    Returns:
        A list of strings, one per block, in document order.
    """
    two_sents = []
    for para in text.split("\n\n"):
        sents = sent_tokenize(para)
        # Walk the sentences two at a time; the slice naturally yields a
        # single sentence when the paragraph has an odd count.
        for start in range(0, len(sents), 2):
            two_sents.append(" ".join(sents[start:start + 2]))
    return two_sents
|
139 |
|
140 |
|
|
|
214 |
print(row)
|
215 |
|
216 |
|
217 |
+
def map_sentence_url(sentences, score_array):
    """Pick, for every sentence, the index of its best-matching URL.

    ``score_array[i][j]`` is read as the score of URL ``i`` against
    sentence ``j``. Sentences are processed in order. Each sentence starts
    out assigned to the URL chosen for the previous sentence, and another
    URL only takes over if it beats the current best score by more than
    0.05 while the previous sentence's URL is still the current choice.
    Once the choice has moved away from that URL, any strictly higher score
    wins.

    Args:
        sentences: sequence of sentence blocks; only its length is used.
        score_array: indexable as ``score_array[url_index][sentence_index]``.

    Returns:
        A list with one URL index per sentence. An entry is ``-1`` when no
        URL could be selected (for example when ``score_array`` has no rows).
    """
    switch_margin = 0.05
    sentenceToMaxURL = [-1] * len(sentences)

    for j in range(len(sentences)):
        prev_url = sentenceToMaxURL[j - 1] if j > 0 else -1

        if prev_url >= 0:
            # Start from the previous sentence's URL so that consecutive
            # sentences stay attributed to the same source by default.
            best_url = prev_url
            best_score = score_array[prev_url][j]
        else:
            # No usable previous choice. The -1 sentinel must not be used as
            # an index: it would silently read the last URL's row, or raise
            # IndexError when there are no rows at all.
            best_url = -1
            best_score = -1

        for i, row in enumerate(score_array):
            # The margin applies only while the carried-over URL is still
            # the current choice.
            sticking = prev_url >= 0 and best_url == prev_url
            margin = switch_margin if sticking else 0
            if row[j] - best_score > margin:
                best_score = row[j]
                best_url = i

        sentenceToMaxURL[j] = best_url

    return sentenceToMaxURL
|
235 |
+
|
236 |
+
|
237 |
def html_highlight(
|
238 |
plag_option,
|
239 |
input,
|
|
|
257 |
domains_to_skip,
|
258 |
)
|
259 |
color_map = [
|
260 |
+
"#cf2323",
|
261 |
"#eb9d59",
|
262 |
"#c2ad36",
|
263 |
"#e1ed72",
|
264 |
"#c2db76",
|
265 |
"#a2db76",
|
266 |
]
|
267 |
+
font = "Roboto"
|
268 |
+
html_content = "<link href='https://fonts.googleapis.com/css?family=Roboto' rel='stylesheet'>\n<div style='font-family: {font}; border: 2px solid black; background-color: #333333; padding: 10px; color: #FFFFFF;'>"
|
269 |
+
prev_idx = None
|
270 |
+
combined_sentence = ""
|
271 |
for sentence, _, _, idx in sentence_scores:
|
272 |
+
if idx != prev_idx and prev_idx is not None:
|
273 |
+
color = color_map[prev_idx - 1]
|
274 |
+
index_part = f'<span style="background-color: {color}; padding: 2px;">[{prev_idx}]</span>'
|
275 |
+
formatted_sentence = f"<p>{combined_sentence} {index_part}</p>"
|
276 |
+
html_content += formatted_sentence
|
277 |
+
combined_sentence = ""
|
278 |
+
combined_sentence += " " + sentence
|
279 |
+
prev_idx = idx
|
280 |
+
|
281 |
+
if combined_sentence:
|
282 |
+
color = color_map[prev_idx - 1]
|
283 |
+
index_part = f'<span style="background-color: {color}; padding: 2px;">[{prev_idx}]</span>'
|
284 |
+
formatted_sentence = f"<p>{combined_sentence} {index_part}</p>"
|
285 |
html_content += formatted_sentence
|
286 |
|
287 |
html_content += "<hr>"
|
288 |
for url, score, idx in url_scores:
|
289 |
color = color_map[idx - 1]
|
290 |
+
formatted_url = f'<p style="background-color: {color}; padding: 5px;">({idx}) <b>{url}</b></p><p> --- Matching Score: {score}%</p>'
|
291 |
+
html_content += formatted_url
|
292 |
|
293 |
html_content += "</div>"
|
294 |
|
|
|
310 |
api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
|
311 |
api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
|
312 |
# api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
|
313 |
+
# api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
|
314 |
cse_id = "851813e81162b4ed4"
|
315 |
|
316 |
url_scores = []
|
317 |
sentence_scores = []
|
|
|
|
|
318 |
sentences = split_sentence_blocks(input)
|
319 |
url_count = {}
|
320 |
score_array = []
|
|
|
335 |
cse_id,
|
336 |
)
|
337 |
# Scrape URLs in list
|
|
|
338 |
soups = asyncio.run(parallel_scrap(url_list))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
339 |
input_data = []
|
340 |
for i, soup in enumerate(soups):
|
341 |
if soup:
|
|
|
352 |
score_array[i][j] = scores[k]
|
353 |
k += 1
|
354 |
|
355 |
+
sentenceToMaxURL = map_sentence_url(sentences, score_array)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
356 |
index = np.unique(sentenceToMaxURL)
|
357 |
|
358 |
url_source = {}
|
|
|
363 |
if sentenceToMaxURL[sen] == url
|
364 |
]
|
365 |
url_source[url] = sum(s) / len(s)
|
|
|
366 |
index_descending = sorted(url_source, key=url_source.get, reverse=True)
|
|
|
367 |
urlMap = {}
|
368 |
for count, i in enumerate(index_descending):
|
369 |
urlMap[i] = count + 1
|
370 |
|
371 |
+
# build results
|
372 |
for i, sent in enumerate(sentences):
|
373 |
ind = sentenceToMaxURL[i]
|
374 |
if url_source[ind] > 0.1:
|
|
|
376 |
[sent, url_source[ind], url_list[ind], urlMap[ind]]
|
377 |
)
|
378 |
else:
|
379 |
+
sentence_scores.append([sent, None, url_list[ind], -1])
|
380 |
for ind in index_descending:
|
381 |
+
if url_source[ind] > 0.1:
|
382 |
+
url_scores.append(
|
383 |
+
[url_list[ind], round(url_source[ind] * 100, 2), urlMap[ind]]
|
384 |
+
)
|
385 |
|
386 |
return sentence_scores, url_scores
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|