Spaces:
Running
Running
Update plagiarism speed
Browse files
- plagiarism.py +19 -4
plagiarism.py
CHANGED
@@ -111,17 +111,29 @@ async def parallel_scrap(urls):
|
|
111 |
|
112 |
|
113 |
def matching_score(sentence_content_tuple):
    """Score how much of a sentence appears verbatim in a page's text.

    The two inputs arrive packed in a single tuple.

    Args:
        sentence_content_tuple: ``(sentence, content)`` pair; both are
            used with the ``in`` substring operator, so they are expected
            to be strings.

    Returns:
        ``1`` when the whole sentence is a substring of ``content``;
        ``0`` when ``split_ngrams`` yields no 5-grams for the sentence;
        otherwise the fraction (a float in [0, 1]) of the sentence's
        5-grams whose space-joined text occurs in ``content``.
    """
    sentence, content = sentence_content_tuple

    # Exact containment is the strongest possible match; skip n-gram work.
    if sentence in content:
        return 1

    n = 5
    # NOTE(review): split_ngrams is defined elsewhere in the file; it is
    # assumed to return a sized collection of token sequences -- confirm.
    ngrams = split_ngrams(sentence, n)
    total = len(ngrams)
    if total == 0:
        # No n-grams could be formed, so there is nothing to match.
        return 0

    # Count matches directly instead of materialising a list just to len() it.
    matched = sum(1 for gram in ngrams if " ".join(gram) in content)
    return matched / total
|
124 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
125 |
|
126 |
def process_with_multiprocessing(input_data):
|
127 |
with Pool(processes=4) as pool:
|
@@ -216,12 +228,12 @@ def plagiarism_check(
|
|
216 |
domains_to_skip,
|
217 |
source_block_size,
|
218 |
):
|
219 |
-
|
220 |
# api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
|
221 |
# api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
|
222 |
# api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
|
223 |
# api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
|
224 |
-
api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
|
225 |
# api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
|
226 |
cse_id = "851813e81162b4ed4"
|
227 |
|
@@ -254,7 +266,7 @@ def plagiarism_check(
|
|
254 |
if soup:
|
255 |
page_content = soup.text
|
256 |
for j, sent in enumerate(sentences):
|
257 |
-
input_data.append((sent, page_content))
|
258 |
scores = process_with_multiprocessing(input_data)
|
259 |
|
260 |
k = 0
|
@@ -311,6 +323,7 @@ def html_highlight(
|
|
311 |
domains_to_skip,
|
312 |
source_block_size,
|
313 |
):
|
|
|
314 |
sentence_scores, url_scores = plagiarism_check(
|
315 |
plag_option,
|
316 |
input,
|
@@ -351,4 +364,6 @@ def html_highlight(
|
|
351 |
|
352 |
html_content += "</div>"
|
353 |
|
|
|
|
|
354 |
return html_content
|
|
|
111 |
|
112 |
|
113 |
def matching_score(sentence_content_tuple):
    """Score how much of a sentence appears verbatim in a page's text.

    The inputs arrive packed in a single tuple.

    Args:
        sentence_content_tuple: ``(sentence, content, score)`` triple.
            ``sentence`` and ``content`` are used with the ``in``
            substring operator, so they are expected to be strings.
            ``score`` is accepted but currently not used by this function.

    Returns:
        ``1`` when the whole sentence is a substring of ``content``;
        ``0`` when ``split_ngrams`` yields no 5-grams for the sentence;
        otherwise the fraction (a float in [0, 1]) of the sentence's
        5-grams whose space-joined text occurs in ``content``.
    """
    # The third element is unpacked only to keep the tuple shape the
    # caller passes; it does not influence the result.
    sentence, content, _score = sentence_content_tuple

    # Exact containment is the strongest possible match; skip n-gram work.
    if sentence in content:
        return 1

    n = 5
    # NOTE(review): split_ngrams is defined elsewhere in the file; it is
    # assumed to return a sized collection of token sequences -- confirm.
    ngrams = split_ngrams(sentence, n)
    total = len(ngrams)
    if total == 0:
        # No n-grams could be formed, so there is nothing to match.
        return 0

    # Count matches directly instead of materialising a list just to len() it.
    matched = sum(1 for gram in ngrams if " ".join(gram) in content)
    return matched / total
|
127 |
|
128 |
+
# ngrams_sentence = split_ngrams(sentence, n)
|
129 |
+
# if len(ngrams_sentence) == 0:
|
130 |
+
# return 0
|
131 |
+
# ngrams_content = set(tuple(ngram) for ngram in split_ngrams(content, n))
|
132 |
+
# matched_count = sum(
|
133 |
+
# 1 for ngram in ngrams_sentence if tuple(ngram) in ngrams_content
|
134 |
+
# )
|
135 |
+
# return matched_count / len(ngrams_sentence)
|
136 |
+
|
137 |
|
138 |
def process_with_multiprocessing(input_data):
|
139 |
with Pool(processes=4) as pool:
|
|
|
228 |
domains_to_skip,
|
229 |
source_block_size,
|
230 |
):
|
231 |
+
api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
|
232 |
# api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
|
233 |
# api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
|
234 |
# api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
|
235 |
# api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
|
236 |
+
# api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
|
237 |
# api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
|
238 |
cse_id = "851813e81162b4ed4"
|
239 |
|
|
|
266 |
if soup:
|
267 |
page_content = soup.text
|
268 |
for j, sent in enumerate(sentences):
|
269 |
+
input_data.append((sent, page_content, score_array[i][j]))
|
270 |
scores = process_with_multiprocessing(input_data)
|
271 |
|
272 |
k = 0
|
|
|
323 |
domains_to_skip,
|
324 |
source_block_size,
|
325 |
):
|
326 |
+
start_time = time.perf_counter()
|
327 |
sentence_scores, url_scores = plagiarism_check(
|
328 |
plag_option,
|
329 |
input,
|
|
|
364 |
|
365 |
html_content += "</div>"
|
366 |
|
367 |
+
print("PLAGIARISM PROCESSING TIME: ", time.perf_counter() - start_time)
|
368 |
+
|
369 |
return html_content
|