Spaces: Running

aliasgerovs committed · 7f62749
Parent: 93a5ec6

updated

plagiarism.py CHANGED (+26 -9)
@@ -155,10 +155,25 @@ def matching_score(sentence_content_tuple):
     # matched = [x for x in ngrams if " ".join(x) in content]
     # return len(matched) / len(ngrams)
 
-    ngrams_sentence = split_ngrams(sentence, n)
+    # list comprehension matching
+    # ngrams_sentence = split_ngrams(sentence, n)
+    # ngrams_content = [tuple(ngram) for ngram in split_ngrams(content, n)]
+    # if len(ngrams_sentence) == 0:
+    #     return 0, ""
+    # matched_ngrams = [
+    #     1 for ngram in ngrams_sentence if tuple(ngram) in ngrams_content
+    # ]
+    # matched_count = sum(matched_ngrams)
+
+    # set intersection matching
+    ngrams_sentence = set(split_ngrams(sentence, n))
+    ngrams_content = set(split_ngrams(content, n))
     if len(ngrams_sentence) == 0:
         return 0, ""
-    ngrams_content = [tuple(ngram) for ngram in split_ngrams(content, n)]
+    matched_ngrams = ngrams_sentence.intersection(ngrams_content)
+    matched_count = len(matched_ngrams)
+
+    # matched content
     matched_content_ngrams = []
     found = False
     last_found = None
@@ -174,10 +189,6 @@ def matching_score(sentence_content_tuple):
     )
     matched_content = merge_ngrams_into_sentence(matched_content_ngrams)
 
-    matched_ngrams = [
-        1 for ngram in ngrams_sentence if tuple(ngram) in ngrams_content
-    ]
-    matched_count = sum(matched_ngrams)
     return matched_count / len(ngrams_sentence), matched_content
 
 
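The change above swaps a quadratic matching step for set intersection: the old list comprehension tested tuple(ngram) in ngrams_content against a list, an O(len(content n-grams)) scan per sentence n-gram, while set.intersection does one hash lookup per element. One caution: set() needs hashable items, and the old code's tuple(ngram) calls suggest split_ngrams may yield lists, in which case set(split_ngrams(...)) would raise TypeError unless the helper now returns tuples. A minimal standalone sketch of the new scoring path, with a stand-in split_ngrams that returns word tuples (the repo's own helper may differ):

# Sketch of the set-intersection score; split_ngrams here is a stand-in
# for the helper in plagiarism.py and assumes whitespace word n-grams.
def split_ngrams(text, n):
    words = text.split()
    return [tuple(words[i : i + n]) for i in range(len(words) - n + 1)]

def overlap_score(sentence, content, n=3):
    ngrams_sentence = set(split_ngrams(sentence, n))
    ngrams_content = set(split_ngrams(content, n))
    if len(ngrams_sentence) == 0:
        return 0.0
    matched = ngrams_sentence.intersection(ngrams_content)
    return len(matched) / len(ngrams_sentence)

print(overlap_score("the cat sat on the mat", "a cat sat on the mat"))  # 0.75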
@@ -229,7 +240,7 @@ def google_search(
     **kwargs,
 ):
     service = build("customsearch", "v1", developerKey=api_key)
-    num_pages =
+    num_pages = 1
     for i, sentence in enumerate(sentences):
         results = (
             service.cse()
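num_pages = 1 completes the assignment that was left dangling in the previous revision (its old right-hand side did not survive extraction). For reference, if more than one page were ever requested, the Custom Search API pages through results via the 1-based start parameter in steps of 10. A sketch only, where query is a placeholder and the commit itself requests a single page:

# Illustrative paging loop; query is a placeholder, and with
# num_pages = 1 this collapses to the single request the code makes.
for page in range(num_pages):
    results = (
        service.cse()
        .list(q=query, cx=cse_id, start=1 + 10 * page)
        .execute()
    )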
@@ -287,10 +298,10 @@ def plagiarism_check(
 ):
     # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
     # api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
-
+    api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
     # api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
     # api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
-    api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
+    # api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
     # api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
     cse_id = "851813e81162b4ed4"
 
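This hunk only changes which hardcoded key is active. Keys committed to a public Space are exposed in the repo history and are best rotated; a common alternative is to load them from the environment. A sketch, assuming GOOGLE_API_KEY and GOOGLE_CSE_ID as variable names (not names this repo defines):

import os

# Assumed env var names, not ones defined by this repo.
api_key = os.environ["GOOGLE_API_KEY"]  # fail fast if unset
cse_id = os.environ.get("GOOGLE_CSE_ID", "851813e81162b4ed4")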
@@ -305,6 +316,7 @@ def plagiarism_check(
     date_to = build_date(year_to, month_to, day_to)
     sort_date = f"date:r:{date_from}:{date_to}"
     # get list of URLS to check
+    start_time = time.perf_counter()
     url_count, score_array = google_search(
         plag_option,
         sentences,
@@ -317,15 +329,20 @@ def plagiarism_check(
         api_key,
         cse_id,
     )
+    print("GOOGLE SEARCH PROCESSING TIME: ", time.perf_counter() - start_time)
     # Scrape URLs in list
+    start_time = time.perf_counter()
     soups = asyncio.run(parallel_scrap(url_list))
+    print("SCRAPING PROCESSING TIME: ", time.perf_counter() - start_time)
     input_data = []
     for i, soup in enumerate(soups):
         if soup:
             page_content = soup.text
             for j, sent in enumerate(sentences):
                 input_data.append((sent, page_content, score_array[i][j]))
+    start_time = time.perf_counter()
     scores = process_with_multiprocessing(input_data)
+    print("MATCHING SCORE PROCESSING TIME: ", time.perf_counter() - start_time)
     matched_sentence_array = [
         ["" for _ in range(len(score_array[0]))]
         for _ in range(len(score_array))
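The instrumentation above repeats the same start_time / print pair for each stage (and assumes time is already imported at module level). A small context manager would factor out that pattern; a standard-library-only sketch:

import time
from contextlib import contextmanager

@contextmanager
def timed(label):
    # Print elapsed wall-clock time for the wrapped block.
    start = time.perf_counter()
    try:
        yield
    finally:
        print(f"{label} PROCESSING TIME: ", time.perf_counter() - start)

# Usage mirroring the instrumented stages:
# with timed("GOOGLE SEARCH"):
#     url_count, score_array = google_search(...)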