aliasgerovs committed on
Commit 7f62749
1 Parent(s): 93a5ec6
Files changed (1)
  1. plagiarism.py +26 -9
plagiarism.py CHANGED
@@ -155,10 +155,25 @@ def matching_score(sentence_content_tuple):
     # matched = [x for x in ngrams if " ".join(x) in content]
     # return len(matched) / len(ngrams)
 
-    ngrams_sentence = split_ngrams(sentence, n)
+    # list comprehension matching
+    # ngrams_sentence = split_ngrams(sentence, n)
+    # ngrams_content = [tuple(ngram) for ngram in split_ngrams(content, n)]
+    # if len(ngrams_sentence) == 0:
+    #     return 0, ""
+    # matched_ngrams = [
+    #     1 for ngram in ngrams_sentence if tuple(ngram) in ngrams_content
+    # ]
+    # matched_count = sum(matched_ngrams)
+
+    # set intersection matching
+    ngrams_sentence = set(split_ngrams(sentence, n))
+    ngrams_content = set(split_ngrams(content, n))
     if len(ngrams_sentence) == 0:
         return 0, ""
-    ngrams_content = [tuple(ngram) for ngram in split_ngrams(content, n)]
+    matched_ngrams = ngrams_sentence.intersection(ngrams_content)
+    matched_count = len(matched_ngrams)
+
+    # matched content
     matched_content_ngrams = []
     found = False
     last_found = None
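The old list comprehension counted every sentence n-gram found in the content, repeats included; the set intersection counts each distinct n-gram once and turns the membership tests into hash lookups. A minimal, self-contained sketch of the new scoring; split_ngrams is not shown in this diff, so the stand-in below (whitespace tokens grouped into hashable tuples, which set() requires) is an assumption:

def split_ngrams(text, n):
    # Hypothetical stand-in: whitespace tokens grouped into stride-1 n-grams.
    tokens = text.split()
    return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

def ngram_overlap(sentence, content, n=3):
    ngrams_sentence = set(split_ngrams(sentence, n))
    ngrams_content = set(split_ngrams(content, n))
    if len(ngrams_sentence) == 0:
        return 0
    # Duplicate n-grams in the sentence now count once, so scores can
    # differ slightly from the old list-comprehension version.
    matched = ngrams_sentence.intersection(ngrams_content)
    return len(matched) / len(ngrams_sentence)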
@@ -174,10 +189,6 @@ def matching_score(sentence_content_tuple):
     )
     matched_content = merge_ngrams_into_sentence(matched_content_ngrams)
 
-    matched_ngrams = [
-        1 for ngram in ngrams_sentence if tuple(ngram) in ngrams_content
-    ]
-    matched_count = sum(matched_ngrams)
     return matched_count / len(ngrams_sentence), matched_content
 
 
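merge_ngrams_into_sentence is called on both sides of this hunk but its body is not part of the commit; purely as an illustration, a hypothetical version assuming it receives consecutive, stride-1 n-grams in document order:

def merge_ngrams_into_sentence(ngrams):
    # Hypothetical: keep the whole first n-gram, then append only the final
    # token of each subsequent overlapping n-gram.
    if not ngrams:
        return ""
    words = list(ngrams[0])
    for ngram in ngrams[1:]:
        words.append(ngram[-1])
    return " ".join(words)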
@@ -229,7 +240,7 @@ def google_search(
     **kwargs,
 ):
     service = build("customsearch", "v1", developerKey=api_key)
-    num_pages = 3
+    num_pages = 1
    for i, sentence in enumerate(sentences):
        results = (
            service.cse()
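num_pages drops from 3 to 1, so each sentence now costs one Custom Search request instead of three. The loop body is truncated in this diff; a sketch of how num_pages typically drives paging with the Custom Search JSON API, where each page returns at most 10 items and the start parameter is 1-based:

from googleapiclient.discovery import build

def search_urls(query, api_key, cse_id, num_pages=1):
    service = build("customsearch", "v1", developerKey=api_key)
    urls = []
    for page in range(num_pages):
        results = (
            service.cse()
            .list(q=query, cx=cse_id, start=page * 10 + 1)
            .execute()
        )
        # Pages past the last result simply come back without "items".
        urls.extend(item["link"] for item in results.get("items", []))
    return urls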
@@ -287,10 +298,10 @@ def plagiarism_check(
 ):
     # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
     # api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
-    # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
+    api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
     # api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
     # api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
-    api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
+    # api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
     # api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
     cse_id = "851813e81162b4ed4"
 
@@ -305,6 +316,7 @@ def plagiarism_check(
     date_to = build_date(year_to, month_to, day_to)
     sort_date = f"date:r:{date_from}:{date_to}"
     # get list of URLS to check
+    start_time = time.perf_counter()
     url_count, score_array = google_search(
         plag_option,
         sentences,
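The start_time/print pairs added in this and the next hunk all follow the same perf_counter pattern; an equivalent formulation (not part of the commit) wraps the bookkeeping in a context manager:

import time
from contextlib import contextmanager

@contextmanager
def timed(label):
    # Prints the same wall-clock measurement as the inline pairs below.
    start = time.perf_counter()
    try:
        yield
    finally:
        print(f"{label} PROCESSING TIME: ", time.perf_counter() - start)

Each stage then becomes a one-liner, e.g. with timed("GOOGLE SEARCH"): followed by the call, and the timer still fires if the stage raises.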
@@ -317,15 +329,20 @@ def plagiarism_check(
         api_key,
         cse_id,
     )
+    print("GOOGLE SEARCH PROCESSING TIME: ", time.perf_counter() - start_time)
     # Scrape URLs in list
+    start_time = time.perf_counter()
     soups = asyncio.run(parallel_scrap(url_list))
+    print("SCRAPING PROCESSING TIME: ", time.perf_counter() - start_time)
     input_data = []
     for i, soup in enumerate(soups):
         if soup:
             page_content = soup.text
             for j, sent in enumerate(sentences):
                 input_data.append((sent, page_content, score_array[i][j]))
+    start_time = time.perf_counter()
     scores = process_with_multiprocessing(input_data)
+    print("MATCHING SCORE PROCESSING TIME: ", time.perf_counter() - start_time)
     matched_sentence_array = [
         ["" for _ in range(len(score_array[0]))]
         for _ in range(len(score_array))
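process_with_multiprocessing is timed here but not defined in the diff. Given that matching_score takes a single (sentence, page_content, score) tuple, a plausible sketch is a plain Pool.map; this is an assumption about the helper's shape, not its actual body:

from multiprocessing import Pool

def process_with_multiprocessing(input_data):
    # Fan the (sentence, page_content, score) tuples out to one worker
    # process per CPU core; each worker runs matching_score on one tuple.
    with Pool() as pool:
        return pool.map(matching_score, input_data)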