minko186 committed on
Commit
8e79582
1 Parent(s): d03ef17

speed up plagiarism

Browse files
Files changed (1) hide show
  1. plagiarism.py +24 -19
plagiarism.py CHANGED
@@ -33,8 +33,13 @@ months = {
33
 
34
  color_map = [
35
  "#cf2323",
 
 
 
36
  "#eb9d59",
37
  "#c2ad36",
 
 
38
  "#e1ed72",
39
  "#c2db76",
40
  "#a2db76",
@@ -114,25 +119,25 @@ def matching_score(sentence_content_tuple):
114
  sentence, content, score = sentence_content_tuple
115
  if sentence in content:
116
  return 1
117
- # if score > 0.9:
118
- # return score
119
  else:
120
  n = 5
121
 
122
- ngrams = split_ngrams(sentence, n)
123
- if len(ngrams) == 0:
124
- return 0
125
- matched = [x for x in ngrams if " ".join(x) in content]
126
- return len(matched) / len(ngrams)
127
-
128
- # ngrams_sentence = split_ngrams(sentence, n)
129
- # if len(ngrams_sentence) == 0:
130
  # return 0
131
- # ngrams_content = set(tuple(ngram) for ngram in split_ngrams(content, n))
132
- # matched_count = sum(
133
- # 1 for ngram in ngrams_sentence if tuple(ngram) in ngrams_content
134
- # )
135
- # return matched_count / len(ngrams_sentence)
 
 
 
 
 
 
136
 
137
 
138
  def process_with_multiprocessing(input_data):
@@ -174,6 +179,7 @@ def google_search(
174
  **kwargs,
175
  ):
176
  service = build("customsearch", "v1", developerKey=api_key)
 
177
  for i, sentence in enumerate(sentences):
178
  results = (
179
  service.cse()
@@ -182,8 +188,7 @@ def google_search(
182
  )
183
  if "items" in results and len(results["items"]) > 0:
184
  for count, link in enumerate(results["items"]):
185
- # stop after 3 pages
186
- if count >= 3:
187
  break
188
  # skip user selected domains
189
  if (domains_to_skip is not None) and any(
@@ -228,11 +233,11 @@ def plagiarism_check(
228
  domains_to_skip,
229
  source_block_size,
230
  ):
231
- api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
232
  # api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
233
  # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
234
  # api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
235
- # api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
236
  # api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
237
  # api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
238
  cse_id = "851813e81162b4ed4"
 
33
 
34
  color_map = [
35
  "#cf2323",
36
+ "#d65129",
37
+ "#d66329",
38
+ "#d67129",
39
  "#eb9d59",
40
  "#c2ad36",
41
+ "#d6ae29",
42
+ "#d6b929",
43
  "#e1ed72",
44
  "#c2db76",
45
  "#a2db76",
 
119
  sentence, content, score = sentence_content_tuple
120
  if sentence in content:
121
  return 1
122
+ if score > 0.9:
123
+ return score
124
  else:
125
  n = 5
126
 
127
+ # ngrams = split_ngrams(sentence, n)
128
+ # if len(ngrams) == 0:
 
 
 
 
 
 
129
  # return 0
130
+ # matched = [x for x in ngrams if " ".join(x) in content]
131
+ # return len(matched) / len(ngrams)
132
+
133
+ ngrams_sentence = split_ngrams(sentence, n)
134
+ if len(ngrams_sentence) == 0:
135
+ return 0
136
+ ngrams_content = set(tuple(ngram) for ngram in split_ngrams(content, n))
137
+ matched_count = sum(
138
+ 1 for ngram in ngrams_sentence if tuple(ngram) in ngrams_content
139
+ )
140
+ return matched_count / len(ngrams_sentence)
141
 
142
 
143
  def process_with_multiprocessing(input_data):
 
179
  **kwargs,
180
  ):
181
  service = build("customsearch", "v1", developerKey=api_key)
182
+ num_pages = 3
183
  for i, sentence in enumerate(sentences):
184
  results = (
185
  service.cse()
 
188
  )
189
  if "items" in results and len(results["items"]) > 0:
190
  for count, link in enumerate(results["items"]):
191
+ if count >= num_pages:
 
192
  break
193
  # skip user selected domains
194
  if (domains_to_skip is not None) and any(
 
233
  domains_to_skip,
234
  source_block_size,
235
  ):
236
+ # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
237
  # api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
238
  # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
239
  # api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
240
+ api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
241
  # api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
242
  # api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
243
  cse_id = "851813e81162b4ed4"