Spaces:
Running
Running
speed up plagiarism
Browse files
- plagiarism.py +24 -19
plagiarism.py
CHANGED
@@ -33,8 +33,13 @@ months = {
|
|
33 |
|
34 |
color_map = [
|
35 |
"#cf2323",
|
|
|
|
|
|
|
36 |
"#eb9d59",
|
37 |
"#c2ad36",
|
|
|
|
|
38 |
"#e1ed72",
|
39 |
"#c2db76",
|
40 |
"#a2db76",
|
@@ -114,25 +119,25 @@ def matching_score(sentence_content_tuple):
|
|
114 |
sentence, content, score = sentence_content_tuple
|
115 |
if sentence in content:
|
116 |
return 1
|
117 |
-
|
118 |
-
|
119 |
else:
|
120 |
n = 5
|
121 |
|
122 |
-
ngrams = split_ngrams(sentence, n)
|
123 |
-
if len(ngrams) == 0:
|
124 |
-
return 0
|
125 |
-
matched = [x for x in ngrams if " ".join(x) in content]
|
126 |
-
return len(matched) / len(ngrams)
|
127 |
-
|
128 |
-
# ngrams_sentence = split_ngrams(sentence, n)
|
129 |
-
# if len(ngrams_sentence) == 0:
|
130 |
# return 0
|
131 |
-
#
|
132 |
-
#
|
133 |
-
|
134 |
-
|
135 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
136 |
|
137 |
|
138 |
def process_with_multiprocessing(input_data):
|
@@ -174,6 +179,7 @@ def google_search(
|
|
174 |
**kwargs,
|
175 |
):
|
176 |
service = build("customsearch", "v1", developerKey=api_key)
|
|
|
177 |
for i, sentence in enumerate(sentences):
|
178 |
results = (
|
179 |
service.cse()
|
@@ -182,8 +188,7 @@ def google_search(
|
|
182 |
)
|
183 |
if "items" in results and len(results["items"]) > 0:
|
184 |
for count, link in enumerate(results["items"]):
|
185 |
-
|
186 |
-
if count >= 3:
|
187 |
break
|
188 |
# skip user selected domains
|
189 |
if (domains_to_skip is not None) and any(
|
@@ -228,11 +233,11 @@ def plagiarism_check(
|
|
228 |
domains_to_skip,
|
229 |
source_block_size,
|
230 |
):
|
231 |
-
api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
|
232 |
# api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
|
233 |
# api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
|
234 |
# api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
|
235 |
-
|
236 |
# api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
|
237 |
# api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
|
238 |
cse_id = "851813e81162b4ed4"
|
|
|
33 |
|
34 |
color_map = [
|
35 |
"#cf2323",
|
36 |
+
"#d65129",
|
37 |
+
"#d66329",
|
38 |
+
"#d67129",
|
39 |
"#eb9d59",
|
40 |
"#c2ad36",
|
41 |
+
"#d6ae29",
|
42 |
+
"#d6b929",
|
43 |
"#e1ed72",
|
44 |
"#c2db76",
|
45 |
"#a2db76",
|
|
|
119 |
sentence, content, score = sentence_content_tuple
|
120 |
if sentence in content:
|
121 |
return 1
|
122 |
+
if score > 0.9:
|
123 |
+
return score
|
124 |
else:
|
125 |
n = 5
|
126 |
|
127 |
+
# ngrams = split_ngrams(sentence, n)
|
128 |
+
# if len(ngrams) == 0:
|
|
|
|
|
|
|
|
|
|
|
|
|
129 |
# return 0
|
130 |
+
# matched = [x for x in ngrams if " ".join(x) in content]
|
131 |
+
# return len(matched) / len(ngrams)
|
132 |
+
|
133 |
+
ngrams_sentence = split_ngrams(sentence, n)
|
134 |
+
if len(ngrams_sentence) == 0:
|
135 |
+
return 0
|
136 |
+
ngrams_content = set(tuple(ngram) for ngram in split_ngrams(content, n))
|
137 |
+
matched_count = sum(
|
138 |
+
1 for ngram in ngrams_sentence if tuple(ngram) in ngrams_content
|
139 |
+
)
|
140 |
+
return matched_count / len(ngrams_sentence)
|
141 |
|
142 |
|
143 |
def process_with_multiprocessing(input_data):
|
|
|
179 |
**kwargs,
|
180 |
):
|
181 |
service = build("customsearch", "v1", developerKey=api_key)
|
182 |
+
num_pages = 3
|
183 |
for i, sentence in enumerate(sentences):
|
184 |
results = (
|
185 |
service.cse()
|
|
|
188 |
)
|
189 |
if "items" in results and len(results["items"]) > 0:
|
190 |
for count, link in enumerate(results["items"]):
|
191 |
+
if count >= num_pages:
|
|
|
192 |
break
|
193 |
# skip user selected domains
|
194 |
if (domains_to_skip is not None) and any(
|
|
|
233 |
domains_to_skip,
|
234 |
source_block_size,
|
235 |
):
|
236 |
+
# api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
|
237 |
# api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
|
238 |
# api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
|
239 |
# api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
|
240 |
+
api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
|
241 |
# api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
|
242 |
# api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
|
243 |
cse_id = "851813e81162b4ed4"
|