added option to choose size of sentence block for source detect
Files changed:
- analysis.py +27 -9
- app.py +11 -0
- isotonic_regression_model.joblib +0 -0 (binary)
- plagiarism.py +22 -14
analysis.py
CHANGED
@@ -62,7 +62,10 @@ def depth_analysis(input_text):
         "punctuation_diversity": (-0.21875, 0.53125),
         "type_token_ratio": (0.33002482852189063, 1.0894414982357028),
         "calculate_perplexity": (-25.110544681549072, 82.4620680809021),
-        "calculate_syntactic_tree_depth": (1.8380681818181812, 10.997159090909092),
+        "calculate_syntactic_tree_depth": (
+            1.8380681818181812,
+            10.997159090909092,
+        ),
         "hapax_legomena_ratio": (0.0830971690138207, 1.0302715687215778),
         "mtld": (-84.03125000000001, 248.81875000000002),
     }
@@ -72,14 +75,17 @@ def depth_analysis(input_text):
     determiner_use = determiners_frequency(input_text, nlp)
     punctuation_variety = punctuation_diversity(input_text)
     sentence_depth = calculate_syntactic_tree_depth(input_text, nlp)
-    perplexity = calculate_perplexity(input_text, gpt2_model, gpt2_tokenizer, device)
+    perplexity = calculate_perplexity(
+        input_text, gpt2_model, gpt2_tokenizer, device
+    )
     lexical_diversity = type_token_ratio(input_text)
     unique_words = hapax_legomena_ratio(input_text)
     vocabulary_stability = mtld(input_text)

     # normalize between 0 and 100
     vocabulary_level_norm = normalize(
-        vocabulary_level, *usual_ranges["estimated_slightly_difficult_words_ratio"]
+        vocabulary_level,
+        *usual_ranges["estimated_slightly_difficult_words_ratio"],
     )
     entity_ratio_norm = normalize(entity_ratio, *usual_ranges["entity_density"])
     determiner_use_norm = normalize(
@@ -91,12 +97,18 @@ def depth_analysis(input_text):
     lexical_diversity_norm = normalize(
         lexical_diversity, *usual_ranges["type_token_ratio"]
     )
-    unique_words_norm = normalize(unique_words, *usual_ranges["hapax_legomena_ratio"])
-    vocabulary_stability_norm = normalize(vocabulary_stability, *usual_ranges["mtld"])
+    unique_words_norm = normalize(
+        unique_words, *usual_ranges["hapax_legomena_ratio"]
+    )
+    vocabulary_stability_norm = normalize(
+        vocabulary_stability, *usual_ranges["mtld"]
+    )
     sentence_depth_norm = normalize(
         sentence_depth, *usual_ranges["calculate_syntactic_tree_depth"]
     )
-    perplexity_norm = normalize(perplexity, *usual_ranges["calculate_perplexity"])
+    perplexity_norm = normalize(
+        perplexity, *usual_ranges["calculate_perplexity"]
+    )

     features = {
         "Lexical Diversity": lexical_diversity_norm,
@@ -161,7 +173,8 @@ def depth_analysis(input_text):
                 path=Path.unit_regular_polygon(num_vars),
             )
             spine.set_transform(
-                Affine2D().scale(0.5).translate(0.5, 0.5) + self.transAxes
+                Affine2D().scale(0.5).translate(0.5, 0.5)
+                + self.transAxes
             )
             return {"polar": spine}

@@ -172,14 +185,19 @@ def depth_analysis(input_text):
     theta = radar_factory(N, frame="polygon")
     data = features.values()
     labels = features.keys()
-    fig, ax = plt.subplots(subplot_kw=dict(projection="radar"), figsize=(7.5, 5))
+    fig, ax = plt.subplots(
+        subplot_kw=dict(projection="radar"), figsize=(7.5, 5)
+    )
     ax.plot(theta, data)
     ax.fill(theta, data, alpha=0.4)
     ax.set_varlabels(labels)

     rgrids = np.linspace(0, 100, num=6)
     ax.set_rgrids(
-        rgrids, labels=[f"{round(r)}%" for r in rgrids], fontsize=8, color="black"
+        rgrids,
+        labels=[f"{round(r)}%" for r in rgrids],
+        fontsize=8,
+        color="black",
     )
     ax.grid(True, color="black", linestyle="-", linewidth=0.5, alpha=0.5)
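The normalize helper itself is not touched by this commit, so its definition does not appear above. As a rough sketch of what the "# normalize between 0 and 100" step implies, assuming normalize(value, low, high) clamps a raw metric to its usual range and rescales it (an assumption, not the repository's actual implementation):

# Hypothetical sketch, not part of this commit: map a raw metric onto 0-100
# using the (low, high) bounds stored in usual_ranges.
def normalize(value, low, high):
    value = min(max(value, low), high)  # clamp into the usual range
    return 100 * (value - low) / (high - low)

With the "calculate_syntactic_tree_depth" range above, a tree depth of 6.4 would map to roughly 100 * (6.4 - 1.838) / (10.997 - 1.838), about 50.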
app.py
CHANGED
@@ -46,6 +46,7 @@ def main(
     month_to,
     day_to,
     domains_to_skip,
+    source_block_size,
 ):

     # formatted_tokens = plagiarism_check(
@@ -69,6 +70,7 @@ def main(
         month_to,
         day_to,
         domains_to_skip,
+        source_block_size,
     )
     depth_analysis_plot = depth_analysis(input)
     bc_score = predict_bc_scores(input)
@@ -146,6 +148,13 @@ with gr.Blocks() as demo:
             plag_option = gr.Radio(
                 ["Standard", "Advanced"], label="Choose an option please."
             )
+            with gr.Row():
+                source_block_size = gr.Dropdown(
+                    choices=["1", "2", "3", "Paragraph"],
+                    label="Source Check Granularity",
+                    value="2",
+                    interactive=True,
+                )

         with gr.Row():
             with gr.Column():
@@ -300,6 +309,7 @@ with gr.Blocks() as demo:
                 month_to,
                 day_to,
                 domains_to_skip,
+                source_block_size,
             ],
             outputs=[
                 bcLabel,
@@ -340,6 +350,7 @@ with gr.Blocks() as demo:
                 month_to,
                 day_to,
                 domains_to_skip,
+                source_block_size,
             ],
             outputs=[
                 sentenceBreakdown,
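Because the new gr.Dropdown choices are the strings "1", "2", "3", and "Paragraph", the value handed to plagiarism_check arrives as a string, which is why split_sentence_blocks compares it against "Paragraph" and otherwise calls int(size). A minimal, self-contained sketch of the same wiring (component names here are illustrative, not the app's full layout):

import gradio as gr

# Illustrative only: shows the dropdown value reaching the handler
# as one extra input, the same way source_block_size is wired above.
def check(input_text, source_block_size):
    return f"block size = {source_block_size!r}"  # arrives as a string

with gr.Blocks() as demo:
    text = gr.Textbox(label="Text to analyze")
    source_block_size = gr.Dropdown(
        choices=["1", "2", "3", "Paragraph"],
        label="Source Check Granularity",
        value="2",
        interactive=True,
    )
    out = gr.Textbox(label="Result")
    gr.Button("Check").click(check, inputs=[text, source_block_size], outputs=out)

# demo.launch()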
isotonic_regression_model.joblib
CHANGED
Binary files a/isotonic_regression_model.joblib and b/isotonic_regression_model.joblib differ
plagiarism.py
CHANGED
@@ -66,16 +66,21 @@ def get_cosine(vec1, vec2):
     return float(numerator) / denominator


-def split_sentence_blocks(text):
-    ...
+def split_sentence_blocks(text, size):
+    if size == "Paragraph":
+        blocks = text.split("\n")
+        return blocks
+    else:
+        blocks = []
+        size = int(size)
+        for para in text.split("\n\n"):
+            sents = sent_tokenize(para)
+            for i in range(len(sents)):
+                if (i % size) == 0:
+                    blocks.append(sents[i])
+                else:
+                    blocks[int(i / size)] += " " + sents[i]
+        return blocks


 def build_date(year=2024, month="March", day=1):
@@ -177,7 +182,7 @@ def google_search(
                 if count >= 3:
                     break
                 # skip user selected domains
-                if any(
+                if (domains_to_skip is not None) and any(
                     ("." + domain) in link["link"] for domain in domains_to_skip
                 ):
                     continue
@@ -217,6 +222,7 @@ def plagiarism_check(
     month_to,
     day_to,
     domains_to_skip,
+    source_block_size,
 ):
     # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
     # api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
@@ -227,7 +233,7 @@ def plagiarism_check(

     url_scores = []
     sentence_scores = []
-    sentences = split_sentence_blocks(input)
+    sentences = split_sentence_blocks(input, source_block_size)
     url_count = {}
     score_array = []
     url_list = []
@@ -308,6 +314,7 @@ def html_highlight(
     month_to,
     day_to,
     domains_to_skip,
+    source_block_size,
 ):
     sentence_scores, url_scores = plagiarism_check(
         plag_option,
@@ -319,6 +326,7 @@ def html_highlight(
         month_to,
         day_to,
         domains_to_skip,
+        source_block_size,
     )

     html_content = "<link href='https://fonts.googleapis.com/css?family=Roboto' rel='stylesheet'>\n<div style='font-family: {font}; border: 2px solid black; background-color: #333333; padding: 10px; color: #FFFFFF;'>"
@@ -347,5 +355,5 @@ def html_highlight(
     html_content += formatted_url

     html_content += "</div>"
-
-    return html_content
+
+    return html_content
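A note on the new split_sentence_blocks: for numeric sizes it merges every `size` consecutive sentences of a paragraph into one block. One caveat worth flagging is that blocks[int(i / size)] indexes from the start of the whole blocks list, so once a second paragraph is processed its trailing sentences can end up merged into blocks created for an earlier paragraph. A corrected sketch of the presumably intended behavior (an assumption, not the committed code) appends to the most recent block instead:

from nltk.tokenize import sent_tokenize  # assumed to be the tokenizer plagiarism.py uses

# Sketch: every `size` sentences form one block, restarting at each paragraph.
def split_sentence_blocks_sketch(text, size):
    if size == "Paragraph":
        return text.split("\n")
    size = int(size)
    blocks = []
    for para in text.split("\n\n"):
        for i, sent in enumerate(sent_tokenize(para)):
            if i % size == 0:
                blocks.append(sent)        # start a new block
            else:
                blocks[-1] += " " + sent   # extend the current block
    return blocks

# Example: size="2" turns a three-sentence paragraph into two blocks,
# ["First. Second.", "Third."]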