Update app.py
Browse files
app.py
CHANGED
@@ -298,30 +298,69 @@ def check_language_issues_and_regex(markdown_text_from_pdf: str) -> Dict[str, An
|
|
298 |
|
299 |
def check_figure_order(plain_text: str) -> Dict[str, Any]:
|
300 |
figure_pattern = r'(?:Fig(?:ure)?\.?|Figure)\s*(\d+)'
|
301 |
-
|
302 |
-
|
303 |
-
|
304 |
-
|
|
|
|
|
|
|
305 |
if num_str.isdigit():
|
306 |
valid_figure_numbers_int.append(int(num_str))
|
307 |
-
|
308 |
-
|
309 |
-
|
310 |
|
311 |
-
|
312 |
-
|
313 |
-
|
314 |
-
|
315 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
316 |
counts = Counter(valid_figure_numbers_int)
|
317 |
-
|
318 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
319 |
return {
|
320 |
-
"sequential_order_of_unique_figures": is_sequential,
|
321 |
"figure_count_unique": len(unique_sorted_figures),
|
322 |
-
"missing_figures_in_sequence_to_max": missing_figures,
|
323 |
-
"figure_order_as_encountered": valid_figure_numbers_int,
|
324 |
-
"duplicate_references_to_same_figure_number": duplicate_refs
|
|
|
325 |
}
|
326 |
|
327 |
def check_reference_order(plain_text: str) -> Dict[str, Any]:
|
|
|
298 |
|
299 |
def check_figure_order(plain_text: str) -> Dict[str, Any]:
|
300 |
figure_pattern = r'(?:Fig(?:ure)?\.?|Figure)\s*(\d+)'
|
301 |
+
# Find all matches; re.IGNORECASE ensures "figure", "Figure", "FIGURE" are caught
|
302 |
+
figure_references_raw = re.findall(figure_pattern, plain_text, re.IGNORECASE)
|
303 |
+
|
304 |
+
# Convert captured numbers (group 1 of the regex) to integers
|
305 |
+
# Only include the captured string if it consists entirely of digits.
|
306 |
+
valid_figure_numbers_int: List[int] = []
|
307 |
+
for num_str in figure_references_raw:
|
308 |
if num_str.isdigit():
|
309 |
valid_figure_numbers_int.append(int(num_str))
|
310 |
+
# else:
|
311 |
+
# Optional: log or handle non-digit captures if the regex could allow them
|
312 |
+
# print(f"Warning: Figure regex captured non-digit '{num_str}'")
|
313 |
|
314 |
+
if not valid_figure_numbers_int:
|
315 |
+
# No valid figure references found in the text
|
316 |
+
return {
|
317 |
+
"sequential_order_of_unique_figures": True, # Vacuously true as no figures to be out of order
|
318 |
+
"figure_count_unique": 0,
|
319 |
+
"missing_figures_in_sequence_to_max": [],
|
320 |
+
"figure_order_as_encountered": [],
|
321 |
+
"duplicate_references_to_same_figure_number": [],
|
322 |
+
"figures_mentioned_only_once": [] # New: No figures, so none are mentioned only once
|
323 |
+
}
|
324 |
+
|
325 |
+
# Get unique figure numbers, sorted
|
326 |
+
unique_sorted_figures: List[int] = sorted(list(set(valid_figure_numbers_int)))
|
327 |
+
|
328 |
+
# Check 1: Are the unique, sorted figures consecutive?
|
329 |
+
# e.g., [1, 2, 3] is sequential. [1, 3] is not. [2, 3, 4] is sequential by this definition.
|
330 |
+
is_sequential = True # Assume true initially
|
331 |
+
if len(unique_sorted_figures) > 1: # Only check if there's more than one unique figure
|
332 |
+
is_sequential = all(unique_sorted_figures[i] + 1 == unique_sorted_figures[i+1]
|
333 |
+
for i in range(len(unique_sorted_figures) - 1))
|
334 |
+
|
335 |
+
# Check 2: Missing figures in the sequence from 1 up to the highest figure number mentioned.
|
336 |
+
# This assumes figures should ideally start from 1 and be continuous up to the max.
|
337 |
+
missing_figures: List[int] = []
|
338 |
+
# max_found_figure will not error as unique_sorted_figures is non-empty at this point
|
339 |
+
max_found_figure = unique_sorted_figures[-1] # Since it's sorted and non-empty
|
340 |
+
expected_figures_up_to_max = set(range(1, max_found_figure + 1))
|
341 |
+
actual_figures_found_set = set(unique_sorted_figures)
|
342 |
+
missing_figures = sorted(list(expected_figures_up_to_max - actual_figures_found_set))
|
343 |
+
|
344 |
+
# Check 3: Count occurrences of each figure reference for duplicates and single mentions
|
345 |
counts = Counter(valid_figure_numbers_int)
|
346 |
+
|
347 |
+
# Figures mentioned more than once (duplicates in terms of referencing the same figure number)
|
348 |
+
duplicate_refs: List[int] = sorted([num for num, count in counts.items() if count > 1])
|
349 |
+
|
350 |
+
# New Check: Figures mentioned exactly once.
|
351 |
+
# The requirement is that each figure should be mentioned more than once.
|
352 |
+
# So, if a figure's mention count is 1, it fails this condition.
|
353 |
+
figures_mentioned_only_once: List[int] = sorted([
|
354 |
+
num for num, count in counts.items() if count == 1
|
355 |
+
])
|
356 |
+
|
357 |
return {
|
358 |
+
"sequential_order_of_unique_figures": is_sequential,
|
359 |
"figure_count_unique": len(unique_sorted_figures),
|
360 |
+
"missing_figures_in_sequence_to_max": missing_figures,
|
361 |
+
"figure_order_as_encountered": valid_figure_numbers_int, # Original list of all found figure numbers in order of appearance
|
362 |
+
"duplicate_references_to_same_figure_number": duplicate_refs,
|
363 |
+
"figures_mentioned_only_once": figures_mentioned_only_once # NEWLY ADDED
|
364 |
}
|
365 |
|
366 |
def check_reference_order(plain_text: str) -> Dict[str, Any]:
|