samyak152002 committed on
Commit
b6f0b25
·
verified ·
1 Parent(s): 1fb3760

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +57 -18
app.py CHANGED
@@ -298,30 +298,69 @@ def check_language_issues_and_regex(markdown_text_from_pdf: str) -> Dict[str, An
298
 
299
  def check_figure_order(plain_text: str) -> Dict[str, Any]:
300
  figure_pattern = r'(?:Fig(?:ure)?\.?|Figure)\s*(\d+)'
301
- figure_references_str = re.findall(figure_pattern, plain_text, re.IGNORECASE)
302
-
303
- valid_figure_numbers_int = []
304
- for num_str in figure_references_str:
 
 
 
305
  if num_str.isdigit():
306
  valid_figure_numbers_int.append(int(num_str))
307
-
308
- unique_sorted_figures = sorted(list(set(valid_figure_numbers_int)))
309
- is_sequential = all(unique_sorted_figures[i] + 1 == unique_sorted_figures[i+1] for i in range(len(unique_sorted_figures)-1))
310
 
311
- missing_figures = []
312
- if unique_sorted_figures:
313
- expected_figures = set(range(1, max(unique_sorted_figures) + 1))
314
- missing_figures = sorted(list(expected_figures - set(unique_sorted_figures)))
315
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
316
  counts = Counter(valid_figure_numbers_int)
317
- duplicate_refs = [num for num, count in counts.items() if count > 1]
318
-
 
 
 
 
 
 
 
 
 
319
  return {
320
- "sequential_order_of_unique_figures": is_sequential,
321
  "figure_count_unique": len(unique_sorted_figures),
322
- "missing_figures_in_sequence_to_max": missing_figures,
323
- "figure_order_as_encountered": valid_figure_numbers_int,
324
- "duplicate_references_to_same_figure_number": duplicate_refs
 
325
  }
326
 
327
  def check_reference_order(plain_text: str) -> Dict[str, Any]:
 
298
 
299
  def check_figure_order(plain_text: str) -> Dict[str, Any]:
300
  figure_pattern = r'(?:Fig(?:ure)?\.?|Figure)\s*(\d+)'
301
+ # Find all matches; re.IGNORECASE ensures "figure", "Figure", "FIGURE" are caught
302
+ figure_references_raw = re.findall(figure_pattern, plain_text, re.IGNORECASE)
303
+
304
+ # Convert captured numbers (group 1 of the regex) to integers
305
+ # Only include if the captured string is indeed a digit.
306
+ valid_figure_numbers_int: List[int] = []
307
+ for num_str in figure_references_raw:
308
  if num_str.isdigit():
309
  valid_figure_numbers_int.append(int(num_str))
310
+ # else:
311
+ # Optional: log or handle non-digit captures if the regex could allow them
312
+ # print(f"Warning: Figure regex captured non-digit '{num_str}'")
313
 
314
+ if not valid_figure_numbers_int:
315
+ # No valid figure references found in the text
316
+ return {
317
+ "sequential_order_of_unique_figures": True, # Vacuously true as no figures to be out of order
318
+ "figure_count_unique": 0,
319
+ "missing_figures_in_sequence_to_max": [],
320
+ "figure_order_as_encountered": [],
321
+ "duplicate_references_to_same_figure_number": [],
322
+ "figures_mentioned_only_once": [] # New: No figures, so none are mentioned only once
323
+ }
324
+
325
+ # Get unique figure numbers, sorted
326
+ unique_sorted_figures: List[int] = sorted(list(set(valid_figure_numbers_int)))
327
+
328
+ # Check 1: Are the unique, sorted figures consecutive?
329
+ # e.g., [1, 2, 3] is sequential. [1, 3] is not. [2, 3, 4] is sequential by this definition.
330
+ is_sequential = True # Assume true initially
331
+ if len(unique_sorted_figures) > 1: # Only check if there's more than one unique figure
332
+ is_sequential = all(unique_sorted_figures[i] + 1 == unique_sorted_figures[i+1]
333
+ for i in range(len(unique_sorted_figures) - 1))
334
+
335
+ # Check 2: Missing figures in the sequence from 1 up to the highest figure number mentioned.
336
+ # This assumes figures should ideally start from 1 and be continuous up to the max.
337
+ missing_figures: List[int] = []
338
+ # max_found_figure will not error as unique_sorted_figures is non-empty at this point
339
+ max_found_figure = unique_sorted_figures[-1] # Since it's sorted and non-empty
340
+ expected_figures_up_to_max = set(range(1, max_found_figure + 1))
341
+ actual_figures_found_set = set(unique_sorted_figures)
342
+ missing_figures = sorted(list(expected_figures_up_to_max - actual_figures_found_set))
343
+
344
+ # Check 3: Count occurrences of each figure reference for duplicates and single mentions
345
  counts = Counter(valid_figure_numbers_int)
346
+
347
+ # Figures mentioned more than once (duplicates in terms of referencing the same figure number)
348
+ duplicate_refs: List[int] = sorted([num for num, count in counts.items() if count > 1])
349
+
350
+ # New Check: Figures mentioned exactly once.
351
+ # The requirement is "each figure should have atleast more than 1 mention".
352
+ # So, if a figure's mention count is 1, it fails this condition.
353
+ figures_mentioned_only_once: List[int] = sorted([
354
+ num for num, count in counts.items() if count == 1
355
+ ])
356
+
357
  return {
358
+ "sequential_order_of_unique_figures": is_sequential,
359
  "figure_count_unique": len(unique_sorted_figures),
360
+ "missing_figures_in_sequence_to_max": missing_figures,
361
+ "figure_order_as_encountered": valid_figure_numbers_int, # Original list of all found figure numbers in order of appearance
362
+ "duplicate_references_to_same_figure_number": duplicate_refs,
363
+ "figures_mentioned_only_once": figures_mentioned_only_once # NEWLY ADDED
364
  }
365
 
366
  def check_reference_order(plain_text: str) -> Dict[str, Any]: