HaiderAUT committed on
Commit
cfc4f6b
·
verified ·
1 Parent(s): 9c6cbca

Update app.py

Browse files

Reverted the UI

Files changed (1) hide show
  1. app.py +29 -40
app.py CHANGED
@@ -17,25 +17,25 @@ from PyPDF2 import PdfReader # plain text extraction
17
  import gradio as gr # UI
18
  from dotenv import load_dotenv # optional .env support
19
 
20
-
21
  # ─────────────────────────────────────────────────────────────────────────────
22
- # 1. PDF & TEXT PROCESSING
23
  # ─────────────────────────────────────────────────────────────────────────────
24
 
25
  def extract_pdf_text(pdf_file) -> str:
26
  """Extracts text from a PDF file using PyPDF2."""
27
  reader = PdfReader(pdf_file)
28
- return "\n".join(p.extract_text() or "" for p in reader.pages)
 
29
 
30
 
31
  def extract_pdf_word(pdf_file) -> str:
32
  """Extracts text from PDF using PyMuPDF (fitz) for better layout preservation."""
33
  doc = fitz.open(pdf_file)
34
- text_blocks = [page.get_text("text") for page in doc]
 
35
  return "\n".join(filter(None, text_blocks))
36
 
37
 
38
- # MODIFIED FUNCTION: Improved logic for re-joining wrapped lines
39
  def merge_pdf_wrapped_lines(raw_text: str) -> list[str]:
40
  """Re-join hard-wrapped lines from PDF extraction based on grammatical context."""
41
  merged = []
@@ -46,8 +46,8 @@ def merge_pdf_wrapped_lines(raw_text: str) -> list[str]:
46
 
47
  if merged:
48
  prev = merged[-1]
49
- # YOUR INSIGHT: Merge if previous line ends with '—' or lacks closing punctuation,
50
- # and the next line appears to be a continuation (starts with lowercase or parenthesis).
51
  if prev.endswith('—') or \
52
  (not re.search(r'[.:;)]\s*$', prev) and re.match(r'^[a-z\(]', ln_stripped)):
53
  merged[-1] = prev + ' ' + ln_stripped
@@ -57,7 +57,7 @@ def merge_pdf_wrapped_lines(raw_text: str) -> list[str]:
57
 
58
 
59
  # ─────────────────────────────────────────────────────────────────────────────
60
- # 2. RULE PARSING & CLEANING (Initial Automated Pass)
61
  # ─────────────────────────────────────────────────────────────────────────────
62
 
63
  # --- Regex for rule structure ---
@@ -104,15 +104,15 @@ def clean_line(line: str, source: str) -> str:
104
  line = re.sub(r'\s{2,}', ' ', line)
105
  return line.strip()
106
 
107
- # NEW HELPER: Determines nesting level of a sub-rule, e.g., (1) is level 1, (a) is 2, (i) is 3
108
  def get_rule_level(paren_str):
 
109
  content = paren_str.strip('()').lower()
 
110
  if content.isdigit(): return 1
111
- if all(c in 'ivxl' for c in content): return 3 # roman numerals
112
  if content.isalpha(): return 2 # alphabetical
113
  return 4 # Unknown level, treat as deeply nested
114
 
115
- # REWRITTEN FUNCTION: Stateful, hierarchical parser to correctly handle sub-rules.
116
  def parse_rules(text: str, source: str) -> dict[str, str]:
117
  """
118
  Parses raw text into a dictionary of {rule_id: rule_text}.
@@ -126,7 +126,6 @@ def parse_rules(text: str, source: str) -> dict[str, str]:
126
  """Saves the buffered lines to the current rule ID."""
127
  if parent_parts and lines_buffer:
128
  rule_id = "".join(parent_parts)
129
- # Append to existing text if rule already has a title, otherwise create it
130
  existing_text = rules.get(rule_id, "")
131
  new_text = " ".join(lines_buffer)
132
  rules[rule_id] = (existing_text + " " + new_text).strip()
@@ -141,17 +140,16 @@ def parse_rules(text: str, source: str) -> dict[str, str]:
141
  m_main = rule_pat.match(cleaned)
142
  m_sub = sub_rule_pat.match(cleaned)
143
  m_sp = subpart_pat.match(cleaned)
144
-
145
  if m_sp:
146
  commit_buffer()
147
  parent_parts = [f"subpart-{m_sp.group(1).upper()}"]
148
  rules["".join(parent_parts)] = f"Subpart {m_sp.group(1).upper()} — {m_sp.group(2).strip()}"
149
-
150
  elif m_main:
151
  new_base_id = m_main.group('base_rule')
152
  current_base_id = parent_parts[0] if parent_parts and not parent_parts[0].startswith("subpart") else None
153
 
154
- # YOUR INSIGHT: A rule never refers to itself. If it's the same base ID, it's content.
155
  if new_base_id == current_base_id:
156
  lines_buffer.append(cleaned)
157
  continue
@@ -161,14 +159,13 @@ def parse_rules(text: str, source: str) -> dict[str, str]:
161
  title = m_main.group('title').strip()
162
  if title:
163
  rules["".join(parent_parts)] = title
164
-
165
- elif m_sub and parent_parts:
166
  commit_buffer()
167
  paren_part = m_sub.group(1)
168
  text_part = m_sub.group(2).strip()
169
  new_level = get_rule_level(paren_part)
170
 
171
- # Adjust hierarchy: pop parent parts until we are at the correct level
172
  while len(parent_parts) > 1:
173
  last_part = parent_parts[-1]
174
  last_level = get_rule_level(last_part)
@@ -181,7 +178,7 @@ def parse_rules(text: str, source: str) -> dict[str, str]:
181
  if text_part:
182
  lines_buffer.append(text_part)
183
 
184
- else: # It's continuation text
185
  lines_buffer.append(cleaned)
186
 
187
  commit_buffer()
@@ -189,7 +186,7 @@ def parse_rules(text: str, source: str) -> dict[str, str]:
189
 
190
 
191
  # ─────────────────────────────────────────────────────────────────────────────
192
- # 3. COMPARISON & UI LOGIC
193
  # ─────────────────────────────────────────────────────────────────────────────
194
 
195
  def diff_unified(one: str, caa: str) -> str:
@@ -226,14 +223,16 @@ def combined_sort_key(key: str):
226
  else:
227
  return (4, key)
228
 
229
- parts = re.split(r'([()])', key) # Split but keep delimiters
 
230
  parts = [p for p in parts if p]
231
 
232
  for part in parts:
233
- if part.isdigit():
234
- sortable_tuple += ((1, int(part)),)
 
235
  else:
236
- sortable_tuple += ((2, part.lower()),)
237
  return sortable_tuple
238
 
239
 
@@ -264,24 +263,16 @@ def save_clean_and_dirty_versions(dirty_one, dirty_caa, clean_one, clean_caa, fi
264
  return filename
265
 
266
 
267
- # --- STAGE 1: Process PDFs and prepare for user review ---
268
- # MODIFIED FUNCTION: Skips ToC pages before parsing.
269
  def stage1_process_and_review(part, onereg_pdf, caa_pdf):
270
  if not (onereg_pdf and caa_pdf):
271
  raise gr.Error("Please upload both PDF files.")
272
  try:
273
- # --- Process OneReg PDF (skipping ToC) ---
274
- onereg_doc = fitz.open(onereg_pdf.name)
275
- # OneReg ToC for Part 108 is pages 2-4 (index 1-3). Content starts on page 5 (index 4).
276
- onereg_text_blocks = [page.get_text("text") for i, page in enumerate(onereg_doc) if i >= 4]
277
- raw_one = "\n".join(filter(None, onereg_text_blocks))
278
  one_data = parse_rules(raw_one, "onereg")
279
 
280
- # --- Process CAA PDF (skipping ToC) ---
281
- caa_doc = PdfReader(caa_pdf.name)
282
- # CAA 'List of Rules' for Part 108 is page 4 (index 3). Content starts on page 5 (index 4).
283
- caa_text_blocks = [p.extract_text() or "" for i, p in enumerate(caa_doc.pages) if i >= 4]
284
- raw_caa = "\n".join(caa_text_blocks)
285
  caa_data = parse_rules(raw_caa, "caa")
286
 
287
  # Get all rule IDs and sort them
@@ -290,7 +281,6 @@ def stage1_process_and_review(part, onereg_pdf, caa_pdf):
290
  key=combined_sort_key
291
  )
292
 
293
- # Filter for the relevant part, but always include subparts and appendices
294
  rules_to_review = [
295
  r for r in all_ids
296
  if r.startswith(f"{part}.") or r.startswith("subpart-") or re.match(r'^[A-Z]\.', r)
@@ -316,7 +306,6 @@ def stage1_process_and_review(part, onereg_pdf, caa_pdf):
316
  raise gr.Error(f"Failed during initial processing: {e}")
317
 
318
 
319
- # --- STAGE 2: Take user-cleaned text and perform the final comparison ---
320
  def stage2_finalize_and_compare(review_df, original_one, original_caa):
321
  if review_df is None or review_df.empty:
322
  raise gr.Error("No data to compare. Please process the files first.")
@@ -371,7 +360,7 @@ def stage2_finalize_and_compare(review_df, original_one, original_caa):
371
 
372
 
373
  # ─────────────────────────────────────────────────────────────────────────────
374
- # 4. GRADIO UI LAYOUT
375
  # ─────────────────────────────────────────────────────────────────────────────
376
 
377
  with gr.Blocks(theme=gr.themes.Soft(), title="Dual Rule Cleaning Tool") as demo:
@@ -383,7 +372,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Dual Rule Cleaning Tool") as demo:
383
 
384
  # --- Stage 1: Inputs and Initial Processing ---
385
  with gr.Row():
386
- part_num = gr.Textbox(label="Part Number", value="108")
387
  onereg_pdf = gr.File(label="Upload OneReg PDF")
388
  caa_pdf = gr.File(label="Upload CAA PDF")
389
 
 
17
  import gradio as gr # UI
18
  from dotenv import load_dotenv # optional .env support
19
 
 
20
  # ─────────────────────────────────────────────────────────────────────────────
21
+ # 1. PDF & TEXT PROCESSING (LOGIC MODIFIED HERE)
22
  # ─────────────────────────────────────────────────────────────────────────────
23
 
24
  def extract_pdf_text(pdf_file) -> str:
25
  """Extracts text from a PDF file using PyPDF2."""
26
  reader = PdfReader(pdf_file)
27
+ # MODIFICATION: Skips the first 4 pages (ToC/List of Rules)
28
+ return "\n".join(p.extract_text() or "" for i, p in enumerate(reader.pages) if i >= 4)
29
 
30
 
31
  def extract_pdf_word(pdf_file) -> str:
32
  """Extracts text from PDF using PyMuPDF (fitz) for better layout preservation."""
33
  doc = fitz.open(pdf_file)
34
+ # MODIFICATION: Skips the first 4 pages (ToC)
35
+ text_blocks = [page.get_text("text") for i, page in enumerate(doc) if i >= 4]
36
  return "\n".join(filter(None, text_blocks))
37
 
38
 
 
39
  def merge_pdf_wrapped_lines(raw_text: str) -> list[str]:
40
  """Re-join hard-wrapped lines from PDF extraction based on grammatical context."""
41
  merged = []
 
46
 
47
  if merged:
48
  prev = merged[-1]
49
+ # Merge if previous line ends with '—' or lacks closing punctuation,
50
+ # and the next line appears to be a continuation.
51
  if prev.endswith('—') or \
52
  (not re.search(r'[.:;)]\s*$', prev) and re.match(r'^[a-z\(]', ln_stripped)):
53
  merged[-1] = prev + ' ' + ln_stripped
 
57
 
58
 
59
  # ─────────────────────────────────────────────────────────────────────────────
60
+ # 2. RULE PARSING & CLEANING (LOGIC MODIFIED HERE)
61
  # ─────────────────────────────────────────────────────────────────────────────
62
 
63
  # --- Regex for rule structure ---
 
104
  line = re.sub(r'\s{2,}', ' ', line)
105
  return line.strip()
106
 
 
107
  def get_rule_level(paren_str):
108
+ """Determines nesting level of a sub-rule, e.g., (1) is 1, (a) is 2, (i) is 3."""
109
  content = paren_str.strip('()').lower()
110
+ if not content: return 99
111
  if content.isdigit(): return 1
112
+ if all(c in 'ivxlmc' for c in content): return 3 # roman numerals
113
  if content.isalpha(): return 2 # alphabetical
114
  return 4 # Unknown level, treat as deeply nested
115
 
 
116
  def parse_rules(text: str, source: str) -> dict[str, str]:
117
  """
118
  Parses raw text into a dictionary of {rule_id: rule_text}.
 
126
  """Saves the buffered lines to the current rule ID."""
127
  if parent_parts and lines_buffer:
128
  rule_id = "".join(parent_parts)
 
129
  existing_text = rules.get(rule_id, "")
130
  new_text = " ".join(lines_buffer)
131
  rules[rule_id] = (existing_text + " " + new_text).strip()
 
140
  m_main = rule_pat.match(cleaned)
141
  m_sub = sub_rule_pat.match(cleaned)
142
  m_sp = subpart_pat.match(cleaned)
143
+
144
  if m_sp:
145
  commit_buffer()
146
  parent_parts = [f"subpart-{m_sp.group(1).upper()}"]
147
  rules["".join(parent_parts)] = f"Subpart {m_sp.group(1).upper()} — {m_sp.group(2).strip()}"
148
+
149
  elif m_main:
150
  new_base_id = m_main.group('base_rule')
151
  current_base_id = parent_parts[0] if parent_parts and not parent_parts[0].startswith("subpart") else None
152
 
 
153
  if new_base_id == current_base_id:
154
  lines_buffer.append(cleaned)
155
  continue
 
159
  title = m_main.group('title').strip()
160
  if title:
161
  rules["".join(parent_parts)] = title
162
+
163
+ elif m_sub and parent_parts and not parent_parts[0].startswith("subpart"):
164
  commit_buffer()
165
  paren_part = m_sub.group(1)
166
  text_part = m_sub.group(2).strip()
167
  new_level = get_rule_level(paren_part)
168
 
 
169
  while len(parent_parts) > 1:
170
  last_part = parent_parts[-1]
171
  last_level = get_rule_level(last_part)
 
178
  if text_part:
179
  lines_buffer.append(text_part)
180
 
181
+ else:
182
  lines_buffer.append(cleaned)
183
 
184
  commit_buffer()
 
186
 
187
 
188
  # ─────────────────────────────────────────────────────────────────────────────
189
+ # 3. COMPARISON & UI LOGIC (LOGIC MODIFIED HERE)
190
  # ─────────────────────────────────────────────────────────────────────────────
191
 
192
  def diff_unified(one: str, caa: str) -> str:
 
223
  else:
224
  return (4, key)
225
 
226
+ # MODIFICATION: More robust splitting for hierarchical keys like "108.51(3)(i)"
227
+ parts = re.split(r'(\d+\.\d+)|(\([a-zA-Z0-9]+\))', key)
228
  parts = [p for p in parts if p]
229
 
230
  for part in parts:
231
+ num_match = re.match(r'^\d+\.\d+$', part)
232
+ if num_match:
233
+ sortable_tuple += tuple( (1, int(x)) for x in part.split('.'))
234
  else:
235
+ sortable_tuple += ((2, part.lower()),)
236
  return sortable_tuple
237
 
238
 
 
263
  return filename
264
 
265
 
 
 
266
  def stage1_process_and_review(part, onereg_pdf, caa_pdf):
267
  if not (onereg_pdf and caa_pdf):
268
  raise gr.Error("Please upload both PDF files.")
269
  try:
270
+ # Process OneReg PDF
271
+ raw_one = extract_pdf_word(onereg_pdf.name)
 
 
 
272
  one_data = parse_rules(raw_one, "onereg")
273
 
274
+ # Process CAA PDF
275
+ raw_caa = extract_pdf_text(caa_pdf.name)
 
 
 
276
  caa_data = parse_rules(raw_caa, "caa")
277
 
278
  # Get all rule IDs and sort them
 
281
  key=combined_sort_key
282
  )
283
 
 
284
  rules_to_review = [
285
  r for r in all_ids
286
  if r.startswith(f"{part}.") or r.startswith("subpart-") or re.match(r'^[A-Z]\.', r)
 
306
  raise gr.Error(f"Failed during initial processing: {e}")
307
 
308
 
 
309
  def stage2_finalize_and_compare(review_df, original_one, original_caa):
310
  if review_df is None or review_df.empty:
311
  raise gr.Error("No data to compare. Please process the files first.")
 
360
 
361
 
362
  # ─────────────────────────────────────────────────────────────────────────────
363
+ # 4. GRADIO UI LAYOUT (UI IS IDENTICAL TO YOUR ORIGINAL SCRIPT)
364
  # ─────────────────────────────────────────────────────────────────────────────
365
 
366
  with gr.Blocks(theme=gr.themes.Soft(), title="Dual Rule Cleaning Tool") as demo:
 
372
 
373
  # --- Stage 1: Inputs and Initial Processing ---
374
  with gr.Row():
375
+ part_num = gr.Textbox(label="Part Number", value="139")
376
  onereg_pdf = gr.File(label="Upload OneReg PDF")
377
  caa_pdf = gr.File(label="Upload CAA PDF")
378