samyak152002 committed on
Commit
a0e200f
·
verified ·
1 Parent(s): 91e3e31

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -7
app.py CHANGED
@@ -117,11 +117,13 @@ def check_structure(full_text: str) -> Dict[str, bool]:
117
  }
118
 
119
  def check_language_issues(full_text: str) -> Dict[str, Any]:
120
- """Check for language issues using LanguageTool."""
121
  try:
122
  language_tool = language_tool_python.LanguageTool('en-US')
123
  matches = language_tool.check(full_text)
124
  issues = []
 
 
125
  for match in matches:
126
  issues.append({
127
  "message": match.message,
@@ -131,10 +133,40 @@ def check_language_issues(full_text: str) -> Dict[str, Any]:
131
  "rule_id": match.ruleId,
132
  "offset": match.offset,
133
  "length": match.errorLength,
134
- "coordinates":[],
135
- "page":0
136
  })
137
  print(f"Total language issues found: {len(issues)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  return {
139
  "total_issues": len(issues),
140
  "issues": issues
@@ -252,7 +284,7 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> byt
252
  if not target_words:
253
  print("No matching words found for this issue.")
254
  continue
255
-
256
  initial_x = target_words[0][2]
257
  initial_y = target_words[0][3]
258
  final_x = target_words[len(target_words)-1][4]
@@ -270,7 +302,7 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> byt
270
  highlight.set_colors(stroke=(1, 1, 0)) # Yellow color
271
  highlight.update()
272
  print(f"Highlighted '{word_text}' on page {page_num + 1} at position ({x0}, {y0}, {x1}, {y1})")
273
-
274
 
275
  # Save annotated PDF to bytes
276
  byte_stream = io.BytesIO()
@@ -287,6 +319,7 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> byt
287
  except Exception as e:
288
  print(f"Error in highlighting PDF: {e}")
289
  return b""
 
290
  # ------------------------------
291
  # Main Analysis Function
292
  # ------------------------------
@@ -341,7 +374,7 @@ def process_upload(file):
341
  temp_input.write(file)
342
  temp_input_path = temp_input.name
343
  print(temp_input_path)
344
- # Analyze -inputthe PDF
345
 
346
  results, annotated_pdf = analyze_pdf(temp_input_path)
347
 
@@ -366,7 +399,7 @@ def process_upload(file):
366
  # }, indent=2)
367
  # return error_message, None
368
 
369
-
370
  def create_interface():
371
  with gr.Blocks(title="PDF Analyzer") as interface:
372
  gr.Markdown("# PDF Analyzer")
 
117
  }
118
 
119
  def check_language_issues(full_text: str) -> Dict[str, Any]:
120
+ """Check for language issues using LanguageTool and additional regex patterns."""
121
  try:
122
  language_tool = language_tool_python.LanguageTool('en-US')
123
  matches = language_tool.check(full_text)
124
  issues = []
125
+
126
+ # Process LanguageTool matches
127
  for match in matches:
128
  issues.append({
129
  "message": match.message,
 
133
  "rule_id": match.ruleId,
134
  "offset": match.offset,
135
  "length": match.errorLength,
136
+ "coordinates": [],
137
+ "page": 0
138
  })
139
  print(f"Total language issues found: {len(issues)}")
140
+
141
+ # -----------------------------------
142
+ # Additions: Regex-based Issue Detection
143
+ # -----------------------------------
144
+
145
+ # Define regex pattern to find words immediately followed by '[' without space
146
+ regex_pattern = r'\b(\w+)\[(\d+)\]'
147
+ regex_matches = list(re.finditer(regex_pattern, full_text))
148
+ print(f"Total regex issues found: {len(regex_matches)}")
149
+
150
+ # Process regex matches
151
+ for match in regex_matches:
152
+ word = match.group(1)
153
+ number = match.group(2)
154
+ start = match.start()
155
+ end = match.end()
156
+ issues.append({
157
+ "message": f"Missing space before '[' in '{word}[{number}]'. Should be '{word} [{number}]'.",
158
+ "context": full_text[max(match.start() - 30, 0):min(match.end() + 30, len(full_text))].strip(),
159
+ "suggestions": [f"{word} [{number}]", f"{word} [`{number}`]", f"{word} [number {number}]"],
160
+ "category": "Formatting",
161
+ "rule_id": "SPACE_BEFORE_BRACKET",
162
+ "offset": match.start(),
163
+ "length": match.end() - match.start(),
164
+ "coordinates": [],
165
+ "page": 0
166
+ })
167
+
168
+ print(f"Total combined issues found: {len(issues)}")
169
+
170
  return {
171
  "total_issues": len(issues),
172
  "issues": issues
 
284
  if not target_words:
285
  print("No matching words found for this issue.")
286
  continue
287
+
288
  initial_x = target_words[0][2]
289
  initial_y = target_words[0][3]
290
  final_x = target_words[len(target_words)-1][4]
 
302
  highlight.set_colors(stroke=(1, 1, 0)) # Yellow color
303
  highlight.update()
304
  print(f"Highlighted '{word_text}' on page {page_num + 1} at position ({x0}, {y0}, {x1}, {y1})")
305
+
306
 
307
  # Save annotated PDF to bytes
308
  byte_stream = io.BytesIO()
 
319
  except Exception as e:
320
  print(f"Error in highlighting PDF: {e}")
321
  return b""
322
+
323
  # ------------------------------
324
  # Main Analysis Function
325
  # ------------------------------
 
374
  temp_input.write(file)
375
  temp_input_path = temp_input.name
376
  print(temp_input_path)
377
+ # Analyze the PDF
378
 
379
  results, annotated_pdf = analyze_pdf(temp_input_path)
380
 
 
399
  # }, indent=2)
400
  # return error_message, None
401
 
402
+
403
  def create_interface():
404
  with gr.Blocks(title="PDF Analyzer") as interface:
405
  gr.Markdown("# PDF Analyzer")