samyak152002 committed on
Commit
6a6e3b4
·
verified ·
1 Parent(s): 4dd18db

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -92
app.py CHANGED
@@ -33,17 +33,17 @@ def extract_pdf_text(file) -> str:
33
  print("me llamo samyak")
34
  try:
35
  # Open the PDF file
36
- print("me llamo samyak")
37
  doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
38
  full_text = ""
39
- print(doc)
40
  for page_num, page in enumerate(doc, start=1):
41
  text = page.get_text("text")
42
  full_text += text + "\n"
43
  print(f"Extracted text from page {page_num}: {len(text)} characters.")
44
  doc.close()
45
  print(f"Total extracted text length: {len(full_text)} characters.")
46
- print(full_text)
47
  return full_text
48
  except Exception as e:
49
  print(f"Error extracting text from PDF: {e}")
@@ -120,7 +120,8 @@ def check_language_issues(full_text: str) -> Dict[str, Any]:
120
  "category": match.category,
121
  "rule_id": match.ruleId,
122
  "offset": match.offset,
123
- "length": match.errorLength
 
124
  })
125
  print(f"Total language issues found: {len(issues)}")
126
  return {
@@ -190,85 +191,6 @@ def check_reference_order(full_text: str) -> Dict[str, Any]:
190
  "is_ordered": len(out_of_order) == 0 and len(missing_refs) == 0
191
  }
192
 
193
- def check_reference_style(full_text: str) -> Dict[str, Any]:
194
- """Check the reference style used in the paper and identify inconsistencies."""
195
- reference_section_match = re.search(r'References\b([\s\S]*?)(?:\n\S|\Z)', full_text, re.IGNORECASE)
196
- if not reference_section_match:
197
- return {"style": "Unknown", "reason": "References section not found", "inconsistent_refs": []}
198
-
199
- references_text = reference_section_match.group(1)
200
- reference_list = re.split(r'\n(?=\[\d+\]|\d+\.\s|\(\w+,\s*\d{4}\))', references_text)
201
- references = [ref.strip() for ref in reference_list if ref.strip()]
202
-
203
- styles = []
204
- inconsistent_refs = []
205
- patterns = {
206
- "IEEE": r'^\[\d+\]',
207
- "Harvard": r'^[A-Z][a-z]+,?\s[A-Z]\.\s\(?\d{4}\)?',
208
- "APA": r'^[A-Z][a-z]+,?\s[A-Z]\.\s\(?\d{4}\)?',
209
- "MLA": r'^[A-Z][a-z]+,\s[A-Z][a-z]+\.',
210
- "Vancouver": r'^\d+\.\s',
211
- "Chicago": r'^\d+\s[A-Z][a-z]+\s[A-Z]',
212
- }
213
-
214
- for i, ref in enumerate(references, 1):
215
- matched = False
216
- for style, pattern in patterns.items():
217
- if re.match(pattern, ref):
218
- styles.append(style)
219
- matched = True
220
- break
221
- if not matched:
222
- styles.append("Unknown")
223
- inconsistent_refs.append((i, ref, "Unknown"))
224
-
225
- if not styles:
226
- return {"style": "Unknown", "reason": "No references found", "inconsistent_refs": []}
227
-
228
- style_counts = Counter(styles)
229
- majority_style, majority_count = style_counts.most_common(1)[0]
230
-
231
- for i, style in enumerate(styles, 1):
232
- if style != majority_style and style != "Unknown":
233
- inconsistent_refs.append((i, references[i-1], style))
234
-
235
- consistency = majority_count / len(styles)
236
-
237
- return {
238
- "majority_style": majority_style,
239
- "inconsistent_refs": inconsistent_refs,
240
- "consistency": consistency
241
- }
242
-
243
- # ------------------------------
244
- # Annotation Functions
245
- # ------------------------------
246
-
247
- def highlight_text(page, words, text, annotation):
248
- """Highlight text and add annotation."""
249
- text_instances = find_text_instances(words, text)
250
- highlighted = False
251
- for inst in text_instances:
252
- highlight = page.add_highlight_annot(inst)
253
- highlight.update()
254
- comment = page.add_text_annot(inst[:2], annotation)
255
- comment.update()
256
- highlighted = True
257
- return highlighted
258
-
259
- def find_text_instances(words, text):
260
- """Find all instances of text in words."""
261
- text_lower = text.lower()
262
- text_words = text_lower.split()
263
- instances = []
264
- for i in range(len(words) - len(text_words) + 1):
265
- if all(words[i+j][4].lower() == text_words[j] for j in range(len(text_words))):
266
- inst = fitz.Rect(words[i][:4])
267
- for j in range(1, len(text_words)):
268
- inst = inst | fitz.Rect(words[i+j][:4])
269
- instances.append(inst)
270
- return instances
271
-
272
  def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:
273
  """
274
  Highlights language issues in the PDF and returns the annotated PDF as bytes.
@@ -279,13 +201,14 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> byt
279
  # Open the PDF
280
  doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
281
  print(f"Opened PDF with {len(doc)} pages.")
282
-
283
  # Extract words with positions from each page
284
  word_list = [] # List of tuples: (page_number, word, x0, y0, x1, y1)
285
  for page_number in range(len(doc)):
286
  page = doc[page_number]
287
  words = page.get_text("words") # List of tuples: (x0, y0, x1, y1, "word", block_no, line_no, word_no)
288
  for w in words:
 
289
  word_text = w[4]
290
  # **Fix:** Insert a space before '[' to ensure "globally [2]" instead of "globally[2]"
291
  if '[' in word_text:
@@ -318,7 +241,12 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> byt
318
  if not target_words:
319
  print("No matching words found for this issue.")
320
  continue
321
-
 
 
 
 
 
322
  # Add highlight annotations to the target words
323
  for target in target_words:
324
  page_num, word_text, x0, y0, x1, y1 = target
@@ -330,6 +258,7 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> byt
330
  highlight.set_colors(stroke=(1, 1, 0)) # Yellow color
331
  highlight.update()
332
  print(f"Highlighted '{word_text}' on page {page_num + 1} at position ({x0}, {y0}, {x1}, {y1})")
 
333
 
334
  # Save annotated PDF to bytes
335
  byte_stream = io.BytesIO()
@@ -342,7 +271,7 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> byt
342
  f.write(annotated_pdf_bytes)
343
  print("Annotated PDF saved as 'annotated_temp.pdf' for manual verification.")
344
 
345
- return annotated_pdf_bytes
346
  except Exception as e:
347
  print(f"Error in highlighting PDF: {e}")
348
  return b""
@@ -358,12 +287,11 @@ def analyze_pdf(filepath: str) -> Tuple[Dict[str, Any], bytes]:
358
  return {"error": "Failed to extract text from PDF."}, None
359
 
360
  language_issues = check_language_issues(full_text)
361
- if "error" in language_issues:
362
- return language_issues, None
363
-
364
- issues = language_issues.get("issues", [])
365
- annotated_pdf = highlight_issues_in_pdf(filepath, issues) if issues else None
366
- return language_issues, annotated_pdf
367
  except Exception as e:
368
  return {"error": str(e)}, None
369
 
 
33
  print("me llamo samyak")
34
  try:
35
  # Open the PDF file
36
+ # print("me llamo samyak")
37
  doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
38
  full_text = ""
39
+ # print(doc)
40
  for page_num, page in enumerate(doc, start=1):
41
  text = page.get_text("text")
42
  full_text += text + "\n"
43
  print(f"Extracted text from page {page_num}: {len(text)} characters.")
44
  doc.close()
45
  print(f"Total extracted text length: {len(full_text)} characters.")
46
+ # print(full_text)
47
  return full_text
48
  except Exception as e:
49
  print(f"Error extracting text from PDF: {e}")
 
120
  "category": match.category,
121
  "rule_id": match.ruleId,
122
  "offset": match.offset,
123
+ "length": match.errorLength,
124
+ "coordinates":[]
125
  })
126
  print(f"Total language issues found: {len(issues)}")
127
  return {
 
191
  "is_ordered": len(out_of_order) == 0 and len(missing_refs) == 0
192
  }
193
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
  def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:
195
  """
196
  Highlights language issues in the PDF and returns the annotated PDF as bytes.
 
201
  # Open the PDF
202
  doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
203
  print(f"Opened PDF with {len(doc)} pages.")
204
+ print(language_matches)
205
  # Extract words with positions from each page
206
  word_list = [] # List of tuples: (page_number, word, x0, y0, x1, y1)
207
  for page_number in range(len(doc)):
208
  page = doc[page_number]
209
  words = page.get_text("words") # List of tuples: (x0, y0, x1, y1, "word", block_no, line_no, word_no)
210
  for w in words:
211
+ # print(w)
212
  word_text = w[4]
213
  # **Fix:** Insert a space before '[' to ensure "globally [2]" instead of "globally[2]"
214
  if '[' in word_text:
 
241
  if not target_words:
242
  print("No matching words found for this issue.")
243
  continue
244
+
245
+ initial_x = target_words[0][2]
246
+ initial_y = target_words[0][3]
247
+ final_x = target_words[len(target_words)-1][4]
248
+ final_y = target_words[len(target_words)-1][5]
249
+ issue["coordinates"] = [initial_x, initial_y, final_x, final_y]
250
  # Add highlight annotations to the target words
251
  for target in target_words:
252
  page_num, word_text, x0, y0, x1, y1 = target
 
258
  highlight.set_colors(stroke=(1, 1, 0)) # Yellow color
259
  highlight.update()
260
  print(f"Highlighted '{word_text}' on page {page_num + 1} at position ({x0}, {y0}, {x1}, {y1})")
261
+
262
 
263
  # Save annotated PDF to bytes
264
  byte_stream = io.BytesIO()
 
271
  f.write(annotated_pdf_bytes)
272
  print("Annotated PDF saved as 'annotated_temp.pdf' for manual verification.")
273
 
274
+ return language_matches, annotated_pdf_bytes
275
  except Exception as e:
276
  print(f"Error in highlighting PDF: {e}")
277
  return b""
 
287
  return {"error": "Failed to extract text from PDF."}, None
288
 
289
  language_issues = check_language_issues(full_text)
290
+
291
+ if language_issues:
292
+ issues = language_issues.get("issues", [])
293
+ language_issues, annotated_pdf = highlight_issues_in_pdf(filepath, issues) if issues else None
294
+ return language_issues, annotated_pdf
 
295
  except Exception as e:
296
  return {"error": str(e)}, None
297