Update app.py
Browse files
app.py
CHANGED
@@ -33,17 +33,17 @@ def extract_pdf_text(file) -> str:
|
|
33 |
print("me llamo samyak")
|
34 |
try:
|
35 |
# Open the PDF file
|
36 |
-
|
37 |
doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
|
38 |
full_text = ""
|
39 |
-
|
40 |
for page_num, page in enumerate(doc, start=1):
|
41 |
text = page.get_text("text")
|
42 |
full_text += text + "\n"
|
43 |
print(f"Extracted text from page {page_num}: {len(text)} characters.")
|
44 |
doc.close()
|
45 |
print(f"Total extracted text length: {len(full_text)} characters.")
|
46 |
-
|
47 |
return full_text
|
48 |
except Exception as e:
|
49 |
print(f"Error extracting text from PDF: {e}")
|
@@ -120,7 +120,8 @@ def check_language_issues(full_text: str) -> Dict[str, Any]:
|
|
120 |
"category": match.category,
|
121 |
"rule_id": match.ruleId,
|
122 |
"offset": match.offset,
|
123 |
-
"length": match.errorLength
|
|
|
124 |
})
|
125 |
print(f"Total language issues found: {len(issues)}")
|
126 |
return {
|
@@ -190,85 +191,6 @@ def check_reference_order(full_text: str) -> Dict[str, Any]:
|
|
190 |
"is_ordered": len(out_of_order) == 0 and len(missing_refs) == 0
|
191 |
}
|
192 |
|
193 |
-
def check_reference_style(full_text: str) -> Dict[str, Any]:
    """Detect the dominant citation style of the reference list and flag outliers.

    The text following the first "References" heading (case-insensitive) is
    split into individual entries, each entry is classified by the shape of
    its leading characters, and the most common classification is reported
    as the majority style.

    Args:
        full_text: Plain text of the whole paper.

    Returns:
        A dict that always contains:
            "style" / "majority_style": name of the most common style, or
                "Unknown" when no reference list could be analysed.
            "inconsistent_refs": list of ``(index, reference_text, style)``
                tuples (1-based index, in document order) for entries that
                are unclassifiable or differ from the majority style.
            "consistency": fraction of entries in the majority style
                (0.0 when nothing could be analysed).
        When nothing could be analysed a "reason" key explains why.
    """
    def _nothing_found(reason: str) -> Dict[str, Any]:
        # Same keys as the success result so callers can read either shape.
        return {
            "style": "Unknown",
            "majority_style": "Unknown",
            "reason": reason,
            "inconsistent_refs": [],
            "consistency": 0.0,
        }

    heading = re.search(r'References\b', full_text, re.IGNORECASE)
    if heading is None:
        return _nothing_found("References section not found")

    # Treat everything after the heading as the reference list. A lazy
    # capture terminated by "newline + non-space" would stop right where the
    # first entry begins (entries normally start at the left margin), which
    # leaves nothing to analyse.
    references_text = full_text[heading.end():]

    # Entries are only split where a new line starts with "[n]", "n. " or
    # "(Name, 2020)"; continuation lines stay attached to their entry.
    reference_list = re.split(
        r'\n(?=\[\d+\]|\d+\.\s|\(\w+,\s*\d{4}\))', references_text
    )
    references = [ref.strip() for ref in reference_list if ref.strip()]
    if not references:
        return _nothing_found("No references found")

    # First matching pattern wins. NOTE: "Harvard" and "APA" share the same
    # prefix pattern, so such entries are always reported as "Harvard".
    patterns = {
        "IEEE": r'^\[\d+\]',
        "Harvard": r'^[A-Z][a-z]+,?\s[A-Z]\.\s\(?\d{4}\)?',
        "APA": r'^[A-Z][a-z]+,?\s[A-Z]\.\s\(?\d{4}\)?',
        "MLA": r'^[A-Z][a-z]+,\s[A-Z][a-z]+\.',
        "Vancouver": r'^\d+\.\s',
        "Chicago": r'^\d+\s[A-Z][a-z]+\s[A-Z]',
    }

    def _classify(ref: str) -> str:
        for style_name, pattern in patterns.items():
            if re.match(pattern, ref):
                return style_name
        return "Unknown"

    styles = [_classify(ref) for ref in references]

    majority_style, majority_count = Counter(styles).most_common(1)[0]

    # Report unclassifiable entries and entries that deviate from the
    # majority, in a single pass so the result is ordered by position.
    inconsistent_refs = [
        (index, ref, style)
        for index, (ref, style) in enumerate(zip(references, styles), 1)
        if style == "Unknown" or style != majority_style
    ]

    return {
        "style": majority_style,
        "majority_style": majority_style,
        "inconsistent_refs": inconsistent_refs,
        "consistency": majority_count / len(styles),
    }
|
242 |
-
|
243 |
-
# ------------------------------
|
244 |
-
# Annotation Functions
|
245 |
-
# ------------------------------
|
246 |
-
|
247 |
-
def highlight_text(page, words, text, annotation):
    """Mark every occurrence of ``text`` on ``page`` and attach a note to it.

    Occurrences are located with ``find_text_instances(words, text)``. Each
    one receives a highlight annotation plus a text annotation carrying
    ``annotation``, placed at the first two coordinates of the match
    rectangle.

    Returns:
        True if at least one occurrence was annotated, otherwise False.
    """
    matches = find_text_instances(words, text)
    for rect in matches:
        marker = page.add_highlight_annot(rect)
        marker.update()
        note = page.add_text_annot(rect[:2], annotation)
        note.update()
    return bool(matches)
|
258 |
-
|
259 |
-
def find_text_instances(words, text):
    """Locate every occurrence of ``text`` within a page's word list.

    Matching is case-insensitive and word-by-word: ``text`` is split on
    whitespace and compared against consecutive entries of ``words``.

    Args:
        words: Sequence of word records where items 0-3 are the bounding box
            and item 4 is the word string (the layout the code indexes; it
            presumably comes from ``page.get_text("words")`` - confirm at the
            call site).
        text: Phrase to search for.

    Returns:
        A list of ``fitz.Rect`` objects, one per occurrence, each being the
        union of the bounding boxes of the matched words. Empty when ``text``
        contains no words or nothing matches.
    """
    text_words = text.lower().split()
    if not text_words:
        # An empty query would otherwise match vacuously at every position
        # and index one past the end of ``words``.
        return []

    span = len(text_words)
    instances = []
    for start in range(len(words) - span + 1):
        window = words[start:start + span]
        if all(w[4].lower() == target for w, target in zip(window, text_words)):
            # Grow the rectangle so it covers every word of the match.
            inst = fitz.Rect(window[0][:4])
            for w in window[1:]:
                inst = inst | fitz.Rect(w[:4])
            instances.append(inst)
    return instances
|
271 |
-
|
272 |
def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:
|
273 |
"""
|
274 |
Highlights language issues in the PDF and returns the annotated PDF as bytes.
|
@@ -279,13 +201,14 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> byt
|
|
279 |
# Open the PDF
|
280 |
doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
|
281 |
print(f"Opened PDF with {len(doc)} pages.")
|
282 |
-
|
283 |
# Extract words with positions from each page
|
284 |
word_list = [] # List of tuples: (page_number, word, x0, y0, x1, y1)
|
285 |
for page_number in range(len(doc)):
|
286 |
page = doc[page_number]
|
287 |
words = page.get_text("words") # List of tuples: (x0, y0, x1, y1, "word", block_no, line_no, word_no)
|
288 |
for w in words:
|
|
|
289 |
word_text = w[4]
|
290 |
# **Fix:** Insert a space before '[' to ensure "globally [2]" instead of "globally[2]"
|
291 |
if '[' in word_text:
|
@@ -318,7 +241,12 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> byt
|
|
318 |
if not target_words:
|
319 |
print("No matching words found for this issue.")
|
320 |
continue
|
321 |
-
|
|
|
|
|
|
|
|
|
|
|
322 |
# Add highlight annotations to the target words
|
323 |
for target in target_words:
|
324 |
page_num, word_text, x0, y0, x1, y1 = target
|
@@ -330,6 +258,7 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> byt
|
|
330 |
highlight.set_colors(stroke=(1, 1, 0)) # Yellow color
|
331 |
highlight.update()
|
332 |
print(f"Highlighted '{word_text}' on page {page_num + 1} at position ({x0}, {y0}, {x1}, {y1})")
|
|
|
333 |
|
334 |
# Save annotated PDF to bytes
|
335 |
byte_stream = io.BytesIO()
|
@@ -342,7 +271,7 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> byt
|
|
342 |
f.write(annotated_pdf_bytes)
|
343 |
print("Annotated PDF saved as 'annotated_temp.pdf' for manual verification.")
|
344 |
|
345 |
-
return annotated_pdf_bytes
|
346 |
except Exception as e:
|
347 |
print(f"Error in highlighting PDF: {e}")
|
348 |
return b""
|
@@ -358,12 +287,11 @@ def analyze_pdf(filepath: str) -> Tuple[Dict[str, Any], bytes]:
|
|
358 |
return {"error": "Failed to extract text from PDF."}, None
|
359 |
|
360 |
language_issues = check_language_issues(full_text)
|
361 |
-
|
362 |
-
|
363 |
-
|
364 |
-
|
365 |
-
|
366 |
-
return language_issues, annotated_pdf
|
367 |
except Exception as e:
|
368 |
return {"error": str(e)}, None
|
369 |
|
|
|
33 |
print("me llamo samyak")
|
34 |
try:
|
35 |
# Open the PDF file
|
36 |
+
# print("me llamo samyak")
|
37 |
doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
|
38 |
full_text = ""
|
39 |
+
# print(doc)
|
40 |
for page_num, page in enumerate(doc, start=1):
|
41 |
text = page.get_text("text")
|
42 |
full_text += text + "\n"
|
43 |
print(f"Extracted text from page {page_num}: {len(text)} characters.")
|
44 |
doc.close()
|
45 |
print(f"Total extracted text length: {len(full_text)} characters.")
|
46 |
+
# print(full_text)
|
47 |
return full_text
|
48 |
except Exception as e:
|
49 |
print(f"Error extracting text from PDF: {e}")
|
|
|
120 |
"category": match.category,
|
121 |
"rule_id": match.ruleId,
|
122 |
"offset": match.offset,
|
123 |
+
"length": match.errorLength,
|
124 |
+
"coordinates":[]
|
125 |
})
|
126 |
print(f"Total language issues found: {len(issues)}")
|
127 |
return {
|
|
|
191 |
"is_ordered": len(out_of_order) == 0 and len(missing_refs) == 0
|
192 |
}
|
193 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
194 |
def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:
|
195 |
"""
|
196 |
Highlights language issues in the PDF and returns the annotated PDF as bytes.
|
|
|
201 |
# Open the PDF
|
202 |
doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
|
203 |
print(f"Opened PDF with {len(doc)} pages.")
|
204 |
+
print(language_matches)
|
205 |
# Extract words with positions from each page
|
206 |
word_list = [] # List of tuples: (page_number, word, x0, y0, x1, y1)
|
207 |
for page_number in range(len(doc)):
|
208 |
page = doc[page_number]
|
209 |
words = page.get_text("words") # List of tuples: (x0, y0, x1, y1, "word", block_no, line_no, word_no)
|
210 |
for w in words:
|
211 |
+
# print(w)
|
212 |
word_text = w[4]
|
213 |
# **Fix:** Insert a space before '[' to ensure "globally [2]" instead of "globally[2]"
|
214 |
if '[' in word_text:
|
|
|
241 |
if not target_words:
|
242 |
print("No matching words found for this issue.")
|
243 |
continue
|
244 |
+
|
245 |
+
initial_x = target_words[0][2]
|
246 |
+
initial_y = target_words[0][3]
|
247 |
+
final_x = target_words[len(target_words)-1][4]
|
248 |
+
final_y = target_words[len(target_words)-1][5]
|
249 |
+
issue["coordinates"] = [initial_x, initial_y, final_x, final_y]
|
250 |
# Add highlight annotations to the target words
|
251 |
for target in target_words:
|
252 |
page_num, word_text, x0, y0, x1, y1 = target
|
|
|
258 |
highlight.set_colors(stroke=(1, 1, 0)) # Yellow color
|
259 |
highlight.update()
|
260 |
print(f"Highlighted '{word_text}' on page {page_num + 1} at position ({x0}, {y0}, {x1}, {y1})")
|
261 |
+
|
262 |
|
263 |
# Save annotated PDF to bytes
|
264 |
byte_stream = io.BytesIO()
|
|
|
271 |
f.write(annotated_pdf_bytes)
|
272 |
print("Annotated PDF saved as 'annotated_temp.pdf' for manual verification.")
|
273 |
|
274 |
+
return language_matches, annotated_pdf_bytes
|
275 |
except Exception as e:
|
276 |
print(f"Error in highlighting PDF: {e}")
|
277 |
return b""
|
|
|
287 |
return {"error": "Failed to extract text from PDF."}, None
|
288 |
|
289 |
language_issues = check_language_issues(full_text)
|
290 |
+
|
291 |
+
if language_issues:
|
292 |
+
issues = language_issues.get("issues", [])
|
293 |
+
language_issues, annotated_pdf = highlight_issues_in_pdf(filepath, issues) if issues else None
|
294 |
+
return language_issues, annotated_pdf
|
|
|
295 |
except Exception as e:
|
296 |
return {"error": str(e)}, None
|
297 |
|