samyak152002 commited on
Commit
0e6dbe2
·
verified ·
1 Parent(s): 3700c3a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +401 -346
app.py CHANGED
@@ -1,380 +1,435 @@
1
- import gradio as gr
2
- import PyPDF2
3
  import re
4
- import fitz
5
  from pdfminer.high_level import extract_text
6
  from pdfminer.layout import LAParams
7
  import language_tool_python
8
- from tqdm import tqdm
9
  from typing import List, Dict, Any, Tuple
10
  from collections import Counter
11
  import json
12
- import sys
13
  import traceback
14
  import io
15
- import os
16
  import tempfile
 
 
 
 
 
17
 
18
- class PDFAnalyzer:
19
- def __init__(self, file_path: str):
20
- self.file_path = file_path
21
- self.pages_text = self.extract_pdf_text_by_page()
22
- self.full_text = self.extract_pdf_text()
23
- self.language_tool = language_tool_python.LanguageTool('en-US')
24
 
25
- def extract_pdf_text_by_page(self) -> List[str]:
26
- """Extracts text from a PDF file, page by page, using PyMuPDF."""
27
- with fitz.open(self.file_path) as doc:
 
 
 
 
28
  return [page.get_text("text") for page in doc]
29
 
30
- def extract_pdf_text(self) -> str:
31
- """Extracts text from a PDF file using pdfminer."""
32
- return extract_text(self.file_path, laparams=LAParams())
33
-
34
- def check_text_presence(self, search_terms: List[str]) -> Dict[str, bool]:
35
- """Checks for the presence of required terms in the text."""
36
- return {term: term.lower() in self.full_text.lower() for term in search_terms}
37
-
38
- def label_authors(self) -> str:
39
- """Label authors in the text with 'Authors:' if not already labeled."""
40
- author_line_regex = r"^(?:.*\n)(.*?)(?:\n\n)"
41
- match = re.search(author_line_regex, self.full_text, re.MULTILINE)
42
- if match:
43
- authors = match.group(1).strip()
44
- return self.full_text.replace(authors, f"Authors: {authors}")
45
- return self.full_text
46
-
47
- def check_metadata(self) -> Dict[str, Any]:
48
- """Check for metadata elements."""
49
- return {
50
- "author_email": bool(re.search(r'\b[\w.-]+?@\w+?\.\w+?\b', self.full_text)),
51
- "list_of_authors": bool(re.search(r'Authors?:', self.full_text, re.IGNORECASE)),
52
- "keywords_list": bool(re.search(r'Keywords?:', self.full_text, re.IGNORECASE)),
53
- "word_count": len(self.full_text.split()) or "Missing"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
- def check_disclosures(self) -> Dict[str, bool]:
57
- """Check for disclosure statements."""
58
- search_terms = [
59
- "author contributions statement",
60
- "conflict of interest statement",
61
- "ethics statement",
62
- "funding statement",
63
- "data access statement"
64
- ]
65
- return self.check_text_presence(search_terms)
66
-
67
- def check_figures_and_tables(self) -> Dict[str, bool]:
68
- """Check for figures and tables."""
69
- return {
70
- "figures_with_citations": bool(re.search(r'Figure \d+.*?citation', self.full_text, re.IGNORECASE)),
71
- "figures_legends": bool(re.search(r'Figure \d+.*?legend', self.full_text, re.IGNORECASE)),
72
- "tables_legends": bool(re.search(r'Table \d+.*?legend', self.full_text, re.IGNORECASE))
73
- }
 
 
 
74
 
75
- def check_references(self) -> Dict[str, Any]:
76
- """Check for references."""
77
- return {
78
- "old_references": bool(re.search(r'\b19[0-9]{2}\b', self.full_text)),
79
- "citations_in_abstract": bool(re.search(r'\b(citation|reference)\b', self.full_text[:1000], re.IGNORECASE)),
80
- "reference_count": len(re.findall(r'\[.*?\]', self.full_text)),
81
- "self_citations": bool(re.search(r'Self-citation', self.full_text, re.IGNORECASE))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  }
83
 
84
- def check_structure(self) -> Dict[str, bool]:
85
- """Check document structure."""
86
- return {
87
- "imrad_structure": all(section in self.full_text for section in ["Introduction", "Methods", "Results", "Discussion"]),
88
- "abstract_structure": "structured abstract" in self.full_text.lower()
 
 
 
 
 
 
 
 
 
 
89
  }
 
 
 
 
 
90
 
91
- def check_language_issues(self) -> Dict[str, Any]:
92
- """Check for issues with capitalization, hyphenation, punctuation, spacing, etc."""
93
- matches = self.language_tool.check(self.full_text)
94
- word_count = len(self.full_text.split())
95
- issues_count = len(matches)
96
- issues_per_1000 = (issues_count / word_count) * 1000
 
 
 
 
 
 
97
 
98
- serializable_matches = [
99
- {
100
- "message": match.message,
101
- "replacements": match.replacements,
102
- "offset": match.offset,
103
- "errorLength": match.errorLength,
104
- "category": match.category,
105
- "ruleIssueType": match.ruleIssueType,
106
- "sentence": match.sentence
107
- }
108
- for match in matches
109
- ]
110
 
111
- return {
112
- "issues_count": issues_count,
113
- "issues_per_1000": issues_per_1000,
114
- "failed": issues_per_1000 > 20,
115
- "matches": serializable_matches
116
- }
117
-
118
- def check_language(self) -> Dict[str, Any]:
119
- """Check language quality."""
120
- return {
121
- "plain_language": bool(re.search(r'plain language summary', self.full_text, re.IGNORECASE)),
122
- "readability_issues": False, # Placeholder for future implementation
123
- "language_issues": self.check_language_issues()
124
- }
125
-
126
- def check_figure_order(self) -> Dict[str, Any]:
127
- """Check if figures are referred to in sequential order."""
128
- figure_pattern = r'(?:Fig(?:ure)?\.?|Figure)\s*(\d+)'
129
- figure_references = re.findall(figure_pattern, self.full_text, re.IGNORECASE)
130
- figure_numbers = sorted(set(int(num) for num in figure_references))
131
 
132
- is_sequential = all(a + 1 == b for a, b in zip(figure_numbers, figure_numbers[1:]))
 
 
 
 
133
 
134
- if figure_numbers:
135
- expected_figures = set(range(1, max(figure_numbers) + 1))
136
- missing_figures = list(expected_figures - set(figure_numbers))
137
- else:
138
- missing_figures = None
139
-
140
- duplicates = [num for num, count in Counter(figure_references).items() if count > 1]
141
- duplicate_numbers = [int(num) for num in duplicates]
142
- notMentioned = list(set(figure_references) - set(duplicates))
143
 
144
- return {
145
- "sequential_order": is_sequential,
146
- "figure_count": len(figure_numbers),
147
- "missing_figures": missing_figures,
148
- "figure_order": figure_numbers,
149
- "duplicate_references": duplicates,
150
- "not_mentioned": notMentioned
151
- }
152
 
153
- def check_reference_order(self) -> Dict[str, Any]:
154
- """Check if references in the main body text are in order."""
155
- reference_pattern = r'\[(\d+)\]'
156
- references = re.findall(reference_pattern, self.full_text)
157
- ref_numbers = [int(ref) for ref in references]
158
 
159
- max_ref = 0
160
- out_of_order = []
161
- for i, ref in enumerate(ref_numbers):
162
- if ref > max_ref + 1:
163
- out_of_order.append((i+1, ref))
164
- max_ref = max(max_ref, ref)
165
 
166
- all_refs = set(range(1, max_ref + 1))
167
- used_refs = set(ref_numbers)
168
- missing_refs = list(all_refs - used_refs)
169
 
170
- return {
171
- "max_reference": max_ref,
172
- "out_of_order": out_of_order,
173
- "missing_references": missing_refs,
174
- "is_ordered": len(out_of_order) == 0 and len(missing_refs) == 0
175
- }
176
-
177
- def check_reference_style(self) -> Dict[str, Any]:
178
- """Check the reference style used in the paper and identify inconsistencies."""
179
- reference_section_match = re.search(r'References\b([\s\S]*?)(?:\n\S|\Z)', self.full_text, re.IGNORECASE)
180
- if not reference_section_match:
181
- return {"style": "Unknown", "reason": "References section not found", "inconsistent_refs": []}
182
-
183
- references_text = reference_section_match.group(1)
184
- reference_list = re.split(r'\n(?=\[\d+\]|\d+\.\s|\(\w+,\s*\d{4}\))', references_text)
185
- references = [ref.strip() for ref in reference_list if ref.strip()]
186
-
187
- styles = []
188
- inconsistent_refs = []
189
- patterns = {
190
- "IEEE": r'^\[\d+\]',
191
- "Harvard": r'^[A-Z][a-z]+,?\s[A-Z]\.\s\(?\d{4}\)?',
192
- "APA": r'^[A-Z][a-z]+,?\s[A-Z]\.\s\(?\d{4}\)?',
193
- "MLA": r'^[A-Z][a-z]+,\s[A-Z][a-z]+\.',
194
- "Vancouver": r'^\d+\.\s',
195
- "Chicago": r'^\d+\s[A-Z][a-z]+\s[A-Z]',
196
- }
197
-
198
- for i, ref in enumerate(references, 1):
199
- matched = False
200
- for style, pattern in patterns.items():
201
- if re.match(pattern, ref):
202
- styles.append(style)
203
- matched = True
204
- break
205
- if not matched:
206
- styles.append("Unknown")
207
- inconsistent_refs.append((i, ref, "Unknown"))
208
-
209
- if not styles:
210
- return {"style": "Unknown", "reason": "No references found", "inconsistent_refs": []}
211
-
212
- style_counts = Counter(styles)
213
- majority_style, majority_count = style_counts.most_common(1)[0]
214
-
215
- for i, style in enumerate(styles, 1):
216
- if style != majority_style and style != "Unknown":
217
- inconsistent_refs.append((i, references[i-1], style))
218
-
219
- consistency = majority_count / len(styles)
220
-
221
- return {
222
- "majority_style": majority_style,
223
- "inconsistent_refs": inconsistent_refs,
224
- "consistency": consistency
225
- }
226
-
227
- def highlight_issues_in_pdf(self, inconsistent_refs: List[Tuple[int, str, str]], language_matches: List[Dict[str, Any]]) -> str:
228
- """Highlight inconsistent references and add notes for language issues in a single PDF."""
229
- try:
230
- doc = fitz.open(self.file_path)
231
- added_notes = set()
232
-
233
- for page_number, page in enumerate(doc, start=1):
234
- words = page.get_text("words")
235
-
236
- if inconsistent_refs:
237
- for ref_num, ref_text, ref_style in inconsistent_refs:
238
- self.highlight_text(page, words, ref_text, f"Reference {ref_num}: Inconsistent style ({ref_style}). Should be {self.check_reference_style().get('majority_style', 'Unknown')}.")
239
-
240
- if language_matches:
241
- for match in language_matches:
242
- issue_text = match['sentence']
243
- error_message = f"{match['message']}\nSuggested correction: {match['replacements'][0] if match['replacements'] else 'No suggestion'}"
244
- issue_key = (issue_text, error_message)
245
-
246
- if issue_key not in added_notes:
247
- if self.highlight_text(page, words, issue_text, error_message):
248
- added_notes.add(issue_key)
249
-
250
- annotated_file_path = self.file_path.replace(".pdf", "_annotated_combined.pdf")
251
- doc.save(annotated_file_path)
252
- doc.close()
253
-
254
- if os.path.exists(annotated_file_path):
255
- return annotated_file_path
256
- else:
257
- print(f"Error: Annotated PDF was not saved at {annotated_file_path}")
258
- return ""
259
-
260
- except Exception as e:
261
- print(f"An error occurred while annotating the PDF: {str(e)}", file=sys.stderr)
262
- traceback.print_exc()
263
- return ""
264
-
265
- def highlight_text(self, page, words, text, annotation):
266
- """Highlight text and add annotation."""
267
- text_instances = self.find_text_instances(words, text)
268
- highlighted = False
269
- for inst in text_instances:
270
- highlight = page.add_highlight_annot(inst)
271
- highlight.update()
272
- comment = page.add_text_annot(inst[:2], annotation)
273
- comment.update()
274
- highlighted = True
275
- return highlighted
276
-
277
- def find_text_instances(self, words, text):
278
- """Find all instances of text in words."""
279
- text_lower = text.lower()
280
- text_words = text_lower.split()
281
- instances = []
282
- for i in range(len(words) - len(text_words) + 1):
283
- if all(words[i+j][4].lower() == text_words[j] for j in range(len(text_words))):
284
- inst = fitz.Rect(words[i][:4])
285
- for j in range(1, len(text_words)):
286
- inst = inst | fitz.Rect(words[i+j][:4])
287
- instances.append(inst)
288
- return instances
289
-
290
- def analyze(self) -> Dict[str, Any]:
291
- """Perform full analysis of the PDF."""
292
- self.full_text = self.label_authors()
293
 
294
- results = {
295
- "metadata": self.check_metadata(),
296
- "disclosures": self.check_disclosures(),
297
- "figures_and_tables": self.check_figures_and_tables(),
298
- "figure_order": self.check_figure_order(),
299
- "references": self.check_references(),
300
- "reference_order": self.check_reference_order(),
301
- "reference_style": self.check_reference_style(),
302
- "structure": self.check_structure(),
303
- "language": self.check_language(),
304
- "annotated_pdf_path": ""
305
- }
306
 
307
- inconsistent_refs = results.get("reference_style", {}).get("inconsistent_refs", [])
308
- language_matches = results.get("language", {}).get("language_issues", {}).get("matches", [])
309
-
310
- if inconsistent_refs or language_matches:
311
- annotated_path = self.highlight_issues_in_pdf(inconsistent_refs, language_matches)
312
- results["annotated_pdf_path"] = annotated_path
313
-
314
- return results
315
-
316
- def analyze_pdf(file):
317
- try:
318
- # Create a temporary directory to store files
319
- with tempfile.TemporaryDirectory() as temp_dir:
320
- # Save the uploaded file temporarily
321
- temp_path = os.path.join(temp_dir, "uploaded.pdf")
322
- with open(temp_path, "wb") as f:
323
- f.write(file.read())
324
-
325
- analyzer = PDFAnalyzer(temp_path)
326
- results = analyzer.analyze()
327
-
328
- # Ensure all keys are present in the results, even if they're empty
329
- default_results = {
330
- "annotated_pdf_path": "",
331
- "metadata": {},
332
- "disclosures": {},
333
- "figures_and_tables": {},
334
- "figure_order": {},
335
- "references": {},
336
- "reference_order": {},
337
- "reference_style": {},
338
- "structure": {},
339
- "language": {},
340
- }
341
-
342
- # Update default_results with actual results
343
- default_results.update(results)
344
-
345
- # Handle the annotated PDF
346
- annotated_pdf_path = results.get("annotated_pdf_path", "")
347
- if annotated_pdf_path and os.path.exists(annotated_pdf_path):
348
- # Read the annotated PDF and return it as bytes
349
- with open(annotated_pdf_path, "rb") as f:
350
- annotated_pdf_bytes = f.read()
351
- else:
352
- annotated_pdf_bytes = None
353
-
354
- # Remove the annotated_pdf_path from the results as we're returning the file separately
355
- default_results.pop("annotated_pdf_path", None)
356
-
357
- return json.dumps(default_results, indent=2, default=str), annotated_pdf_bytes
358
 
359
- except Exception as e:
360
- error_message = {
361
- "error": str(e),
362
- "traceback": traceback.format_exc()
363
- }
364
- return json.dumps(error_message, indent=2), None
365
-
366
- # Create Gradio interface
367
- iface = gr.Interface(
368
- fn=analyze_pdf,
369
- inputs=gr.File(label="Upload PDF"),
370
- outputs=[
371
- gr.JSON(label="Analysis Results"),
372
- gr.File(label="Annotated PDF")
373
- ],
374
- title="PDF Analyzer",
375
- description="Upload a PDF document to analyze its structure, references, language, and more.",
376
- )
377
-
378
- # Launch the app
379
  if __name__ == "__main__":
380
- iface.launch()
 
 
 
 
 
 
 
 
1
  import re
2
+ import fitz # PyMuPDF
3
  from pdfminer.high_level import extract_text
4
  from pdfminer.layout import LAParams
5
  import language_tool_python
 
6
  from typing import List, Dict, Any, Tuple
7
  from collections import Counter
8
  import json
 
9
  import traceback
10
  import io
 
11
  import tempfile
12
+ import os
13
+ import gradio as gr
14
+
15
+ # Set JAVA_HOME environment variable
16
+ os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'
17
 
18
+ # ------------------------------
19
+ # Analysis Functions
20
+ # ------------------------------
 
 
 
21
 
22
def extract_pdf_text_by_page(file) -> List[str]:
    """Extract the text of each PDF page using PyMuPDF.

    `file` may be a filesystem path or a binary file-like object.
    """
    if isinstance(file, str):
        doc = fitz.open(file)
    else:
        doc = fitz.open(stream=file.read(), filetype="pdf")
    with doc:
        return [page.get_text("text") for page in doc]
30
 
31
def extract_pdf_text(file) -> str:
    """Extract the full text of a PDF using pdfminer.

    `file` may be a filesystem path or a binary file-like object.
    """
    layout_params = LAParams()
    if not isinstance(file, str):
        return extract_text(file, laparams=layout_params)
    with open(file, 'rb') as handle:
        return extract_text(handle, laparams=layout_params)
38
+
39
def check_text_presence(full_text: str, search_terms: List[str]) -> Dict[str, bool]:
    """Report, for each search term, whether it occurs in the text (case-insensitive)."""
    haystack = full_text.lower()
    return {term: term.lower() in haystack for term in search_terms}
42
+
43
def label_authors(full_text: str) -> str:
    """Prefix the detected author line with 'Authors:' if not already labeled.

    Heuristic: the author line is the line following the first line,
    terminated by a blank line.
    """
    match = re.search(r"^(?:.*\n)(.*?)(?:\n\n)", full_text, re.MULTILINE)
    if not match:
        return full_text
    author_line = match.group(1).strip()
    return full_text.replace(author_line, f"Authors: {author_line}")
51
+
52
def check_metadata(full_text: str) -> Dict[str, Any]:
    """Check for basic metadata elements: email, author list, keywords, word count."""
    def found(pattern: str, flags: int = 0) -> bool:
        return bool(re.search(pattern, full_text, flags))

    word_total = len(full_text.split())
    return {
        "author_email": found(r'\b[\w.-]+?@\w+?\.\w+?\b'),
        "list_of_authors": found(r'Authors?:', re.IGNORECASE),
        "keywords_list": found(r'Keywords?:', re.IGNORECASE),
        # Falls back to the string "Missing" when the text contains no words.
        "word_count": word_total if word_total else "Missing",
    }
60
+
61
def check_disclosures(full_text: str) -> Dict[str, bool]:
    """Check for the five standard disclosure statements (case-insensitive)."""
    lowered = full_text.lower()
    required_statements = (
        "author contributions statement",
        "conflict of interest statement",
        "ethics statement",
        "funding statement",
        "data access statement",
    )
    # Terms are already lowercase, so a plain substring test on the lowered
    # text is equivalent to the shared case-insensitive presence check.
    return {statement: statement in lowered for statement in required_statements}
71
+
72
def check_figures_and_tables(full_text: str) -> Dict[str, bool]:
    """Heuristically check that figures/tables appear together with citations and legends."""
    def mentions(pattern: str) -> bool:
        return bool(re.search(pattern, full_text, re.IGNORECASE))

    return {
        "figures_with_citations": mentions(r'Figure \d+.*?citation'),
        "figures_legends": mentions(r'Figure \d+.*?legend'),
        "tables_legends": mentions(r'Table \d+.*?legend'),
    }
79
+
80
def check_references(full_text: str) -> Dict[str, Any]:
    """Run heuristic checks on the paper's references."""
    # The first 1000 characters stand in for the abstract.
    abstract_region = full_text[:1000]
    return {
        "old_references": re.search(r'\b19[0-9]{2}\b', full_text) is not None,
        "citations_in_abstract": re.search(r'\b(citation|reference)\b', abstract_region, re.IGNORECASE) is not None,
        "reference_count": len(re.findall(r'\[.*?\]', full_text)),
        "self_citations": re.search(r'Self-citation', full_text, re.IGNORECASE) is not None,
    }
88
+
89
def check_structure(full_text: str) -> Dict[str, bool]:
    """Check for IMRaD sections and a structured abstract."""
    imrad_sections = ("Introduction", "Methods", "Results", "Discussion")
    has_imrad = True
    for section in imrad_sections:
        # Section headings are matched case-sensitively, as in the paper body.
        if section not in full_text:
            has_imrad = False
            break
    return {
        "imrad_structure": has_imrad,
        "abstract_structure": "structured abstract" in full_text.lower(),
    }
95
+
96
def check_language_issues(full_text: str) -> Dict[str, Any]:
    """Run LanguageTool over the text and summarize grammar/style issues.

    Returns a dict with the raw issue count, the rate per 1000 words, a
    pass/fail flag (fails above 20 issues per 1000 words), and a
    JSON-serializable list of the individual matches.
    """
    language_tool = language_tool_python.LanguageTool('en-US')
    try:
        matches = language_tool.check(full_text)
    finally:
        # Bug fix: LanguageTool spawns a background Java server; close it so
        # repeated calls do not leak a server process per invocation.
        language_tool.close()

    word_count = len(full_text.split())
    issues_count = len(matches)
    issues_per_1000 = (issues_count / word_count) * 1000 if word_count else 0

    # Flatten the Match objects into plain dicts so the result can be
    # serialized with json.dumps.
    serializable_matches = [
        {
            "message": match.message,
            "replacements": match.replacements,
            "offset": match.offset,
            "errorLength": match.errorLength,
            "category": match.category,
            "ruleIssueType": match.ruleIssueType,
            "sentence": match.sentence,
        }
        for match in matches
    ]

    return {
        "issues_count": issues_count,
        "issues_per_1000": issues_per_1000,
        "failed": issues_per_1000 > 20,
        "matches": serializable_matches,
    }
123
+
124
def check_language(full_text: str) -> Dict[str, Any]:
    """Aggregate language-quality checks into a single report."""
    grammar_report = check_language_issues(full_text)
    has_plain_summary = re.search(r'plain language summary', full_text, re.IGNORECASE) is not None
    return {
        "plain_language": has_plain_summary,
        "readability_issues": False,  # Placeholder for future implementation
        "language_issues": grammar_report,
    }
131
+
132
def check_figure_order(full_text: str) -> Dict[str, Any]:
    """Check whether figures are referenced in sequential order.

    Returns the distinct figure numbers in sorted order, whether they form an
    unbroken ascending sequence, any numbers missing from 1..max, references
    that occur more than once, and references that occur exactly once.
    """
    figure_pattern = r'(?:Fig(?:ure)?\.?|Figure)\s*(\d+)'
    figure_references = re.findall(figure_pattern, full_text, re.IGNORECASE)
    figure_numbers = sorted(set(int(num) for num in figure_references))

    # True when each distinct number is exactly one more than the previous.
    is_sequential = all(a + 1 == b for a, b in zip(figure_numbers, figure_numbers[1:]))

    if figure_numbers:
        expected_figures = set(range(1, max(figure_numbers) + 1))
        missing_figures = list(expected_figures - set(figure_numbers))
    else:
        missing_figures = None

    # Fix: dropped the unused local `duplicate_numbers` from the original.
    duplicates = [num for num, count in Counter(figure_references).items() if count > 1]
    # NOTE(review): despite the name, this holds references that occur exactly
    # once (all refs minus the duplicated ones); kept as-is for callers.
    not_mentioned = list(set(figure_references) - set(duplicates))

    return {
        "sequential_order": is_sequential,
        "figure_count": len(figure_numbers),
        "missing_figures": missing_figures,
        "figure_order": figure_numbers,
        "duplicate_references": duplicates,
        "not_mentioned": not_mentioned,
    }
158
+
159
def check_reference_order(full_text: str) -> Dict[str, Any]:
    """Check whether bracketed citations [n] first appear in ascending order."""
    cited = [int(num) for num in re.findall(r'\[(\d+)\]', full_text)]

    highest_seen = 0
    out_of_order = []
    for position, number in enumerate(cited, start=1):
        # A citation "jumps ahead" when it skips past the next expected number.
        if number > highest_seen + 1:
            out_of_order.append((position, number))
        highest_seen = max(highest_seen, number)

    # Numbers in 1..max that are never cited at all.
    missing_refs = list(set(range(1, highest_seen + 1)) - set(cited))

    return {
        "max_reference": highest_seen,
        "out_of_order": out_of_order,
        "missing_references": missing_refs,
        "is_ordered": not out_of_order and not missing_refs,
    }
182
+
183
def check_reference_style(full_text: str) -> Dict[str, Any]:
    """Detect the dominant citation style in the References section.

    Returns the majority style, the entries that deviate from it (or could
    not be classified at all), and the fraction of entries matching the
    majority style.
    """
    section = re.search(r'References\b([\s\S]*?)(?:\n\S|\Z)', full_text, re.IGNORECASE)
    if section is None:
        return {"style": "Unknown", "reason": "References section not found", "inconsistent_refs": []}

    # Split entries on newlines that begin a new [n], "n. ", or "(Name, YYYY)".
    entries = [
        entry.strip()
        for entry in re.split(r'\n(?=\[\d+\]|\d+\.\s|\(\w+,\s*\d{4}\))', section.group(1))
        if entry.strip()
    ]

    # Order matters: the first matching pattern wins for each entry.
    style_patterns = {
        "IEEE": r'^\[\d+\]',
        "Harvard": r'^[A-Z][a-z]+,?\s[A-Z]\.\s\(?\d{4}\)?',
        "APA": r'^[A-Z][a-z]+,?\s[A-Z]\.\s\(?\d{4}\)?',
        "MLA": r'^[A-Z][a-z]+,\s[A-Z][a-z]+\.',
        "Vancouver": r'^\d+\.\s',
        "Chicago": r'^\d+\s[A-Z][a-z]+\s[A-Z]',
    }

    detected_styles = []
    inconsistent_refs = []
    for index, entry in enumerate(entries, 1):
        label = next(
            (style for style, pattern in style_patterns.items() if re.match(pattern, entry)),
            None,
        )
        if label is None:
            detected_styles.append("Unknown")
            inconsistent_refs.append((index, entry, "Unknown"))
        else:
            detected_styles.append(label)

    if not detected_styles:
        return {"style": "Unknown", "reason": "No references found", "inconsistent_refs": []}

    majority_style, majority_count = Counter(detected_styles).most_common(1)[0]

    # Entries classified under a different (known) style than the majority.
    for index, label in enumerate(detected_styles, 1):
        if label not in (majority_style, "Unknown"):
            inconsistent_refs.append((index, entries[index - 1], label))

    return {
        "majority_style": majority_style,
        "inconsistent_refs": inconsistent_refs,
        "consistency": majority_count / len(detected_styles),
    }
232
+
233
+ # ------------------------------
234
+ # Annotation Functions
235
+ # ------------------------------
236
+
237
def highlight_text(page, words, text, annotation):
    """Highlight every occurrence of `text` on `page` and attach `annotation`.

    Returns True if at least one occurrence was highlighted.
    """
    matched_rects = find_text_instances(words, text)
    for rect in matched_rects:
        page.add_highlight_annot(rect).update()
        # Place the note at the occurrence's top-left corner.
        note = page.add_text_annot(rect[:2], annotation)
        note.update()
    return bool(matched_rects)
248
+
249
def find_text_instances(words, text):
    """Return one bounding rectangle per occurrence of `text` in `words`.

    `words` is a PyMuPDF word list: indexes 0-3 of each entry are the word's
    rectangle and index 4 is the word string. Matching is case-insensitive
    and word-by-word.
    """
    target = text.lower().split()
    span = len(target)
    rects = []
    for start in range(len(words) - span + 1):
        window = words[start:start + span]
        if all(word[4].lower() == wanted for word, wanted in zip(window, target)):
            # Union the per-word rectangles into one covering rectangle.
            bbox = fitz.Rect(window[0][:4])
            for word in window[1:]:
                bbox = bbox | fitz.Rect(word[:4])
            rects.append(bbox)
    return rects
261
+
262
def highlight_issues_in_pdf(file, inconsistent_refs: List[Tuple[int, str, str]], language_matches: List[Dict[str, Any]]) -> bytes:
    """Highlight inconsistent references and annotate language issues in one PDF.

    `file` may be a filesystem path or a binary file-like object. Returns the
    annotated PDF as bytes, or b"" if annotation fails.
    """
    try:
        if isinstance(file, str):
            doc = fitz.open(file)
        else:
            doc = fitz.open(stream=file.read(), filetype="pdf")
        try:
            added_notes = set()

            for page in doc:
                words = page.get_text("words")

                for ref_num, ref_text, ref_style in inconsistent_refs or []:
                    # Bug fix: the previous message said the entry "should be
                    # consolidated to" its own (inconsistent) style, which was
                    # self-contradictory.
                    annotation_text = (
                        f"Reference {ref_num}: style '{ref_style}' is inconsistent "
                        f"with the majority style of the reference list."
                    )
                    highlight_text(page, words, ref_text, annotation_text)

                for match in language_matches or []:
                    issue_text = match['sentence']
                    suggestion = match['replacements'][0] if match['replacements'] else 'No suggestion'
                    error_message = f"{match['message']}\nSuggested correction: {suggestion}"
                    issue_key = (issue_text, error_message)

                    # Annotate each distinct issue only once across the document.
                    if issue_key not in added_notes and highlight_text(page, words, issue_text, error_message):
                        added_notes.add(issue_key)

            return doc.write()
        finally:
            # Always release the document, even if annotation fails midway.
            doc.close()

    except Exception as e:
        print(f"An error occurred while annotating the PDF: {str(e)}")
        traceback.print_exc()
        return b""
298
+
299
+ # ------------------------------
300
+ # Main Analysis Function
301
+ # ------------------------------
302
+
303
def analyze_pdf(file) -> Tuple[Dict[str, Any], bytes]:
    """Analyze the uploaded PDF and return (results, annotated PDF bytes).

    `file` may be a filesystem path or a binary file-like object. On failure
    the first element is a dict with "error" and "traceback" keys and the
    second element is None.
    """
    try:
        # Bug fix: a file-like object can only be read once, but the original
        # code handed the same stream to several consumers (text extraction
        # and annotation), so every reader after the first saw an exhausted
        # stream. Buffer the bytes up front and give each consumer its own.
        if isinstance(file, str):
            text_source = file
            annotation_source = file
        else:
            pdf_bytes = file.read()
            text_source = io.BytesIO(pdf_bytes)
            annotation_source = io.BytesIO(pdf_bytes)

        # (The original also extracted per-page text here, but never used it.)
        full_text = extract_pdf_text(text_source)
        full_text = label_authors(full_text)

        results = {
            "metadata": check_metadata(full_text),
            "disclosures": check_disclosures(full_text),
            "figures_and_tables": check_figures_and_tables(full_text),
            "figure_order": check_figure_order(full_text),
            "references": check_references(full_text),
            "reference_order": check_reference_order(full_text),
            "reference_style": check_reference_style(full_text),
            "structure": check_structure(full_text),
            "language": check_language(full_text),
        }

        # Only produce an annotated PDF when there is something to mark up.
        inconsistent_refs = results["reference_style"].get("inconsistent_refs", [])
        language_matches = results["language"].get("language_issues", {}).get("matches", [])

        if inconsistent_refs or language_matches:
            annotated_pdf_bytes = highlight_issues_in_pdf(annotation_source, inconsistent_refs, language_matches)
        else:
            annotated_pdf_bytes = None

        return results, annotated_pdf_bytes

    except Exception as e:
        error_message = {
            "error": str(e),
            "traceback": traceback.format_exc(),
        }
        return error_message, None
353
+
354
+ # ------------------------------
355
+ # Gradio Interface
356
+ # ------------------------------
357
 
358
def process_upload(file):
    """Gradio handler: analyze uploaded PDF bytes.

    Returns (analysis results as a JSON string, path to the annotated PDF
    or None).
    """
    if file is None:
        return json.dumps({"error": "No file uploaded"}, indent=2), None

    temp_input_path = None
    try:
        # Persist the uploaded bytes so the analyzer can work from a path.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_input:
            temp_input.write(file)
            temp_input_path = temp_input.name

        results, annotated_pdf = analyze_pdf(temp_input_path)
        results_json = json.dumps(results, indent=2)

        annotated_path = None
        if annotated_pdf:
            # Write the annotated PDF to its own temp file for Gradio to serve.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
                tmp_file.write(annotated_pdf)
                annotated_path = tmp_file.name

        return results_json, annotated_path

    except Exception as e:
        error_message = json.dumps({
            "error": str(e),
            "traceback": traceback.format_exc()
        }, indent=2)
        return error_message, None
    finally:
        # Bug fix: the original only unlinked the temp input on the success
        # path, leaking a file whenever analysis raised.
        if temp_input_path and os.path.exists(temp_input_path):
            os.unlink(temp_input_path)
392
+
 
393
 
394
def create_interface():
    """Build and return the Gradio Blocks UI for the PDF analyzer."""
    with gr.Blocks(title="PDF Analyzer") as demo:
        gr.Markdown("# PDF Analyzer")
        gr.Markdown("Upload a PDF document to analyze its structure, references, language, and more.")

        with gr.Row():
            pdf_upload = gr.File(
                label="Upload PDF",
                file_types=[".pdf"],
                type="binary",  # hand the click handler raw bytes
            )

        with gr.Row():
            run_button = gr.Button("Analyze PDF")

        with gr.Row():
            analysis_json = gr.JSON(label="Analysis Results", show_label=True)

        with gr.Row():
            annotated_file = gr.File(label="Annotated PDF", show_label=True)

        run_button.click(
            fn=process_upload,
            inputs=[pdf_upload],
            outputs=[analysis_json, annotated_file],
        )

    return demo
428
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
429
# Script entry point: build the UI and start the Gradio server.
if __name__ == "__main__":
    interface = create_interface()
    interface.launch(
        share=True,  # Set to False in production
        # server_name="0.0.0.0",
        server_port=None  # NOTE(review): presumably lets Gradio pick its default port — confirm
    )