samyak152002 committed
Commit 0c80b43 · verified · 1 Parent(s): 1cac4fd

Create App.py

Files changed (1):
  1. App.py +366 -0
App.py ADDED
import gradio as gr
import re
import fitz  # PyMuPDF
from pdfminer.high_level import extract_text
from pdfminer.layout import LAParams
import language_tool_python
from typing import List, Dict, Any, Tuple
from collections import Counter
import json
import sys
import traceback
import os


class PDFAnalyzer:
    def __init__(self, file_path: str):
        self.file_path = file_path
        self.pages_text = self.extract_pdf_text_by_page()
        self.full_text = self.extract_pdf_text()
        self.language_tool = language_tool_python.LanguageTool('en-US')

    def extract_pdf_text_by_page(self) -> List[str]:
        """Extracts text from a PDF file, page by page, using PyMuPDF."""
        with fitz.open(self.file_path) as doc:
            return [page.get_text("text") for page in doc]

    def extract_pdf_text(self) -> str:
        """Extracts text from a PDF file using pdfminer."""
        return extract_text(self.file_path, laparams=LAParams())

    def check_text_presence(self, search_terms: List[str]) -> Dict[str, bool]:
        """Checks for the presence of required terms in the text."""
        return {term: term in self.full_text for term in search_terms}

    def label_authors(self) -> str:
        """Label authors in the text with 'Authors:' if not already labeled."""
        author_line_regex = r"^(?:.*\n)(.*?)(?:\n\nNetaji Subhas University of Technology, Dwarka, Delhi, 110078, India)"
        match = re.search(author_line_regex, self.full_text, re.MULTILINE)
        if match:
            authors = match.group(1).strip()
            return self.full_text.replace(authors, f"Authors: {authors}")
        return self.full_text

    def check_metadata(self) -> Dict[str, Any]:
        """Check for metadata elements."""
        return {
            "author_email": bool(re.search(r'\b[\w.-]+?@\w+?\.\w+?\b', self.full_text)),
            "list_of_authors": bool(re.search(r'Authors?:', self.full_text, re.IGNORECASE)),
            "keywords_list": bool(re.search(r'Keywords?:', self.full_text, re.IGNORECASE)),
            "word_count": len(self.full_text.split()) or "Missing"
        }

    def check_disclosures(self) -> Dict[str, bool]:
        """Check for disclosure statements."""
        search_terms = [
            "author contributions statement",
            "conflict of interest statement",
            "ethics statement",
            "funding statement",
            "data access statement"
        ]
        return self.check_text_presence(search_terms)

    def check_figures_and_tables(self) -> Dict[str, bool]:
        """Check for figures and tables."""
        return {
            "figures_with_citations": bool(re.search(r'Figure \d+.*?citation', self.full_text, re.IGNORECASE)),
            "figures_legends": bool(re.search(r'Figure \d+.*?legend', self.full_text, re.IGNORECASE)),
            "tables_legends": bool(re.search(r'Table \d+.*?legend', self.full_text, re.IGNORECASE))
        }

    def check_references(self) -> Dict[str, Any]:
        """Check for references."""
        return {
            "old_references": bool(re.search(r'\b19[0-9]{2}\b', self.full_text)),
            "citations_in_abstract": bool(re.search(r'\b(citation|reference)\b', self.full_text[:1000], re.IGNORECASE)),
            "reference_count": len(re.findall(r'\[.*?\]', self.full_text)),
            "self_citations": bool(re.search(r'Self-citation', self.full_text, re.IGNORECASE))
        }

    def check_structure(self) -> Dict[str, bool]:
        """Check document structure."""
        return {
            "imrad_structure": all(section in self.full_text for section in ["Introduction", "Methods", "Results", "Discussion"]),
            "abstract_structure": "structured abstract" in self.full_text.lower()
        }

    def check_language_issues(self) -> Dict[str, Any]:
        """Check for issues with capitalization, hyphenation, punctuation, spacing, etc."""
        matches = self.language_tool.check(self.full_text)
        word_count = len(self.full_text.split())
        issues_count = len(matches)
        # Guard against empty documents to avoid division by zero.
        issues_per_1000 = (issues_count / word_count) * 1000 if word_count else 0

        serializable_matches = [
            {
                "message": match.message,
                "replacements": match.replacements,
                "offset": match.offset,
                "errorLength": match.errorLength,
                "category": match.category,
                "ruleIssueType": match.ruleIssueType,
                "sentence": match.sentence
            }
            for match in matches
        ]

        return {
            "issues_count": issues_count,
            "issues_per_1000": issues_per_1000,
            "failed": issues_per_1000 > 20,
            "matches": serializable_matches
        }

    def check_language(self) -> Dict[str, Any]:
        """Check language quality."""
        return {
            "plain_language": bool(re.search(r'plain language summary', self.full_text, re.IGNORECASE)),
            "readability_issues": False,  # Placeholder for future implementation
            "language_issues": self.check_language_issues()
        }

    def check_figure_order(self) -> Dict[str, Any]:
        """Check if figures are referred to in sequential order."""
        figure_pattern = r'(?:Fig(?:ure)?\.?|Figure)\s*(\d+)'
        figure_references = re.findall(figure_pattern, self.full_text, re.IGNORECASE)
        figure_numbers = sorted(set(int(num) for num in figure_references))

        is_sequential = all(a + 1 == b for a, b in zip(figure_numbers, figure_numbers[1:]))

        if figure_numbers:
            expected_figures = set(range(1, max(figure_numbers) + 1))
            missing_figures = list(expected_figures - set(figure_numbers))
        else:
            missing_figures = None

        # Figure numbers cited more than once; "not_mentioned" collects those cited exactly once.
        duplicates = [num for num, count in Counter(figure_references).items() if count > 1]
        duplicate_numbers = [int(num) for num in duplicates]
        not_mentioned = list(set(figure_references) - set(duplicates))

        return {
            "sequential_order": is_sequential,
            "figure_count": len(figure_numbers),
            "missing_figures": missing_figures,
            "figure_order": figure_numbers,
            "duplicate_references": duplicate_numbers,
            "not_mentioned": not_mentioned
        }

    def check_reference_order(self) -> Dict[str, Any]:
        """Check if references in the main body text are in order."""
        reference_pattern = r'\[(\d+)\]'
        references = re.findall(reference_pattern, self.full_text)
        ref_numbers = [int(ref) for ref in references]

        # A reference is "out of order" if it jumps past the highest number seen so far by more than one.
        max_ref = 0
        out_of_order = []
        for i, ref in enumerate(ref_numbers):
            if ref > max_ref + 1:
                out_of_order.append((i + 1, ref))
            max_ref = max(max_ref, ref)

        all_refs = set(range(1, max_ref + 1))
        used_refs = set(ref_numbers)
        missing_refs = list(all_refs - used_refs)

        return {
            "max_reference": max_ref,
            "out_of_order": out_of_order,
            "missing_references": missing_refs,
            "is_ordered": len(out_of_order) == 0 and len(missing_refs) == 0
        }

    def check_reference_style(self) -> Dict[str, Any]:
        """Check the reference style used in the paper and identify inconsistencies."""
        reference_section_match = re.search(r'References\b([\s\S]*?)(?:\n\S|\Z)', self.full_text, re.IGNORECASE)
        if not reference_section_match:
            return {"style": "Unknown", "reason": "References section not found", "inconsistent_refs": []}

        references_text = reference_section_match.group(1)
        reference_list = re.split(r'\n(?=\[\d+\]|\d+\.\s|\(\w+,\s*\d{4}\))', references_text)
        references = [ref.strip() for ref in reference_list if ref.strip()]

        styles = []
        inconsistent_refs = []
        # Note: the Harvard and APA patterns are identical, so entries matching either are reported as Harvard.
        patterns = {
            "IEEE": r'^\[\d+\]',
            "Harvard": r'^[A-Z][a-z]+,?\s[A-Z]\.\s\(?\d{4}\)?',
            "APA": r'^[A-Z][a-z]+,?\s[A-Z]\.\s\(?\d{4}\)?',
            "MLA": r'^[A-Z][a-z]+,\s[A-Z][a-z]+\.',
            "Vancouver": r'^\d+\.\s',
            "Chicago": r'^\d+\s[A-Z][a-z]+\s[A-Z]',
        }

        for i, ref in enumerate(references, 1):
            matched = False
            for style, pattern in patterns.items():
                if re.match(pattern, ref):
                    styles.append(style)
                    matched = True
                    break
            if not matched:
                styles.append("Unknown")
                inconsistent_refs.append((i, ref, "Unknown"))

        if not styles:
            return {"style": "Unknown", "reason": "No references found", "inconsistent_refs": []}

        style_counts = Counter(styles)
        majority_style, majority_count = style_counts.most_common(1)[0]

        for i, style in enumerate(styles, 1):
            if style != majority_style and style != "Unknown":
                inconsistent_refs.append((i, references[i - 1], style))

        consistency = majority_count / len(styles)

        return {
            "majority_style": majority_style,
            "inconsistent_refs": inconsistent_refs,
            "consistency": consistency
        }

    def highlight_issues_in_pdf(self, inconsistent_refs: List[Tuple[int, str, str]], language_matches: List[Dict[str, Any]]) -> str:
        """Highlight inconsistent references and add notes for language issues in a single PDF."""
        try:
            doc = fitz.open(self.file_path)
            added_notes = set()
            # Compute the majority style once instead of re-running the check for every reference.
            majority_style = self.check_reference_style().get('majority_style', 'Unknown') if inconsistent_refs else 'Unknown'

            for page in doc:
                words = page.get_text("words")

                if inconsistent_refs:
                    for ref_num, ref_text, ref_style in inconsistent_refs:
                        self.highlight_text(page, words, ref_text, f"Reference {ref_num}: Inconsistent style ({ref_style}). Should be {majority_style}.")

                if language_matches:
                    for match in language_matches:
                        issue_text = match['sentence']
                        error_message = f"{match['message']}\nSuggested correction: {match['replacements'][0] if match['replacements'] else 'No suggestion'}"
                        issue_key = (issue_text, error_message)

                        # Only annotate each (sentence, message) pair once across the document.
                        if issue_key not in added_notes:
                            if self.highlight_text(page, words, issue_text, error_message):
                                added_notes.add(issue_key)

            annotated_file_path = self.file_path.replace(".pdf", "_annotated_combined.pdf")
            doc.save(annotated_file_path)
            doc.close()

            if os.path.exists(annotated_file_path):
                return annotated_file_path
            else:
                print(f"Error: Annotated PDF was not saved at {annotated_file_path}")
                return ""

        except Exception as e:
            print(f"An error occurred while annotating the PDF: {str(e)}", file=sys.stderr)
            traceback.print_exc()
            return ""

    def highlight_text(self, page, words, text, annotation):
        """Highlight text and add annotation."""
        text_instances = self.find_text_instances(words, text)
        highlighted = False
        for inst in text_instances:
            highlight = page.add_highlight_annot(inst)
            highlight.update()
            # Place the note at the top-left corner (x0, y0) of the highlighted rectangle.
            comment = page.add_text_annot(inst[:2], annotation)
            comment.update()
            highlighted = True
        return highlighted

    def find_text_instances(self, words, text):
        """Find all instances of text in words."""
        # Each entry from page.get_text("words") is (x0, y0, x1, y1, word, block_no, line_no, word_no).
        text_lower = text.lower()
        text_words = text_lower.split()
        instances = []
        for i in range(len(words) - len(text_words) + 1):
            if all(words[i + j][4].lower() == text_words[j] for j in range(len(text_words))):
                # Union of the word rectangles covering the whole matched phrase.
                inst = fitz.Rect(words[i][:4])
                for j in range(1, len(text_words)):
                    inst = inst | fitz.Rect(words[i + j][:4])
                instances.append(inst)
        return instances

    def analyze(self) -> Dict[str, Any]:
        """Perform full analysis of the PDF."""
        self.full_text = self.label_authors()

        results = {
            "metadata": self.check_metadata(),
            "disclosures": self.check_disclosures(),
            "figures_and_tables": self.check_figures_and_tables(),
            "figure_order": self.check_figure_order(),
            "references": self.check_references(),
            "reference_order": self.check_reference_order(),
            "reference_style": self.check_reference_style(),
            "structure": self.check_structure(),
            "language": self.check_language(),
            "annotated_pdf_path": ""
        }

        inconsistent_refs = results.get("reference_style", {}).get("inconsistent_refs", [])
        language_matches = results.get("language", {}).get("language_issues", {}).get("matches", [])

        if inconsistent_refs or language_matches:
            annotated_path = self.highlight_issues_in_pdf(inconsistent_refs, language_matches)
            results["annotated_pdf_path"] = annotated_path

        return results


def analyze_pdf(file):
    try:
        # Save the uploaded file temporarily.
        # gr.File may hand over a path string (newer Gradio) or a file-like object (older versions).
        temp_path = "temp_uploaded.pdf"
        source_path = file if isinstance(file, str) else getattr(file, "name", None)
        if source_path:
            with open(source_path, "rb") as src, open(temp_path, "wb") as dst:
                dst.write(src.read())
        else:
            with open(temp_path, "wb") as dst:
                dst.write(file.read())

        analyzer = PDFAnalyzer(temp_path)
        results = analyzer.analyze()

        # Ensure all keys are present in the results, even if they're empty
        default_results = {
            "annotated_pdf_path": "",
            "metadata": {},
            "disclosures": {},
            "figures_and_tables": {},
            "figure_order": {},
            "references": {},
            "reference_order": {},
            "reference_style": {},
            "structure": {},
            "language": {},
        }

        # Update default_results with actual results
        default_results.update(results)

        return json.dumps(default_results, indent=2, default=str)

    except Exception as e:
        error_message = {
            "error": str(e),
            "traceback": traceback.format_exc()
        }
        return json.dumps(error_message, indent=2)
    finally:
        # Clean up the temporary file
        if os.path.exists(temp_path):
            os.remove(temp_path)


# Create Gradio interface
iface = gr.Interface(
    fn=analyze_pdf,
    inputs=gr.File(label="Upload PDF"),
    outputs=gr.JSON(label="Analysis Results"),
    title="PDF Analyzer",
    description="Upload a PDF document to analyze its structure, references, language, and more.",
)

# Launch the app
if __name__ == "__main__":
    iface.launch()
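
For a quick local check outside the Gradio UI, a minimal sketch is shown below; the "sample.pdf" path is an assumed placeholder for illustration, not part of the app.

    # Minimal local check, bypassing the Gradio interface.
    # "sample.pdf" is an assumed placeholder path for illustration.
    import json
    from App import PDFAnalyzer

    analyzer = PDFAnalyzer("sample.pdf")
    report = analyzer.analyze()
    print(json.dumps(report, indent=2, default=str))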