samyak152002 committed
Commit feab938 · verified · 1 Parent(s): 567c35c

Create app.py

Files changed (1)
  1. app.py +378 -0
app.py ADDED
@@ -0,0 +1,378 @@
+ import streamlit as st
+ import re
+ import fitz  # PyMuPDF
+ from pdfminer.high_level import extract_text
+ from pdfminer.layout import LAParams
+ import language_tool_python
+ from typing import List, Dict, Any, Tuple
+ from collections import Counter
+ import json
+ import traceback
+ import io
+ import tempfile
+ import os
+
+ # ------------------------------
+ # Analysis Functions
+ # ------------------------------
+
+ def extract_pdf_text_by_page(file) -> List[str]:
+     """Extracts text from a PDF file, page by page, using PyMuPDF."""
+     file.seek(0)
+     with fitz.open(stream=file.read(), filetype="pdf") as doc:
+         return [page.get_text("text") for page in doc]
+
+ def extract_pdf_text(file) -> str:
+     """Extracts text from a PDF file using pdfminer."""
+     file.seek(0)
+     return extract_text(file, laparams=LAParams())
+
+ def check_text_presence(full_text: str, search_terms: List[str]) -> Dict[str, bool]:
+     """Checks for the presence of required terms in the text."""
+     return {term: term.lower() in full_text.lower() for term in search_terms}
+
+ def label_authors(full_text: str) -> str:
+     """Label authors in the text with 'Authors:' if not already labeled."""
+     author_line_regex = r"^(?:.*\n)(.*?)(?:\n\n)"
+     match = re.search(author_line_regex, full_text, re.MULTILINE)
+     if match:
+         authors = match.group(1).strip()
+         return full_text.replace(authors, f"Authors: {authors}")
+     return full_text
+
+ def check_metadata(full_text: str) -> Dict[str, Any]:
+     """Check for metadata elements."""
+     return {
+         "author_email": bool(re.search(r'\b[\w.-]+?@\w+?\.\w+?\b', full_text)),
+         "list_of_authors": bool(re.search(r'Authors?:', full_text, re.IGNORECASE)),
+         "keywords_list": bool(re.search(r'Keywords?:', full_text, re.IGNORECASE)),
+         "word_count": len(full_text.split()) or "Missing"
+     }
+
+ def check_disclosures(full_text: str) -> Dict[str, bool]:
+     """Check for disclosure statements."""
+     search_terms = [
+         "author contributions statement",
+         "conflict of interest statement",
+         "ethics statement",
+         "funding statement",
+         "data access statement"
+     ]
+     return check_text_presence(full_text, search_terms)
+
+ def check_figures_and_tables(full_text: str) -> Dict[str, bool]:
+     """Check for figures and tables."""
+     return {
+         "figures_with_citations": bool(re.search(r'Figure \d+.*?citation', full_text, re.IGNORECASE)),
+         "figures_legends": bool(re.search(r'Figure \d+.*?legend', full_text, re.IGNORECASE)),
+         "tables_legends": bool(re.search(r'Table \d+.*?legend', full_text, re.IGNORECASE))
+     }
+
+ def check_references(full_text: str) -> Dict[str, Any]:
+     """Check for references."""
+     return {
+         "old_references": bool(re.search(r'\b19[0-9]{2}\b', full_text)),
+         "citations_in_abstract": bool(re.search(r'\b(citation|reference)\b', full_text[:1000], re.IGNORECASE)),
+         "reference_count": len(re.findall(r'\[.*?\]', full_text)),
+         "self_citations": bool(re.search(r'Self-citation', full_text, re.IGNORECASE))
+     }
+
+ def check_structure(full_text: str) -> Dict[str, bool]:
+     """Check document structure."""
+     return {
+         "imrad_structure": all(section in full_text for section in ["Introduction", "Methods", "Results", "Discussion"]),
+         "abstract_structure": "structured abstract" in full_text.lower()
+     }
+
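+ # Note: language_tool_python runs a local LanguageTool server; the first call
+ # downloads the LanguageTool package and expects a Java runtime to be available.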
+ def check_language_issues(full_text: str) -> Dict[str, Any]:
+     """Check for issues with capitalization, hyphenation, punctuation, spacing, etc."""
+     language_tool = language_tool_python.LanguageTool('en-US')
+     matches = language_tool.check(full_text)
+     word_count = len(full_text.split())
+     issues_count = len(matches)
+     issues_per_1000 = (issues_count / word_count) * 1000 if word_count else 0
+
+     serializable_matches = [
+         {
+             "message": match.message,
+             "replacements": match.replacements,
+             "offset": match.offset,
+             "errorLength": match.errorLength,
+             "category": match.category,
+             "ruleIssueType": match.ruleIssueType,
+             "sentence": match.sentence
+         }
+         for match in matches
+     ]
+     # Shut down the LanguageTool server process now that all matches are captured.
+     language_tool.close()
+
+     return {
+         "issues_count": issues_count,
+         "issues_per_1000": issues_per_1000,
+         "failed": issues_per_1000 > 20,
+         "matches": serializable_matches
+     }
+
+ def check_language(full_text: str) -> Dict[str, Any]:
+     """Check language quality."""
+     return {
+         "plain_language": bool(re.search(r'plain language summary', full_text, re.IGNORECASE)),
+         "readability_issues": False,  # Placeholder for future implementation
+         "language_issues": check_language_issues(full_text)
+     }
+
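+ # The figure checks below treat callouts such as "Figure 3", "Fig. 3", and "Fig 3"
+ # as equivalent references and capture only the figure number.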
+ def check_figure_order(full_text: str) -> Dict[str, Any]:
+     """Check if figures are referred to in sequential order."""
+     figure_pattern = r'(?:Fig(?:ure)?\.?|Figure)\s*(\d+)'
+     figure_references = re.findall(figure_pattern, full_text, re.IGNORECASE)
+     figure_numbers = sorted(set(int(num) for num in figure_references))
+
+     is_sequential = all(a + 1 == b for a, b in zip(figure_numbers, figure_numbers[1:]))
+
+     if figure_numbers:
+         expected_figures = set(range(1, max(figure_numbers) + 1))
+         missing_figures = list(expected_figures - set(figure_numbers))
+     else:
+         missing_figures = None
+
+     duplicates = [num for num, count in Counter(figure_references).items() if count > 1]
+     duplicate_numbers = [int(num) for num in duplicates]
+     not_mentioned = list(set(figure_references) - set(duplicates))
+
+     return {
+         "sequential_order": is_sequential,
+         "figure_count": len(figure_numbers),
+         "missing_figures": missing_figures,
+         "figure_order": figure_numbers,
+         "duplicate_references": duplicates,
+         "not_mentioned": not_mentioned
+     }
+
+ def check_reference_order(full_text: str) -> Dict[str, Any]:
+     """Check if references in the main body text are in order."""
+     reference_pattern = r'\[(\d+)\]'
+     references = re.findall(reference_pattern, full_text)
+     ref_numbers = [int(ref) for ref in references]
+
+     max_ref = 0
+     out_of_order = []
+     for i, ref in enumerate(ref_numbers):
+         if ref > max_ref + 1:
+             out_of_order.append((i + 1, ref))
+         max_ref = max(max_ref, ref)
+
+     all_refs = set(range(1, max_ref + 1))
+     used_refs = set(ref_numbers)
+     missing_refs = list(all_refs - used_refs)
+
+     return {
+         "max_reference": max_ref,
+         "out_of_order": out_of_order,
+         "missing_references": missing_refs,
+         "is_ordered": len(out_of_order) == 0 and len(missing_refs) == 0
+     }
+
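+ # The style patterns below are first-character heuristics; a reference entry that
+ # matches none of them is recorded as "Unknown" and flagged as inconsistent.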
+ def check_reference_style(full_text: str) -> Dict[str, Any]:
+     """Check the reference style used in the paper and identify inconsistencies."""
+     reference_section_match = re.search(r'References\b([\s\S]*?)(?:\n\S|\Z)', full_text, re.IGNORECASE)
+     if not reference_section_match:
+         return {"style": "Unknown", "reason": "References section not found", "inconsistent_refs": []}
+
+     references_text = reference_section_match.group(1)
+     reference_list = re.split(r'\n(?=\[\d+\]|\d+\.\s|\(\w+,\s*\d{4}\))', references_text)
+     references = [ref.strip() for ref in reference_list if ref.strip()]
+
+     styles = []
+     inconsistent_refs = []
+     patterns = {
+         "IEEE": r'^\[\d+\]',
+         "Harvard": r'^[A-Z][a-z]+,?\s[A-Z]\.\s\(?\d{4}\)?',
+         "APA": r'^[A-Z][a-z]+,?\s[A-Z]\.\s\(?\d{4}\)?',
+         "MLA": r'^[A-Z][a-z]+,\s[A-Z][a-z]+\.',
+         "Vancouver": r'^\d+\.\s',
+         "Chicago": r'^\d+\s[A-Z][a-z]+\s[A-Z]',
+     }
+
+     for i, ref in enumerate(references, 1):
+         matched = False
+         for style, pattern in patterns.items():
+             if re.match(pattern, ref):
+                 styles.append(style)
+                 matched = True
+                 break
+         if not matched:
+             styles.append("Unknown")
+             inconsistent_refs.append((i, ref, "Unknown"))
+
+     if not styles:
+         return {"style": "Unknown", "reason": "No references found", "inconsistent_refs": []}
+
+     style_counts = Counter(styles)
+     majority_style, majority_count = style_counts.most_common(1)[0]
+
+     for i, style in enumerate(styles, 1):
+         if style != majority_style and style != "Unknown":
+             inconsistent_refs.append((i, references[i - 1], style))
+
+     consistency = majority_count / len(styles)
+
+     return {
+         "majority_style": majority_style,
+         "inconsistent_refs": inconsistent_refs,
+         "consistency": consistency
+     }
+
+ # ------------------------------
+ # Annotation Functions
+ # ------------------------------
+
+ def highlight_text(page, words, text, annotation):
+     """Highlight text and add annotation."""
+     text_instances = find_text_instances(words, text)
+     highlighted = False
+     for inst in text_instances:
+         highlight = page.add_highlight_annot(inst)
+         highlight.update()
+         # Anchor the note annotation at the top-left corner of the highlighted rectangle.
+         comment = page.add_text_annot(inst[:2], annotation)
+         comment.update()
+         highlighted = True
+     return highlighted
+
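+ # PyMuPDF's page.get_text("words") yields one tuple per word:
+ # (x0, y0, x1, y1, "text", block_no, line_no, word_no), so index 4 holds the word text.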
+ def find_text_instances(words, text):
+     """Find all instances of text in words."""
+     text_lower = text.lower()
+     text_words = text_lower.split()
+     instances = []
+     for i in range(len(words) - len(text_words) + 1):
+         if all(words[i + j][4].lower() == text_words[j] for j in range(len(text_words))):
+             inst = fitz.Rect(words[i][:4])
+             for j in range(1, len(text_words)):
+                 inst = inst | fitz.Rect(words[i + j][:4])
+             instances.append(inst)
+     return instances
+
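+ # Every issue is searched for on every page; highlight_text returns False on pages
+ # where the text does not occur, so notes are only added where a match is found.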
+ def highlight_issues_in_pdf(file, inconsistent_refs: List[Tuple[int, str, str]], language_matches: List[Dict[str, Any]]) -> bytes:
+     """Highlight inconsistent references and add notes for language issues in a single PDF."""
+     try:
+         file.seek(0)
+         doc = fitz.open(stream=file.read(), filetype="pdf")
+         added_notes = set()
+
+         for page_number, page in enumerate(doc, start=1):
+             words = page.get_text("words")
+
+             if inconsistent_refs:
+                 for ref_num, ref_text, ref_style in inconsistent_refs:
+                     annotation_text = f"Reference {ref_num}: style '{ref_style}' is inconsistent with the majority reference style."
+                     highlight_text(page, words, ref_text, annotation_text)
+
+             if language_matches:
+                 for match in language_matches:
+                     issue_text = match['sentence']
+                     error_message = f"{match['message']}\nSuggested correction: {match['replacements'][0] if match['replacements'] else 'No suggestion'}"
+                     issue_key = (issue_text, error_message)
+
+                     if issue_key not in added_notes:
+                         if highlight_text(page, words, issue_text, error_message):
+                             added_notes.add(issue_key)
+
+         annotated_pdf_bytes = doc.write()
+         doc.close()
+         return annotated_pdf_bytes
+
+     except Exception as e:
+         print(f"An error occurred while annotating the PDF: {str(e)}")
+         traceback.print_exc()
+         return b""
+
+ # ------------------------------
+ # Main Analysis Function
+ # ------------------------------
+
+ def analyze_pdf(file) -> Tuple[Dict[str, Any], bytes]:
+     """
+     Analyze the uploaded PDF and return analysis results and annotated PDF bytes.
+
+     Returns:
+         Tuple containing:
+         - Analysis results as a dictionary.
+         - Annotated PDF as bytes.
+     """
+     try:
+         # The 'file' is a BytesIO object provided by Streamlit
+         file.seek(0)
+         pages_text = extract_pdf_text_by_page(file)
+         full_text = extract_pdf_text(file)
+         full_text = label_authors(full_text)
+
+         # Perform analyses
+         metadata = check_metadata(full_text)
+         disclosures = check_disclosures(full_text)
+         figures_and_tables = check_figures_and_tables(full_text)
+         figure_order = check_figure_order(full_text)
+         references = check_references(full_text)
+         reference_order = check_reference_order(full_text)
+         reference_style = check_reference_style(full_text)
+         structure = check_structure(full_text)
+         language = check_language(full_text)
+
+         # Compile results
+         results = {
+             "metadata": metadata,
+             "disclosures": disclosures,
+             "figures_and_tables": figures_and_tables,
+             "figure_order": figure_order,
+             "references": references,
+             "reference_order": reference_order,
+             "reference_style": reference_style,
+             "structure": structure,
+             "language": language
+         }
+
+         # Handle annotations
+         inconsistent_refs = reference_style.get("inconsistent_refs", [])
+         language_matches = language.get("language_issues", {}).get("matches", [])
+
+         if inconsistent_refs or language_matches:
+             annotated_pdf_bytes = highlight_issues_in_pdf(file, inconsistent_refs, language_matches)
+         else:
+             annotated_pdf_bytes = None
+
+         return results, annotated_pdf_bytes
+
+     except Exception as e:
+         error_message = {
+             "error": str(e),
+             "traceback": traceback.format_exc()
+         }
+         return error_message, None
+
+ # ------------------------------
+ # Streamlit Interface
+ # ------------------------------
+
+ def main():
+     st.title("PDF Analyzer")
+     st.write("Upload a PDF document to analyze its structure, references, language, and more.")
+
+     uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
+
+     if uploaded_file is not None:
+         with st.spinner("Analyzing PDF..."):
+             results, annotated_pdf = analyze_pdf(uploaded_file)
+
+         st.subheader("Analysis Results")
+         st.json(results)
+
+         if annotated_pdf:
+             st.subheader("Download Annotated PDF")
+             st.download_button(
+                 label="Download Annotated PDF",
+                 data=annotated_pdf,
+                 file_name="annotated.pdf",
+                 mime="application/pdf"
+             )
+         elif "error" in results:
+             st.error("Analysis failed; see the error details above.")
+         else:
+             st.success("No issues found that require annotation, so there is no annotated PDF to download.")
+
+ if __name__ == "__main__":
+     main()