samyak152002 committed on
Commit
52dcb43
·
verified ·
1 Parent(s): 3410c51

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +198 -459
app.py CHANGED
@@ -4,7 +4,7 @@ import fitz # PyMuPDF
4
  from pdfminer.high_level import extract_text
5
  from pdfminer.layout import LAParams
6
  import language_tool_python
7
- from typing import List, Dict, Any, Tuple, Optional
8
  from collections import Counter
9
  import json
10
  import traceback
@@ -12,32 +12,10 @@ import io
12
  import tempfile
13
  import os
14
  import base64
15
- from dataclasses import dataclass
16
 
17
  # Set JAVA_HOME environment variable
18
  os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'
19
 
20
- # ------------------------------
21
- # Data Classes
22
- # ------------------------------
23
-
24
@dataclass
class Highlight:
    """A rectangular region on a PDF page plus the note attached to it."""

    page: int  # page index the region belongs to
    rect: Tuple[float, float, float, float]  # filled as (x0, y0, x1, y1) by find_text_location
    color: str  # interpolated as a CSS background-color by the overlay generator
    message: str  # text shown for the highlight
    category: str  # free-form grouping label
31
-
32
@dataclass
class AnalysisResult:
    """Container for everything an analysis run reports."""

    highlights: List[Highlight]  # regions to overlay on the document
    messages: List[Dict[str, Any]]  # display code reads "category", "title", "description"
    summary: Dict[str, Any]  # label -> value pairs rendered as sidebar metrics
37
-
38
- # ------------------------------
39
- # PDF Processing Functions
40
- # ------------------------------
41
  # ------------------------------
42
  # Analysis Functions
43
  # ------------------------------
@@ -63,451 +41,145 @@ def check_text_presence(full_text: str, search_terms: List[str]) -> Dict[str, bo
63
  """Checks for the presence of required terms in the text."""
64
  return {term: term.lower() in full_text.lower() for term in search_terms}
65
 
66
def label_authors(full_text: str) -> str:
    """Prefix the presumed author line with 'Authors:' if it is not labelled.

    The author line is taken to be the first line that follows another line
    and is itself followed by a blank line.

    Args:
        full_text: Extracted document text.

    Returns:
        The text with ``Authors: `` inserted before that line, or the text
        unchanged when no such line exists, the line is empty, or it already
        starts with ``Author:`` / ``Authors:``.
    """
    match = re.search(r"^(?:.*\n)(.*?)(?:\n\n)", full_text, re.MULTILINE)
    if match is None:
        return full_text

    captured = match.group(1)
    authors = captured.strip()
    # An empty capture must be skipped: str.replace("", x) would inject the
    # label between every character of the document.
    if not authors or re.match(r"authors?:", authors, re.IGNORECASE):
        return full_text

    # Insert at the matched position only, instead of replacing every
    # occurrence of the author string throughout the text.
    insert_at = match.start(1) + captured.index(authors)
    return f"{full_text[:insert_at]}Authors: {full_text[insert_at:]}"
74
-
75
def check_metadata(full_text: str) -> Dict[str, Any]:
    """Check for metadata elements.

    Args:
        full_text: Extracted document text.

    Returns:
        A dict with boolean flags ``author_email``, ``list_of_authors`` and
        ``keywords_list``, plus ``word_count`` (an int, or the string
        ``"Missing"`` when the text contains no words).
    """
    # The domain part allows hyphens and several dot-separated labels, so
    # addresses such as name@my-uni.ac.uk are recognised.
    email_pattern = r'\b[\w.-]+@[\w-]+(?:\.[\w-]+)+\b'
    return {
        "author_email": bool(re.search(email_pattern, full_text)),
        "list_of_authors": bool(re.search(r'Authors?:', full_text, re.IGNORECASE)),
        "keywords_list": bool(re.search(r'Keywords?:', full_text, re.IGNORECASE)),
        "word_count": len(full_text.split()) or "Missing",
    }
83
-
84
def check_disclosures(full_text: str) -> Dict[str, bool]:
    """Check for disclosure statements.

    Returns whatever ``check_text_presence`` reports for the five required
    statement names listed below.
    """
    required_statements = [
        "author contributions statement",
        "conflict of interest statement",
        "ethics statement",
        "funding statement",
        "data access statement",
    ]
    return check_text_presence(full_text, required_statements)
94
-
95
def check_figures_and_tables(full_text: str) -> Dict[str, bool]:
    """Check for figures and tables.

    Each flag is True when "Figure N" / "Table N" is followed, on the same
    line, by the word "citation" or "legend" (case-insensitive).
    """
    def found(pattern: str) -> bool:
        return bool(re.search(pattern, full_text, re.IGNORECASE))

    return {
        "figures_with_citations": found(r'Figure \d+.*?citation'),
        "figures_legends": found(r'Figure \d+.*?legend'),
        "tables_legends": found(r'Table \d+.*?legend'),
    }
102
-
103
def check_references(full_text: str) -> Dict[str, Any]:
    """Check for references.

    Returns:
        ``old_references``: a four-digit year 19xx appears anywhere.
        ``citations_in_abstract``: the word "citation" or "reference" occurs
        in the first 1000 characters.
        ``reference_count``: number of ``[...]`` bracket groups in the text.
        ``self_citations``: the literal phrase "Self-citation" occurs.
    """
    opening = full_text[:1000]
    return {
        "old_references": bool(re.search(r'\b19[0-9]{2}\b', full_text)),
        "citations_in_abstract": bool(re.search(r'\b(citation|reference)\b', opening, re.IGNORECASE)),
        "reference_count": len(re.findall(r'\[.*?\]', full_text)),
        "self_citations": bool(re.search(r'Self-citation', full_text, re.IGNORECASE)),
    }
111
-
112
def check_structure(full_text: str) -> Dict[str, bool]:
    """Check document structure.

    Returns:
        ``imrad_structure``: all of Introduction, Methods, Results and
        Discussion occur in the text.
        ``abstract_structure``: the phrase "structured abstract" occurs.

    Both checks ignore case, so upper-case headings such as "INTRODUCTION"
    are recognised.
    """
    lowered = full_text.lower()
    imrad_sections = ("introduction", "methods", "results", "discussion")
    return {
        "imrad_structure": all(section in lowered for section in imrad_sections),
        "abstract_structure": "structured abstract" in lowered,
    }
118
 
119
def check_language_issues(full_text: str) -> Dict[str, Any]:
    """Check for issues with capitalization, hyphenation, punctuation, spacing, etc.

    Runs LanguageTool ('en-US') over the text.

    Returns:
        ``issues_count``: number of matches reported.
        ``issues_per_1000``: matches per 1000 whitespace-separated words
        (0 when the text has no words).
        ``failed``: True when ``issues_per_1000`` exceeds 20.
        ``matches``: one plain dict per match, built from the match
        attributes copied below.
    """
    language_tool = language_tool_python.LanguageTool('en-US')
    try:
        matches = language_tool.check(full_text)
    finally:
        # Release the LanguageTool instance even when the check fails.
        language_tool.close()

    word_count = len(full_text.split())
    issues_count = len(matches)
    issues_per_1000 = (issues_count / word_count) * 1000 if word_count else 0

    serializable_matches = [
        {
            "message": match.message,
            "replacements": match.replacements,
            "offset": match.offset,
            "errorLength": match.errorLength,
            "category": match.category,
            "ruleIssueType": match.ruleIssueType,
            "sentence": match.sentence,
        }
        for match in matches
    ]

    return {
        "issues_count": issues_count,
        "issues_per_1000": issues_per_1000,
        "failed": issues_per_1000 > 20,
        "matches": serializable_matches,
    }
146
-
147
def check_language(full_text: str) -> Dict[str, Any]:
    """Check language quality.

    Returns:
        ``plain_language``: the phrase "plain language summary" occurs
        (case-insensitive).
        ``readability_issues``: always False for now (placeholder).
        ``language_issues``: the result of ``check_language_issues``.
    """
    has_plain_summary = re.search(r'plain language summary', full_text, re.IGNORECASE)
    return {
        "plain_language": bool(has_plain_summary),
        "readability_issues": False,  # Placeholder for future implementation
        "language_issues": check_language_issues(full_text),
    }
154
-
155
def check_figure_order(full_text: str) -> Dict[str, Any]:
    """Check if figures are referred to in sequential order.

    Mentions are found as "Fig", "Fig." or "Figure" (any case) followed by a
    number.

    Returns:
        ``sequential_order``: the distinct figure numbers, sorted, increase
        by exactly one each step (True when fewer than two figures).
        ``figure_count``: number of distinct figure numbers.
        ``missing_figures``: sorted numbers in 1..max never mentioned, or
        None when no figure is mentioned at all.
        ``figure_order``: sorted distinct figure numbers.
        ``duplicate_references``: number strings mentioned more than once,
        in order of first mention.
        ``not_mentioned``: number strings mentioned exactly once, in order of
        first mention (the key name is kept for callers).
    """
    # "Fig(?:ure)?\.?" already covers "Figure"; no second alternative needed.
    figure_references = re.findall(r'Fig(?:ure)?\.?\s*(\d+)', full_text, re.IGNORECASE)
    figure_numbers = sorted({int(num) for num in figure_references})

    is_sequential = all(a + 1 == b for a, b in zip(figure_numbers, figure_numbers[1:]))

    if figure_numbers:
        present = set(figure_numbers)
        missing_figures = [n for n in range(1, figure_numbers[-1] + 1) if n not in present]
    else:
        missing_figures = None

    # Counter keeps first-seen order, so both lists are deterministic.
    mention_counts = Counter(figure_references)
    duplicates = [num for num, count in mention_counts.items() if count > 1]
    not_mentioned = [num for num, count in mention_counts.items() if count == 1]

    return {
        "sequential_order": is_sequential,
        "figure_count": len(figure_numbers),
        "missing_figures": missing_figures,
        "figure_order": figure_numbers,
        "duplicate_references": duplicates,
        "not_mentioned": not_mentioned,
    }
181
-
182
def check_reference_order(full_text: str) -> Dict[str, Any]:
    """Check if references in the main body text are in order.

    Only single-number citations of the form ``[N]`` are considered.

    Returns:
        ``max_reference``: highest number cited (0 when none).
        ``out_of_order``: ``(position, number)`` pairs, position 1-based
        among the citations, for each citation that jumps more than one
        above the highest number seen so far.
        ``missing_references``: sorted numbers in 1..max never cited.
        ``is_ordered``: True when both lists above are empty.
    """
    ref_numbers = [int(ref) for ref in re.findall(r'\[(\d+)\]', full_text)]

    max_ref = 0
    out_of_order = []
    for position, ref in enumerate(ref_numbers, start=1):
        if ref > max_ref + 1:
            out_of_order.append((position, ref))
        max_ref = max(max_ref, ref)

    # Sorted so the result does not depend on set iteration order.
    missing_refs = sorted(set(range(1, max_ref + 1)) - set(ref_numbers))

    return {
        "max_reference": max_ref,
        "out_of_order": out_of_order,
        "missing_references": missing_refs,
        "is_ordered": not out_of_order and not missing_refs,
    }
205
-
206
def check_reference_style(full_text: str) -> Dict[str, Any]:
    """Check the reference style used in the paper and identify inconsistencies.

    The reference list is taken to be everything after the last
    "References" heading line (or, failing that, the last occurrence of the
    word). Each entry is classified by the first matching prefix pattern.

    Returns:
        On success: ``majority_style``, ``inconsistent_refs`` (a list of
        ``(index, text, style)`` tuples, index 1-based, for entries that are
        unclassified or differ from the majority) and ``consistency``
        (fraction of entries in the majority style).
        On failure: the same keys with ``"Unknown"`` / ``[]`` / ``0.0``,
        plus ``style`` and ``reason`` describing what was missing.
    """
    def _unknown(reason: str) -> Dict[str, Any]:
        return {
            "style": "Unknown",
            "reason": reason,
            "majority_style": "Unknown",
            "inconsistent_refs": [],
            "consistency": 0.0,
        }

    # Prefer a heading on its own line; the previous lazy pattern stopped at
    # the first newline followed by text and so captured an empty section.
    headings = list(re.finditer(r'^[ \t]*References[ \t]*:?[ \t]*$', full_text,
                                re.IGNORECASE | re.MULTILINE))
    if not headings:
        headings = list(re.finditer(r'References\b', full_text, re.IGNORECASE))
    if not headings:
        return _unknown("References section not found")

    references_text = full_text[headings[-1].end():]
    reference_list = re.split(r'\n(?=\[\d+\]|\d+\.\s|\(\w+,\s*\d{4}\))', references_text)
    references = [ref.strip() for ref in reference_list if ref.strip()]
    if not references:
        return _unknown("No references found")

    # First match wins. "APA" repeats the "Harvard" pattern, so it can never
    # be reported; it is kept so the set of style names stays unchanged.
    patterns = {
        "IEEE": r'^\[\d+\]',
        "Harvard": r'^[A-Z][a-z]+,?\s[A-Z]\.\s\(?\d{4}\)?',
        "APA": r'^[A-Z][a-z]+,?\s[A-Z]\.\s\(?\d{4}\)?',
        "MLA": r'^[A-Z][a-z]+,\s[A-Z][a-z]+\.',
        "Vancouver": r'^\d+\.\s',
        "Chicago": r'^\d+\s[A-Z][a-z]+\s[A-Z]',
    }

    def _classify(ref: str) -> str:
        for style, pattern in patterns.items():
            if re.match(pattern, ref):
                return style
        return "Unknown"

    styles = [_classify(ref) for ref in references]
    majority_style, majority_count = Counter(styles).most_common(1)[0]

    # Unclassified entries are always flagged, even when they are the majority.
    inconsistent_refs = [
        (index, ref, style)
        for index, (ref, style) in enumerate(zip(references, styles), start=1)
        if style == "Unknown" or style != majority_style
    ]

    return {
        "majority_style": majority_style,
        "inconsistent_refs": inconsistent_refs,
        "consistency": majority_count / len(styles),
    }
255
-
256
- # ------------------------------
257
- # Annotation Functions
258
- # ------------------------------
259
-
260
def highlight_text(page, words, text, annotation):
    """Highlight text and add annotation.

    Args:
        page: Page object providing ``add_highlight_annot`` and
            ``add_text_annot``.
        words: Word tuples for that page, as passed to ``find_text_instances``.
        text: Phrase to locate.
        annotation: Note text attached to each occurrence.

    Returns:
        True when at least one occurrence was highlighted, otherwise False.
    """
    instances = find_text_instances(words, text)
    for inst in instances:
        page.add_highlight_annot(inst).update()
        # The note is anchored at the first two coordinates of the rectangle.
        page.add_text_annot(inst[:2], annotation).update()
    return bool(instances)
271
-
272
def find_text_instances(words, text):
    """Find all instances of text in words.

    Args:
        words: Sequence of word tuples whose first four items are rectangle
            coordinates and whose fifth item is the word string.
        text: Phrase to search for; compared word by word, ignoring case.

    Returns:
        A list of ``fitz.Rect`` objects, one per occurrence, each the union
        of the rectangles of the matched words. Empty when ``text`` has no
        words or nothing matches.
    """
    text_words = text.lower().split()
    # Without this guard an empty phrase "matches" at every index and then
    # indexes one past the end of ``words``.
    if not text_words:
        return []

    span = len(text_words)
    instances = []
    for start in range(len(words) - span + 1):
        window = words[start:start + span]
        if all(word[4].lower() == target for word, target in zip(window, text_words)):
            inst = fitz.Rect(window[0][:4])
            for word in window[1:]:
                inst = inst | fitz.Rect(word[:4])
            instances.append(inst)
    return instances
284
-
285
def highlight_issues_in_pdf(file, inconsistent_refs: List[Tuple[int, str, str]], language_matches: List[Dict[str, Any]]) -> bytes:
    """Highlight inconsistent references and add notes for language issues in a single PDF.

    Args:
        file: A path string, or an object with ``read()`` yielding PDF bytes.
        inconsistent_refs: ``(number, text, style)`` tuples to flag.
        language_matches: Dicts with ``sentence``, ``message`` and
            ``replacements`` keys.

    Returns:
        The annotated PDF as bytes, or ``b""`` if anything fails (the error
        and traceback are printed).
    """
    doc = None
    try:
        if isinstance(file, str):
            doc = fitz.open(file)
        else:
            doc = fitz.open(stream=file.read(), filetype="pdf")

        # Each (sentence, note) pair is annotated on at most one page.
        added_notes = set()

        for page in doc:
            words = page.get_text("words")

            for ref_num, ref_text, ref_style in inconsistent_refs or []:
                # Only the offending style is known here, so the note does
                # not claim a target style.
                annotation_text = (
                    f"Reference {ref_num}: Inconsistent style ({ref_style}). "
                    "Should be consolidated to the majority reference style."
                )
                highlight_text(page, words, ref_text, annotation_text)

            for match in language_matches or []:
                issue_text = match['sentence']
                error_message = f"{match['message']}\nSuggested correction: {match['replacements'][0] if match['replacements'] else 'No suggestion'}"
                issue_key = (issue_text, error_message)

                if issue_key not in added_notes:
                    if highlight_text(page, words, issue_text, error_message):
                        added_notes.add(issue_key)

        return doc.write()

    except Exception as e:
        print(f"An error occurred while annotating the PDF: {str(e)}")
        traceback.print_exc()
        return b""

    finally:
        # Close the document on every path, including failures.
        if doc is not None:
            doc.close()
321
-
322
- # ------------------------------
323
- # Main Analysis Function
324
- # ------------------------------
325
 
326
def analyze_pdf(file) -> Tuple[Dict[str, Any], bytes]:
    """
    Analyze the uploaded PDF and return analysis results and annotated PDF bytes.

    Returns:
        ``(results, annotated_pdf_bytes)``. ``annotated_pdf_bytes`` is None
        when nothing needed annotating. If any step raises, the first item
        is a dict with ``error`` and ``traceback`` keys and the second is
        None.
    """
    try:
        full_text = label_authors(extract_pdf_text(file))

        # These two are needed again below for annotation.
        reference_style = check_reference_style(full_text)
        language = check_language(full_text)

        results = {
            "metadata": check_metadata(full_text),
            "disclosures": check_disclosures(full_text),
            "figures_and_tables": check_figures_and_tables(full_text),
            "figure_order": check_figure_order(full_text),
            "references": check_references(full_text),
            "reference_order": check_reference_order(full_text),
            "reference_style": reference_style,
            "structure": check_structure(full_text),
            "language": language,
        }

        inconsistent_refs = reference_style.get("inconsistent_refs", [])
        language_matches = language.get("language_issues", {}).get("matches", [])

        annotated_pdf_bytes = None
        if inconsistent_refs or language_matches:
            # Text extraction may have moved the stream position; rewind so
            # the annotator's file.read() sees the whole PDF.
            if hasattr(file, "seek"):
                file.seek(0)
            annotated_pdf_bytes = highlight_issues_in_pdf(file, inconsistent_refs, language_matches)

        return results, annotated_pdf_bytes

    except Exception as e:
        error_message = {
            "error": str(e),
            "traceback": traceback.format_exc(),
        }
        return error_message, None
376
-
377
 
378
  # ------------------------------
379
- # Highlight Processing Functions
380
  # ------------------------------
381
 
382
def get_word_coordinates(doc: fitz.Document) -> Dict[int, List[Dict[str, Any]]]:
    """Extract word coordinates from each page of the PDF.

    Returns:
        A dict keyed by 0-based page index. Each value is a list with one
        dict per word: ``text`` (item 4 of the word tuple), ``rect`` (a
        ``fitz.Rect`` built from items 0-3) and ``origin`` (the remaining
        items from index 5 on).
    """
    return {
        page_num: [
            {
                "text": word[4],
                "rect": fitz.Rect(word[:4]),
                "origin": word[5:],
            }
            for word in page.get_text("words")
        ]
        for page_num, page in enumerate(doc)
    }
396
 
397
def find_text_location(text: str, word_coordinates: Dict[int, List[Dict[str, Any]]]) -> Optional[Highlight]:
    """Find the location of text in the PDF and return a Highlight object.

    Args:
        text: Phrase to locate; compared word by word, ignoring case.
        word_coordinates: Page index -> list of word dicts with ``text`` and
            ``rect`` entries.

    Returns:
        A yellow ``Highlight`` (category ``"text"``) covering the first
        occurrence of the whole phrase, or None if the phrase is empty or
        not found.
    """
    target_words = text.lower().split()
    if not target_words:
        return None

    span = len(target_words)
    for page_num, words in word_coordinates.items():
        for start in range(len(words) - span + 1):
            window = words[start:start + span]
            # Require the full phrase; one word that merely appears inside
            # ``text`` is not a location for the phrase.
            if any(word["text"].lower() != target for word, target in zip(window, target_words)):
                continue

            rect = window[0]["rect"]
            for word in window[1:]:
                rect = rect | word["rect"]

            return Highlight(
                page=page_num,
                rect=(rect.x0, rect.y0, rect.x1, rect.y1),
                color="yellow",
                message=text,
                category="text",
            )
    return None
418
-
419
- # ------------------------------
420
- # Streamlit Interface
421
- # ------------------------------
422
-
423
def create_sidebar():
    """Create the sidebar with upload and analysis options.

    Returns:
        ``(uploaded_file, options)``: the value of the PDF uploader and a
        dict of three checkbox states keyed ``check_language``,
        ``check_references`` and ``check_structure``.
    """
    st.sidebar.title("PDF Analyzer")
    uploaded_file = st.sidebar.file_uploader("Upload PDF", type=['pdf'])

    with st.sidebar.expander("Analysis Options", expanded=False):
        options = {
            "check_language": st.checkbox("Language Analysis", value=True),
            "check_references": st.checkbox("Reference Analysis", value=True),
            "check_structure": st.checkbox("Structure Analysis", value=True),
        }

    return uploaded_file, options
437
-
438
def display_pdf_viewer(pdf_bytes: bytes, highlights: List[Highlight]):
    """Display the PDF with highlights using a custom viewer.

    The PDF is embedded as a base64 data URI in an iframe; the markup from
    ``generate_highlight_overlays`` is placed in a sibling container and the
    whole fragment is rendered as an 800px-high HTML component.
    """
    b64_pdf = base64.b64encode(pdf_bytes).decode('utf-8')
    overlays = generate_highlight_overlays(highlights)

    html_content = f"""
    <div style="position: relative; width: 100%; height: 800px;">
        <iframe src="data:application/pdf;base64,{b64_pdf}"
                width="100%"
                height="100%"
                style="border: none;">
        </iframe>
        <div id="highlight-container">
            {overlays}
        </div>
    </div>
    <style>
        .highlight {{
            position: absolute;
            opacity: 0.3;
            pointer-events: all;
            cursor: pointer;
            transition: opacity 0.2s;
        }}
        .highlight:hover {{
            opacity: 0.5;
        }}
    </style>
    """

    st.components.v1.html(html_content, height=800)
470
 
471
def generate_highlight_overlays(highlights: List[Highlight]) -> str:
    """Generate HTML for highlight overlays.

    Each highlight becomes an absolutely positioned ``<div class="highlight">``
    sized from its ``rect`` and carrying its message as the tooltip.
    The message and colour are HTML-escaped because they are placed inside
    attribute values and may come from document text.
    """
    import html  # local import: only this function needs it

    overlays = []
    for i, highlight in enumerate(highlights):
        x0, y0, x1, y1 = highlight.rect
        color = html.escape(str(highlight.color), quote=True)
        message = html.escape(str(highlight.message), quote=True)
        overlays.append(f"""
        <div class="highlight"
             style="left: {x0}px;
                    top: {y0}px;
                    width: {x1 - x0}px;
                    height: {y1 - y0}px;
                    background-color: {color};"
             onclick="showMessage({i})"
             title="{message}">
        </div>
        """)
    return "".join(overlays)
487
 
488
def display_analysis_results(results: AnalysisResult):
    """Display analysis results in the sidebar.

    Shows every ``results.summary`` entry as a metric, then one expander per
    message category (messages without a category go under "Other").
    """
    st.sidebar.markdown("## Analysis Results")

    # Display summary statistics
    st.sidebar.markdown("### Summary")
    for key, value in results.summary.items():
        st.sidebar.metric(key, value)

    # Display messages grouped by category
    messages_by_category = {}
    for message in results.messages:
        category = message.get("category", "Other")
        messages_by_category.setdefault(category, []).append(message)

    for category, messages in messages_by_category.items():
        with st.sidebar.expander(f"{category} ({len(messages)})"):
            for msg in messages:
                # .get() keeps one incomplete message from aborting the
                # whole sidebar with a KeyError.
                st.markdown(f"**{msg.get('title', '')}**")
                st.markdown(msg.get('description', ''))
                st.markdown("---")
 
 
 
 
511
 
512
  def main():
513
  st.set_page_config(
@@ -516,45 +188,112 @@ def main():
516
  layout="wide",
517
  initial_sidebar_state="expanded"
518
  )
519
-
520
- # Create sidebar and get user input
521
- uploaded_file, options = create_sidebar()
522
-
 
 
 
 
 
 
 
 
 
 
523
  if uploaded_file is not None:
524
- # Read PDF file
525
- pdf_bytes = uploaded_file.read()
526
-
527
- # Analyze PDF
528
  try:
529
- results, annotated_pdf = analyze_pdf(io.BytesIO(pdf_bytes))
 
530
 
531
- # Create two columns
532
  col1, col2 = st.columns([0.7, 0.3])
533
-
534
  with col1:
535
  st.markdown("### Document Preview")
536
- # Display PDF with highlights
537
- if annotated_pdf:
538
- display_pdf_viewer(annotated_pdf, results.get("highlights", []))
539
- else:
540
- display_pdf_viewer(pdf_bytes, [])
541
-
542
  with col2:
543
- st.markdown("### Analysis Details")
544
- display_analysis_results(results)
545
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
546
  except Exception as e:
547
- st.error(f"Error analyzing PDF: {str(e)}")
548
  st.code(traceback.format_exc())
 
549
  else:
550
- st.markdown("### Upload a PDF to begin analysis")
551
  st.markdown("""
552
- This tool will analyze your PDF document for:
553
- - Language issues and grammar
554
- - Reference formatting and consistency
555
- - Document structure
556
- - Figure and table placement
 
 
 
 
 
557
  """)
558
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
559
  if __name__ == "__main__":
 
560
  main()
 
4
  from pdfminer.high_level import extract_text
5
  from pdfminer.layout import LAParams
6
  import language_tool_python
7
+ from typing import List, Dict, Any, Tuple
8
  from collections import Counter
9
  import json
10
  import traceback
 
12
  import tempfile
13
  import os
14
  import base64
 
15
 
16
  # Set JAVA_HOME environment variable
17
  os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  # ------------------------------
20
  # Analysis Functions
21
  # ------------------------------
 
41
  """Checks for the presence of required terms in the text."""
42
  return {term: term.lower() in full_text.lower() for term in search_terms}
43
 
 
 
 
 
 
 
 
 
 
44
def check_metadata(full_text: str) -> Dict[str, Any]:
    """Check for metadata elements.

    Args:
        full_text: Extracted document text.

    Returns:
        A dict with boolean flags ``author_email``, ``list_of_authors`` and
        ``keywords_list``, plus the integer ``word_count`` (number of
        whitespace-separated tokens).
    """
    # The domain part allows hyphens and several dot-separated labels, so
    # addresses such as name@my-uni.ac.uk are recognised.
    email_pattern = r'\b[\w.-]+@[\w-]+(?:\.[\w-]+)+\b'
    return {
        "author_email": bool(re.search(email_pattern, full_text)),
        "list_of_authors": bool(re.search(r'Authors?:', full_text, re.IGNORECASE)),
        "keywords_list": bool(re.search(r'Keywords?:', full_text, re.IGNORECASE)),
        "word_count": len(full_text.split()),
    }
52
 
53
def check_language_issues(full_text: str) -> Dict[str, Any]:
    """Check for language issues.

    Runs LanguageTool ('en-US') over the text.

    Returns:
        ``total_issues`` and ``issues`` (one dict per match with ``message``,
        ``context``, up to three ``suggestions``, ``category`` and
        ``rule_id``). If anything raises, returns zero issues plus an
        ``error`` key holding the exception text.
    """
    try:
        language_tool = language_tool_python.LanguageTool('en-US')
        try:
            matches = language_tool.check(full_text)
        finally:
            # Release the LanguageTool instance even when the check fails.
            language_tool.close()

        issues = [
            {
                "message": match.message,
                "context": match.context,
                "suggestions": match.replacements[:3] if match.replacements else [],
                "category": match.category,
                "rule_id": match.ruleId,
            }
            for match in matches
        ]

        return {
            "total_issues": len(issues),
            "issues": issues,
        }
    except Exception as e:
        return {
            "total_issues": 0,
            "issues": [],
            "error": str(e),
        }
 
 
79
 
80
def analyze_pdf(file) -> Dict[str, Any]:
    """Main analysis function.

    Extracts the text with ``extract_pdf_text`` and runs the metadata,
    language and section-presence checks on it.

    Returns:
        A dict with ``metadata``, ``language`` (language results nested under
        ``issues``) and ``structure`` keys. If any step raises, a dict with
        ``error`` and ``traceback`` keys instead.
    """
    try:
        full_text = extract_pdf_text(file)

        def has_section(pattern: str) -> bool:
            return bool(re.search(pattern, full_text, re.IGNORECASE))

        return {
            "metadata": check_metadata(full_text),
            "language": {
                "issues": check_language_issues(full_text),
            },
            "structure": {
                "has_abstract": has_section(r'\bAbstract\b'),
                "has_introduction": has_section(r'\bIntroduction\b'),
                # Optional "s": the heading is often "Conclusions".
                "has_conclusion": has_section(r'\bConclusions?\b'),
            },
        }

    except Exception as e:
        return {"error": str(e), "traceback": traceback.format_exc()}
 
 
 
 
 
103
 
104
  # ------------------------------
105
+ # PDF Display Functions
106
  # ------------------------------
107
 
108
def display_pdf(pdf_bytes):
    """Display PDF in Streamlit.

    The bytes are base64-encoded into a data URI and shown in an 800px-high
    iframe rendered via ``st.markdown`` with raw HTML enabled.
    """
    encoded = base64.b64encode(pdf_bytes).decode('utf-8')
    iframe = f'<iframe src="data:application/pdf;base64,{encoded}" width="100%" height="800" type="application/pdf"></iframe>'
    st.markdown(iframe, unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
113
 
114
def get_pdf_display_html(pdf_bytes):
    """Generate HTML for PDF display with highlight container.

    Returns:
        An HTML fragment: a relatively positioned 800px-high wrapper holding
        an iframe whose source is the PDF as a base64 data URI, followed by
        an empty ``<div id="highlight-container">``.
    """
    base64_pdf = base64.b64encode(pdf_bytes).decode('utf-8')
    return f"""
    <div style="position: relative; width: 100%; height: 800px;">
        <iframe src="data:application/pdf;base64,{base64_pdf}"
                width="100%"
                height="100%"
                style="border: none;">
        </iframe>
        <div id="highlight-container"></div>
    </div>
    """
 
 
127
 
128
+ # ------------------------------
129
+ # Streamlit Interface Functions
130
+ # ------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
def render_sidebar():
    """Render the sidebar with analysis options.

    Returns:
        A dict of three sidebar checkbox states keyed ``check_language``,
        ``check_structure`` and ``check_metadata`` (all ticked by default).
    """
    st.sidebar.title("PDF Analysis Options")

    checkbox_labels = {
        "check_language": "Check Language",
        "check_structure": "Check Structure",
        "check_metadata": "Check Metadata",
    }
    return {
        key: st.sidebar.checkbox(label, value=True)
        for key, label in checkbox_labels.items()
    }
143
+
144
def display_analysis_results(results: Dict[str, Any]):
    """Display analysis results in an organized manner.

    Renders up to three sidebar expanders, one each for the ``metadata``,
    ``language`` and ``structure`` entries of ``results`` when present.
    """
    def tick(flag) -> str:
        return '✅' if flag else '❌'

    st.sidebar.markdown("## Analysis Results")

    # Display metadata results
    if "metadata" in results:
        with st.sidebar.expander("📋 Metadata Analysis", expanded=True):
            metadata = results["metadata"]
            st.markdown(f"**Word Count:** {metadata['word_count']}")
            st.markdown(f"**Has Author List:** {tick(metadata['list_of_authors'])}")
            st.markdown(f"**Has Keywords:** {tick(metadata['keywords_list'])}")

    # Display language issues
    if "language" in results and "issues" in results["language"]:
        with st.sidebar.expander("🔤 Language Issues", expanded=True):
            issues = results["language"]["issues"]
            st.markdown(f"**Total Issues Found:** {issues['total_issues']}")

            # A failed language check reports zero issues plus an "error"
            # entry; show it so "0 issues" is not read as a clean result.
            if issues.get("error"):
                st.warning(f"Language check failed: {issues['error']}")

            if issues['total_issues'] > 0:
                for idx, issue in enumerate(issues['issues'], 1):
                    st.markdown(f"""
                    **Issue {idx}:**
                    - Type: {issue['category']}
                    - Message: {issue['message']}
                    - Context: {issue['context']}
                    - Suggestions: {', '.join(issue['suggestions']) if issue['suggestions'] else 'None'}
                    ---
                    """)

    # Display structure analysis
    if "structure" in results:
        with st.sidebar.expander("🏗️ Structure Analysis", expanded=True):
            structure = results["structure"]
            st.markdown(f"**Has Abstract:** {tick(structure['has_abstract'])}")
            st.markdown(f"**Has Introduction:** {tick(structure['has_introduction'])}")
            st.markdown(f"**Has Conclusion:** {tick(structure['has_conclusion'])}")
180
+ # ------------------------------
181
+ # Main Application
182
+ # ------------------------------
183
 
184
  def main():
185
  st.set_page_config(
 
188
  layout="wide",
189
  initial_sidebar_state="expanded"
190
  )
191
+
192
+ # Main title
193
+ st.title("PDF Document Analyzer")
194
+ st.markdown("""
195
+ Upload a PDF document to analyze its structure, language, and metadata.
196
+ The analysis results will appear in the sidebar, and any issues found will be highlighted in the document.
197
+ """)
198
+
199
+ # Get analysis options from sidebar
200
+ options = render_sidebar()
201
+
202
+ # File uploader
203
+ uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
204
+
205
  if uploaded_file is not None:
 
 
 
 
206
  try:
207
+ # Read PDF file
208
+ pdf_bytes = uploaded_file.read()
209
 
210
+ # Create two columns for layout
211
  col1, col2 = st.columns([0.7, 0.3])
212
+
213
  with col1:
214
  st.markdown("### Document Preview")
215
+ # Display PDF
216
+ display_pdf(pdf_bytes)
217
+
 
 
 
218
  with col2:
219
+ st.markdown("### Analysis Progress")
 
220
 
221
+ # Show progress bar while analyzing
222
+ with st.spinner("Analyzing PDF..."):
223
+ # Analyze PDF
224
+ results = analyze_pdf(io.BytesIO(pdf_bytes))
225
+
226
+ if "error" in results:
227
+ st.error("Error during analysis:")
228
+ st.code(results["error"])
229
+ if "traceback" in results:
230
+ with st.expander("Show error details"):
231
+ st.code(results["traceback"])
232
+ else:
233
+ st.success("Analysis complete!")
234
+
235
+ # Display summary metrics
236
+ col2_1, col2_2 = st.columns(2)
237
+ with col2_1:
238
+ st.metric(
239
+ "Language Issues",
240
+ results.get("language", {}).get("issues", {}).get("total_issues", 0)
241
+ )
242
+ with col2_2:
243
+ st.metric(
244
+ "Word Count",
245
+ results.get("metadata", {}).get("word_count", 0)
246
+ )
247
+
248
+ # Display detailed results in sidebar
249
+ display_analysis_results(results)
250
+
251
  except Exception as e:
252
+ st.error(f"An error occurred: {str(e)}")
253
  st.code(traceback.format_exc())
254
+
255
  else:
256
+ # Show instructions when no file is uploaded
257
  st.markdown("""
258
+ ### Instructions
259
+ 1. Use the sidebar to select which aspects of the document you want to analyze
260
+ 2. Upload a PDF file using the file uploader above
261
+ 3. View the analysis results in the sidebar
262
+ 4. Issues found will be highlighted in the document preview
263
+
264
+ ### Features
265
+ - **Language Analysis**: Checks for grammar, style, and clarity issues
266
+ - **Structure Analysis**: Verifies the presence of key document sections
267
+ - **Metadata Analysis**: Examines document metadata and formatting
268
  """)
269
 
270
+ # ------------------------------
271
+ # CSS Styles
272
+ # ------------------------------
273
+
274
def load_css():
    """Load custom CSS styles.

    Injects one ``<style>`` element through ``st.markdown`` with raw HTML
    enabled. It defines rules for ``.highlight``, ``.stButton>button`` and
    ``.sidebar .sidebar-content``.
    """
    css = """
    <style>
    .highlight {
        background-color: yellow;
        opacity: 0.3;
        position: absolute;
        pointer-events: none;
    }
    .stButton>button {
        width: 100%;
    }
    .sidebar .sidebar-content {
        width: 100%;
    }
    </style>
    """
    st.markdown(css, unsafe_allow_html=True)
292
+
293
+ # ------------------------------
294
+ # Run Application
295
+ # ------------------------------
296
+
297
if __name__ == "__main__":
    # main() begins with st.set_page_config(), which Streamlit requires to
    # be the first Streamlit command on the page, so the CSS is injected
    # only after main() has run.
    main()
    load_css()