samyak152002 committed on
Commit
99dc100
·
verified ·
1 Parent(s): 40e8eb9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -278
app.py CHANGED
@@ -1,299 +1,66 @@
 
 
1
  import streamlit as st
2
- import re
3
- import fitz # PyMuPDF
4
- from pdfminer.high_level import extract_text
5
- from pdfminer.layout import LAParams
6
- import language_tool_python
7
- from typing import List, Dict, Any, Tuple
8
- from collections import Counter
9
- import json
10
- import traceback
11
- import io
12
  import tempfile
13
  import os
14
- import base64
15
-
16
- # Set JAVA_HOME environment variable
17
- os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'
18
-
19
- # ------------------------------
20
- # Analysis Functions
21
- # ------------------------------
22
-
23
- def extract_pdf_text_by_page(file) -> List[str]:
24
- """Extracts text from a PDF file, page by page, using PyMuPDF."""
25
- if isinstance(file, str):
26
- with fitz.open(file) as doc:
27
- return [page.get_text("text") for page in doc]
28
- else:
29
- with fitz.open(stream=file.read(), filetype="pdf") as doc:
30
- return [page.get_text("text") for page in doc]
31
-
32
- def extract_pdf_text(file) -> str:
33
- """Extracts text from a PDF file using pdfminer."""
34
- if isinstance(file, str):
35
- with open(file, 'rb') as f:
36
- return extract_text(f, laparams=LAParams())
37
- else:
38
- return extract_text(file, laparams=LAParams())
39
-
40
- def check_text_presence(full_text: str, search_terms: List[str]) -> Dict[str, bool]:
41
- """Checks for the presence of required terms in the text."""
42
- return {term: term.lower() in full_text.lower() for term in search_terms}
43
-
44
- def check_metadata(full_text: str) -> Dict[str, Any]:
45
- """Check for metadata elements."""
46
- return {
47
- "author_email": bool(re.search(r'\b[\w.-]+?@\w+?\.\w+?\b', full_text)),
48
- "list_of_authors": bool(re.search(r'Authors?:', full_text, re.IGNORECASE)),
49
- "keywords_list": bool(re.search(r'Keywords?:', full_text, re.IGNORECASE)),
50
- "word_count": len(full_text.split())
51
- }
52
-
53
- def check_language_issues(full_text: str) -> Dict[str, Any]:
54
- """Check for language issues."""
55
- try:
56
- language_tool = language_tool_python.LanguageTool('en-US')
57
- matches = language_tool.check(full_text)
58
-
59
- issues = []
60
- for match in matches:
61
- issues.append({
62
- "message": match.message,
63
- "context": match.context,
64
- "suggestions": match.replacements[:3] if match.replacements else [],
65
- "category": match.category,
66
- "rule_id": match.ruleId
67
- })
68
-
69
- return {
70
- "total_issues": len(issues),
71
- "issues": issues
72
- }
73
- except Exception as e:
74
- return {
75
- "total_issues": 0,
76
- "issues": [],
77
- "error": str(e)
78
- }
79
-
80
- def analyze_pdf(file) -> Dict[str, Any]:
81
- """Main analysis function."""
82
- try:
83
- # Extract text
84
- full_text = extract_pdf_text(file)
85
-
86
- # Perform analysis
87
- results = {
88
- "metadata": check_metadata(full_text),
89
- "language": {
90
- "issues": check_language_issues(full_text)
91
- },
92
- "structure": {
93
- "has_abstract": bool(re.search(r'\bAbstract\b', full_text, re.IGNORECASE)),
94
- "has_introduction": bool(re.search(r'\bIntroduction\b', full_text, re.IGNORECASE)),
95
- "has_conclusion": bool(re.search(r'\bConclusion\b', full_text, re.IGNORECASE))
96
- }
97
- }
98
-
99
- return results
100
-
101
- except Exception as e:
102
- return {"error": str(e), "traceback": traceback.format_exc()}
103
-
104
- # ------------------------------
105
- # PDF Display Functions
106
- # ------------------------------
107
 
108
  def display_pdf(pdf_bytes):
109
- """Display PDF in Streamlit."""
110
- base64_pdf = base64.b64encode(pdf_bytes).decode('utf-8')
111
- pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="100%" height="800" type="application/pdf"></iframe>'
112
- st.markdown(pdf_display, unsafe_allow_html=True)
113
-
114
- def get_pdf_display_html(pdf_bytes):
115
- """Generate HTML for PDF display with highlight container."""
116
- base64_pdf = base64.b64encode(pdf_bytes).decode('utf-8')
117
- return f"""
118
- <div style="position: relative; width: 100%; height: 800px;">
119
- <iframe src="data:application/pdf;base64,{base64_pdf}"
120
- width="100%"
121
- height="100%"
122
- style="border: none;">
123
- </iframe>
124
- <div id="highlight-container"></div>
125
- </div>
126
- """
127
-
128
- # ------------------------------
129
- # Streamlit Interface Functions
130
- # ------------------------------
131
-
132
- def render_sidebar():
133
- """Render the sidebar with analysis options."""
134
- st.sidebar.title("PDF Analysis Options")
135
-
136
- options = {
137
- "check_language": st.sidebar.checkbox("Check Language", value=True),
138
- "check_structure": st.sidebar.checkbox("Check Structure", value=True),
139
- "check_metadata": st.sidebar.checkbox("Check Metadata", value=True)
140
- }
141
-
142
- return options
143
-
144
- def display_analysis_results(results: Dict[str, Any]):
145
- """Display analysis results in an organized manner."""
146
- st.sidebar.markdown("## Analysis Results")
147
-
148
- # Display metadata results
149
- if "metadata" in results:
150
- with st.sidebar.expander("📋 Metadata Analysis", expanded=True):
151
- metadata = results["metadata"]
152
- st.markdown(f"**Word Count:** {metadata['word_count']}")
153
- st.markdown(f"**Has Author List:** {'✅' if metadata['list_of_authors'] else '❌'}")
154
- st.markdown(f"**Has Keywords:** {'✅' if metadata['keywords_list'] else '❌'}")
155
-
156
- # Display language issues
157
- if "language" in results and "issues" in results["language"]:
158
- with st.sidebar.expander("🔤 Language Issues", expanded=True):
159
- issues = results["language"]["issues"]
160
- st.markdown(f"**Total Issues Found:** {issues['total_issues']}")
161
-
162
- if issues['total_issues'] > 0:
163
- for idx, issue in enumerate(issues['issues'], 1):
164
- st.markdown(f"""
165
- **Issue {idx}:**
166
- - Type: {issue['category']}
167
- - Message: {issue['message']}
168
- - Context: {issue['context']}
169
- - Suggestions: {', '.join(issue['suggestions']) if issue['suggestions'] else 'None'}
170
- ---
171
- """)
172
-
173
- # Display structure analysis
174
- if "structure" in results:
175
- with st.sidebar.expander("🏗️ Structure Analysis", expanded=True):
176
- structure = results["structure"]
177
- st.markdown(f"**Has Abstract:** {'✅' if structure['has_abstract'] else '❌'}")
178
- st.markdown(f"**Has Introduction:** {'✅' if structure['has_introduction'] else '❌'}")
179
- st.markdown(f"**Has Conclusion:** {'✅' if structure['has_conclusion'] else '❌'}")
180
- # ------------------------------
181
- # Main Application
182
- # ------------------------------
183
 
184
  def main():
185
  st.set_page_config(
186
  page_title="PDF Analyzer",
187
  page_icon="📄",
188
  layout="wide",
189
- initial_sidebar_state="expanded"
190
  )
191
 
192
- # Main title
193
- st.title("PDF Document Analyzer")
194
  st.markdown("""
195
- Upload a PDF document to analyze its structure, language, and metadata.
196
- The analysis results will appear in the sidebar, and any issues found will be highlighted in the document.
197
  """)
198
 
199
- # Get analysis options from sidebar
200
- options = render_sidebar()
201
-
202
- # File uploader
203
- uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
204
 
205
  if uploaded_file is not None:
206
- try:
207
- # Read PDF file
208
- pdf_bytes = uploaded_file.read()
209
-
210
- # Create two columns for layout
211
- col1, col2 = st.columns([0.7, 0.3])
212
-
213
- with col1:
214
- st.markdown("### Document Preview")
215
- # Display PDF
216
- display_pdf(pdf_bytes)
217
-
218
- with col2:
219
- st.markdown("### Analysis Progress")
220
-
221
- # Show progress bar while analyzing
222
- with st.spinner("Analyzing PDF..."):
223
- # Analyze PDF
224
- results = analyze_pdf(io.BytesIO(pdf_bytes))
225
-
226
- if "error" in results:
227
- st.error("Error during analysis:")
228
- st.code(results["error"])
229
- if "traceback" in results:
230
- with st.expander("Show error details"):
231
- st.code(results["traceback"])
232
- else:
233
- st.success("Analysis complete!")
234
-
235
- # Display summary metrics
236
- col2_1, col2_2 = st.columns(2)
237
- with col2_1:
238
- st.metric(
239
- "Language Issues",
240
- results.get("language", {}).get("issues", {}).get("total_issues", 0)
241
- )
242
- with col2_2:
243
- st.metric(
244
- "Word Count",
245
- results.get("metadata", {}).get("word_count", 0)
246
- )
247
-
248
- # Display detailed results in sidebar
249
- display_analysis_results(results)
250
-
251
- except Exception as e:
252
- st.error(f"An error occurred: {str(e)}")
253
- st.code(traceback.format_exc())
254
-
255
- else:
256
- # Show instructions when no file is uploaded
257
- st.markdown("""
258
- ### Instructions
259
- 1. Use the sidebar to select which aspects of the document you want to analyze
260
- 2. Upload a PDF file using the file uploader above
261
- 3. View the analysis results in the sidebar
262
- 4. Issues found will be highlighted in the document preview
263
-
264
- ### Features
265
- - **Language Analysis**: Checks for grammar, style, and clarity issues
266
- - **Structure Analysis**: Verifies the presence of key document sections
267
- - **Metadata Analysis**: Examines document metadata and formatting
268
- """)
269
-
270
- # ------------------------------
271
- # CSS Styles
272
- # ------------------------------
273
-
274
- def load_css():
275
- """Load custom CSS styles."""
276
- st.markdown("""
277
- <style>
278
- .highlight {
279
- background-color: yellow;
280
- opacity: 0.3;
281
- position: absolute;
282
- pointer-events: none;
283
- }
284
- .stButton>button {
285
- width: 100%;
286
- }
287
- .sidebar .sidebar-content {
288
- width: 100%;
289
- }
290
- </style>
291
- """, unsafe_allow_html=True)
292
-
293
- # ------------------------------
294
- # Run Application
295
- # ------------------------------
296
 
297
  if __name__ == "__main__":
298
- load_css()
299
  main()
 
1
+ # app.py
2
+
3
  import streamlit as st
4
+ import base64
5
+ from annotations import analyze_pdf
 
 
 
 
 
 
 
 
6
  import tempfile
7
  import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
  def display_pdf(pdf_bytes):
10
+ """Displays the PDF in the browser using an iframe."""
11
+ if pdf_bytes:
12
+ base64_pdf = base64.b64encode(pdf_bytes).decode('utf-8')
13
+ pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="100%" height="800px" type="application/pdf"></iframe>'
14
+ st.markdown(pdf_display, unsafe_allow_html=True)
15
+ else:
16
+ st.info("No annotated PDF to display.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
  def main():
19
  st.set_page_config(
20
  page_title="PDF Analyzer",
21
  page_icon="📄",
22
  layout="wide",
 
23
  )
24
 
25
+ st.title("📄 PDF Analyzer")
 
26
  st.markdown("""
27
+ Upload a PDF to analyze its language, highlight errors, and view detailed error reports.
 
28
  """)
29
 
30
+ uploaded_file = st.file_uploader("Upload your PDF file", type=["pdf"])
 
 
 
 
31
 
32
  if uploaded_file is not None:
33
+ with st.spinner("Analyzing PDF..."):
34
+ language_results, annotated_pdf = analyze_pdf(uploaded_file)
35
+
36
+ if "error" in language_results:
37
+ st.error("An error occurred during analysis:")
38
+ st.code(language_results["error"])
39
+ else:
40
+ st.success("Analysis complete!")
41
+
42
+ # Display the annotated PDF
43
+ st.subheader("📄 Annotated PDF")
44
+ display_pdf(annotated_pdf)
45
+
46
+ # Sidebar for error details
47
+ st.sidebar.header("📝 Error Details")
48
+
49
+ if language_results.get("total_issues", 0) > 0:
50
+ for idx, issue in enumerate(language_results["issues"], 1):
51
+ with st.sidebar.expander(f"Issue {idx}"):
52
+ st.markdown(f"**Message:** {issue['message']}")
53
+ st.markdown(f"**Category:** {issue['category']}")
54
+ st.markdown(f"**Suggestions:** {', '.join(issue['suggestions']) if issue['suggestions'] else 'No suggestions'}")
55
+ st.markdown(f"**Sentence:** {issue['context']}")
56
+ else:
57
+ st.sidebar.success("No language issues found!")
58
+
59
+ # Option to download the annotated PDF
60
+ if annotated_pdf:
61
+ b64 = base64.b64encode(annotated_pdf).decode()
62
+ href = f'<a href="data:application/pdf;base64,{b64}" download="annotated.pdf">📥 Download Annotated PDF</a>'
63
+ st.markdown(href, unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
  if __name__ == "__main__":
 
66
  main()