awacke1 committed on
Commit
cabea79
·
verified ·
1 Parent(s): a18e29e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -24
app.py CHANGED
@@ -6,7 +6,10 @@ import dotenv
6
  import pandas as pd
7
  import streamlit as st
8
  from streamlit_tags import st_tags
9
- from PyPDF2 import PdfReader, PdfWriter
 
 
 
10
  from presidio_helpers import (
11
  analyzer_engine,
12
  get_supported_entities,
@@ -76,6 +79,41 @@ with st.sidebar.expander("Allowlists and denylists", expanded=False):
76
  st_allow_list = st_tags(label="Add words to allowlist", text="Enter word and press enter.")
77
  st_deny_list = st_tags(label="Add words to denylist", text="Enter word and press enter.")
78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  # Main panel
80
  col1, col2 = st.columns(2)
81
 
@@ -86,10 +124,9 @@ with col1:
86
  if uploaded_file:
87
  try:
88
  # Read PDF
89
- pdf_reader = PdfReader(uploaded_file)
90
- text = ""
91
- for page in pdf_reader.pages:
92
- text += page.extract_text() + "\n"
93
 
94
  # Initialize analyzer
95
  try:
@@ -125,31 +162,20 @@ with col1:
125
  analyze_results=st_analyze_results,
126
  )
127
 
128
- # Create new PDF
129
- pdf_writer = PdfWriter()
130
- for page in pdf_reader.pages:
131
- pdf_writer.add_page(page)
132
-
133
  # Generate output filename with timestamp
134
- timestamp = datetime.datetime.now().strftime("%I%M%p_%d-%m-%y")
135
  output_filename = f"{timestamp}_{uploaded_file.name}"
136
 
137
- # Save modified PDF
138
- try:
139
- with open(output_filename, "wb") as f:
140
- pdf_writer.write(f)
141
- except PermissionError as e:
142
- st.error(f"Permission denied when saving PDF: {str(e)}")
143
- st.info("Check write permissions in the current directory.")
144
- raise
145
 
146
  # Generate base64 download link
147
  try:
148
- with open(output_filename, "rb") as f:
149
- pdf_bytes = f.read()
150
- b64 = base64.b64encode(pdf_bytes).decode()
151
- href = f'<a href="data:application/pdf;base64,{b64}" download="{output_filename}">Download de-identified PDF</a>'
152
- st.markdown(href, unsafe_allow_html=True)
153
  except Exception as e:
154
  st.error(f"Error generating download link: {str(e)}")
155
  raise
 
6
  import pandas as pd
7
  import streamlit as st
8
  from streamlit_tags import st_tags
9
+ import fitz
10
+ from reportlab.lib.pagesizes import letter
11
+ from reportlab.platypus import SimpleDocTemplate, Paragraph
12
+ from reportlab.lib.styles import getSampleStyleSheet
13
  from presidio_helpers import (
14
  analyzer_engine,
15
  get_supported_entities,
 
79
  st_allow_list = st_tags(label="Add words to allowlist", text="Enter word and press enter.")
80
  st_deny_list = st_tags(label="Add words to denylist", text="Enter word and press enter.")
81
 
82
+ # PDF processing functions
83
def get_timestamp_prefix():
    """Return the current US/Central time formatted as a filename prefix.

    Returns:
        A string produced by the ``%I%M%p_%d-%m-%y`` format (12-hour clock
        hour and minute, AM/PM marker, then day-month-two-digit-year),
        upper-cased so the AM/PM marker is consistent across locales.
    """
    # Import locally so the function works whether the module-level name
    # ``datetime`` is bound to the module or to the class, and does not rely
    # on a file-level ``pytz`` import being present.
    from datetime import datetime as _datetime

    import pytz

    central = pytz.timezone("US/Central")
    return _datetime.now(central).strftime("%I%M%p_%d-%m-%y").upper()
87
+
88
def read_pdf(pdf_file):
    """Read text from a PDF using fitz (PyMuPDF).

    Args:
        pdf_file: Binary file-like object exposing ``read()`` that yields the
            raw PDF bytes (the caller passes the uploaded file).

    Returns:
        The extracted text of every page, each page followed by a newline,
        or ``None`` if the PDF could not be opened or read. On failure the
        error is reported through ``st.error`` instead of being raised.
    """
    from contextlib import closing

    try:
        # closing() guarantees doc.close() runs even when extracting a page
        # raises, so the document handle is never leaked.
        with closing(fitz.open(stream=pdf_file.read(), filetype="pdf")) as doc:
            return "".join(page.get_text() + "\n" for page in doc)
    except Exception as e:
        st.error(f"Failed to read PDF: {str(e)}")
        return None
100
+
101
def create_pdf(text, output_filename):
    """Create a PDF with anonymized text using reportlab.

    Args:
        text: Plain text to render. Newlines become line breaks in the PDF.
        output_filename: Path to which the generated PDF is also written.

    Returns:
        The generated PDF as ``bytes``, or ``None`` if building or writing
        the PDF failed. On failure the error is reported through
        ``st.error`` instead of being raised.
    """
    import io
    from xml.sax.saxutils import escape

    try:
        # Paragraph interprets its input as XML-like markup, so literal
        # "<", ">" and "&" in the text (for example "<PERSON>"-style
        # placeholders, if the anonymizer emits them) must be escaped
        # before our own <br/> tags are inserted.
        markup = escape(text).replace("\n", "<br/>")

        buffer = io.BytesIO()
        doc = SimpleDocTemplate(buffer, pagesize=letter)
        styles = getSampleStyleSheet()
        doc.build([Paragraph(markup, styles["Normal"])])

        pdf_bytes = buffer.getvalue()
        with open(output_filename, "wb") as f:
            f.write(pdf_bytes)
        return pdf_bytes
    except Exception as e:
        st.error(f"Failed to create PDF: {str(e)}")
        return None
116
+
117
  # Main panel
118
  col1, col2 = st.columns(2)
119
 
 
124
  if uploaded_file:
125
  try:
126
  # Read PDF
127
+ text = read_pdf(uploaded_file)
128
+ if not text:
129
+ raise ValueError("No text extracted from PDF")
 
130
 
131
  # Initialize analyzer
132
  try:
 
162
  analyze_results=st_analyze_results,
163
  )
164
 
 
 
 
 
 
165
  # Generate output filename with timestamp
166
+ timestamp = get_timestamp_prefix()
167
  output_filename = f"{timestamp}_{uploaded_file.name}"
168
 
169
+ # Create new PDF
170
+ pdf_bytes = create_pdf(anonymized_result.text, output_filename)
171
+ if not pdf_bytes:
172
+ raise ValueError("Failed to generate PDF")
 
 
 
 
173
 
174
  # Generate base64 download link
175
  try:
176
+ b64 = base64.b64encode(pdf_bytes).decode()
177
+ href = f'<a href="data:application/pdf;base64,{b64}" download="{output_filename}">Download de-identified PDF</a>'
178
+ st.markdown(href, unsafe_allow_html=True)
 
 
179
  except Exception as e:
180
  st.error(f"Error generating download link: {str(e)}")
181
  raise