presidio-de-identify

Running

App Files Files Community

awacke1 committed on Apr 14

Commit

cabea79

verified ·

1 Parent(s): a18e29e

Update app.py

Browse files

Files changed (1) hide show

app.py +50 -24

app.py CHANGED Viewed

@@ -6,7 +6,10 @@ import dotenv
 import pandas as pd
 import streamlit as st
 from streamlit_tags import st_tags
-from PyPDF2 import PdfReader, PdfWriter
 from presidio_helpers import (
     analyzer_engine,
     get_supported_entities,
@@ -76,6 +79,41 @@ with st.sidebar.expander("Allowlists and denylists", expanded=False):
     st_allow_list = st_tags(label="Add words to allowlist", text="Enter word and press enter.")
     st_deny_list = st_tags(label="Add words to denylist", text="Enter word and press enter.")
 # Main panel
 col1, col2 = st.columns(2)
@@ -86,10 +124,9 @@ with col1:
     if uploaded_file:
         try:
             # Read PDF
-            pdf_reader = PdfReader(uploaded_file)
-            text = ""
-            for page in pdf_reader.pages:
-                text += page.extract_text() + "\n"
             # Initialize analyzer
             try:
@@ -125,31 +162,20 @@ with col1:
                 analyze_results=st_analyze_results,
             )
-            # Create new PDF
-            pdf_writer = PdfWriter()
-            for page in pdf_reader.pages:
-                pdf_writer.add_page(page)
             # Generate output filename with timestamp
-            timestamp = datetime.datetime.now().strftime("%I%M%p_%d-%m-%y")
             output_filename = f"{timestamp}_{uploaded_file.name}"
-            # Save modified PDF
-            try:
-                with open(output_filename, "wb") as f:
-                    pdf_writer.write(f)
-            except PermissionError as e:
-                st.error(f"Permission denied when saving PDF: {str(e)}")
-                st.info("Check write permissions in the current directory.")
-                raise
             # Generate base64 download link
             try:
-                with open(output_filename, "rb") as f:
-                    pdf_bytes = f.read()
-                    b64 = base64.b64encode(pdf_bytes).decode()
-                    href = f'<a href="data:application/pdf;base64,{b64}" download="{output_filename}">Download de-identified PDF</a>'
-                    st.markdown(href, unsafe_allow_html=True)
             except Exception as e:
                 st.error(f"Error generating download link: {str(e)}")
                 raise

 import pandas as pd
 import streamlit as st
 from streamlit_tags import st_tags
+import fitz
+from reportlab.lib.pagesizes import letter
+from reportlab.platypus import SimpleDocTemplate, Paragraph
+from reportlab.lib.styles import getSampleStyleSheet
 from presidio_helpers import (
     analyzer_engine,
     get_supported_entities,
     st_allow_list = st_tags(label="Add words to allowlist", text="Enter word and press enter.")
     st_deny_list = st_tags(label="Add words to denylist", text="Enter word and press enter.")
+# PDF processing functions
+def get_timestamp_prefix():
+    central = pytz.timezone("US/Central")
+    now = datetime.now(central)
+    return now.strftime("%I%M%p_%d-%m-%y").upper()
+def read_pdf(pdf_file):
+    """Read text from a PDF using fitz (PyMuPDF)."""
+    try:
+        doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
+        text = ""
+        for page in doc:
+            text += page.get_text() + "\n"
+        doc.close()
+        return text
+    except Exception as e:
+        st.error(f"Failed to read PDF: {str(e)}")
+        return None
+def create_pdf(text, output_filename):
+    """Create a PDF with anonymized text using reportlab."""
+    try:
+        buffer = io.BytesIO()
+        doc = SimpleDocTemplate(buffer, pagesize=letter)
+        styles = getSampleStyleSheet()
+        story = [Paragraph(text.replace("\n", "<br/>"), styles["Normal"])]
+        doc.build(story)
+        buffer.seek(0)
+        with open(output_filename, "wb") as f:
+            f.write(buffer.getvalue())
+        return buffer.getvalue()
+    except Exception as e:
+        st.error(f"Failed to create PDF: {str(e)}")
+        return None
 # Main panel
 col1, col2 = st.columns(2)
     if uploaded_file:
         try:
             # Read PDF
+            text = read_pdf(uploaded_file)
+            if not text:
+                raise ValueError("No text extracted from PDF")
             # Initialize analyzer
             try:
                 analyze_results=st_analyze_results,
             )
             # Generate output filename with timestamp
+            timestamp = get_timestamp_prefix()
             output_filename = f"{timestamp}_{uploaded_file.name}"
+            # Create new PDF
+            pdf_bytes = create_pdf(anonymized_result.text, output_filename)
+            if not pdf_bytes:
+                raise ValueError("Failed to generate PDF")
             # Generate base64 download link
             try:
+                b64 = base64.b64encode(pdf_bytes).decode()
+                href = f'<a href="data:application/pdf;base64,{b64}" download="{output_filename}">Download de-identified PDF</a>'
+                st.markdown(href, unsafe_allow_html=True)
             except Exception as e:
                 st.error(f"Error generating download link: {str(e)}")
                 raise