pdf_model / app.py
fajjos's picture
Update app.py
c4accb1 verified
raw
history blame
2.41 kB
import os
from transformers import pipeline
import streamlit as st
from PyPDF2 import PdfReader
# Initialize the Hugging Face model pipeline
# Choose the caching decorator for the model loader.
# `st.cache` is the legacy, deprecated API; newer Streamlit releases provide
# `st.cache_resource` for global, unhashable objects such as ML models.
# The earlier `hash_funcs={pipeline: ...}` entry was keyed on the `pipeline`
# factory function rather than on a type, so it presumably never applied to
# the returned model object. On older releases we fall back to `st.cache`
# with output-mutation checks disabled, which avoids hashing the model.
_cache_model = getattr(st, "cache_resource", None)
if _cache_model is None:
    _cache_model = st.cache(allow_output_mutation=True)


@_cache_model
def load_model():
    """Return the text-classification pipeline for ``fajjos/pdf_model``.

    The result is cached by Streamlit, so the model is built once and the
    same pipeline object is reused on later calls instead of being reloaded
    every time the script reruns.

    Returns:
        The object produced by ``transformers.pipeline`` for the
        ``"text-classification"`` task.
    """
    return pipeline("text-classification", model="fajjos/pdf_model")
# Extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at *pdf_path*.

    Pages whose ``extract_text()`` result is empty or ``None`` are skipped.
    Reading is best-effort: if opening or parsing the file raises, the error
    is shown with ``st.error`` and the text gathered before the failure is
    returned (an empty string when nothing was read).

    Args:
        pdf_path: Path of the PDF file, passed straight to ``PdfReader``.

    Returns:
        The page texts joined together with no separator.
    """
    page_texts = []
    try:
        reader = PdfReader(pdf_path)
        for page in reader.pages:
            # Extract once per page; the previous code called extract_text()
            # twice (once to test, once to append), doubling the parse work.
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text)
    except Exception as e:
        # Deliberately broad: one unreadable file must not abort the app.
        st.error(f"Error reading {pdf_path}: {e}")
    return "".join(page_texts)
# Search for the keyword in PDF files
def search_keyword_in_pdfs(folder_path, keyword, model):
    """Return the PDF file names in *folder_path* that match *keyword*.

    A file matches when both conditions hold:

    1. its extracted text contains *keyword* (case-insensitive), and
    2. at least one result returned by ``model(text)`` has a ``"label"``
       that contains *keyword* (case-insensitive).

    Only the top level of *folder_path* is scanned. Files are recognised by
    a ``.pdf`` suffix in any letter case and are processed in sorted order,
    so the result order is deterministic.

    Args:
        folder_path: Directory to scan; passed to ``os.listdir``.
        keyword: Text to look for.
        model: Callable invoked as ``model(text)``; it must return an
            iterable of mappings that each have a ``"label"`` string.

    Returns:
        A list of matching file names (not full paths).

    Raises:
        OSError: If *folder_path* cannot be listed.
    """
    # Lower-case the keyword once instead of once per file and per label.
    needle = keyword.lower()
    # Compare the suffix case-insensitively so "REPORT.PDF" is not skipped.
    pdf_files = sorted(
        f for f in os.listdir(folder_path) if f.lower().endswith(".pdf")
    )
    matched_files = []
    for pdf_file in pdf_files:
        pdf_path = os.path.join(folder_path, pdf_file)
        text = extract_text_from_pdf(pdf_path)
        if not text or needle not in text.lower():
            continue
        # The plain-text hit is then confirmed against the model's labels.
        # NOTE(review): the whole document text is sent to the model; very
        # long inputs may exceed its limit -- confirm whether truncation is
        # needed. Any such failure is reported below, not raised.
        try:
            result = model(text)
            if any(needle in res["label"].lower() for res in result):
                matched_files.append(pdf_file)
        except Exception as e:
            st.error(f"Error processing {pdf_file} with the model: {e}")
    return matched_files
# Streamlit App UI
st.title("PDF Keyword Search")

# Collect the two search parameters from the user.
folder_path = st.text_input("Enter the folder path:")
keyword = st.text_input("Enter the keyword to search:")

# Everything below runs only when the button reports a click.
if st.button("Search PDFs"):
    if not os.path.isdir(folder_path):
        # The folder is validated first, so a bad path is reported even
        # when the keyword is also missing.
        st.error("Invalid folder path. Please enter a valid path.")
    elif not keyword:
        st.error("Please enter a keyword.")
    else:
        st.info("Searching... Please wait.")
        model = load_model()
        matched_files = search_keyword_in_pdfs(folder_path, keyword, model)
        if not matched_files:
            st.warning(f"No PDFs found with the keyword '{keyword}'.")
        else:
            st.success(f"Found the keyword '{keyword}' in the following PDF(s):")
            # One bullet line per matching file name.
            for pdf_name in matched_files:
                st.write(f"- {pdf_name}")