|
import os |
|
from transformers import pipeline |
|
import streamlit as st |
|
from PyPDF2 import PdfReader |
|
|
|
|
|
def _cache_model_loader(loader):
    """Wrap *loader* in the Streamlit cache suited to unhashable resources."""
    cache_resource = getattr(st, "cache_resource", None)
    if cache_resource is not None:
        # Newer Streamlit: cache_resource stores the object itself and never
        # tries to hash or copy the returned value.
        return cache_resource(loader)
    # Older Streamlit: allow_output_mutation=True disables hashing of the
    # cached return value, which is what a model object needs.
    return st.cache(allow_output_mutation=True)(loader)


@_cache_model_loader
def load_model():
    """Build the text-classification pipeline once and reuse it across reruns.

    The previous decorator passed ``hash_funcs={pipeline: ...}``; ``hash_funcs``
    is keyed by type, while ``pipeline`` is the factory function, so that entry
    is not expected to match the object returned here.  Caching the pipeline as
    a resource avoids the output hashing altogether.

    Returns:
        The pipeline created by ``pipeline("text-classification", ...)`` for
        the ``fajjos/pdf_model`` checkpoint.
    """
    return pipeline("text-classification", model="fajjos/pdf_model")
|
|
|
|
|
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of all pages of the PDF at *pdf_path*.

    Pages whose ``extract_text()`` result is empty or ``None`` are skipped.

    Args:
        pdf_path: Path handed to ``PdfReader``.

    Returns:
        The page texts joined without a separator.  If reading fails, the
        error is shown with ``st.error`` and whatever text was collected
        before the failure is returned (an empty string if there was none).
    """
    page_texts = []
    try:
        reader = PdfReader(pdf_path)
        for page in reader.pages:
            # Extract once per page and reuse the result; the original ran
            # the extraction twice (once to test, once to append).
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text)
    except Exception as e:
        # Deliberately best-effort: one unreadable PDF is reported in the UI
        # and must not abort the scan of the remaining files.
        st.error(f"Error reading {pdf_path}: {e}")
    return "".join(page_texts)
|
|
|
|
|
def search_keyword_in_pdfs(folder_path, keyword, model):
    """Find the PDFs in *folder_path* that match *keyword*.

    A file is reported when both conditions hold (case-insensitively):
    the keyword occurs in the extracted text, and the keyword occurs in at
    least one ``"label"`` of the results returned by *model* for that text.

    Args:
        folder_path: Directory whose direct entries are scanned.
        keyword: Search term.
        model: Callable classifier (the text-classification pipeline); it is
            called as ``model(text, truncation=True)`` and must return an
            iterable of mappings that contain a ``"label"`` key.

    Returns:
        The matching file names (not full paths), sorted alphabetically.
        Model failures for a single file are shown with ``st.error`` and that
        file is skipped.
    """
    needle = keyword.lower()  # hoisted: invariant across files and labels

    # Accept any casing of the extension (".PDF", ".Pdf"); sort so the result
    # order does not depend on the arbitrary order os.listdir returns.
    pdf_files = sorted(
        name for name in os.listdir(folder_path) if name.lower().endswith(".pdf")
    )
    matched_files = []

    for pdf_file in pdf_files:
        pdf_path = os.path.join(folder_path, pdf_file)
        text = extract_text_from_pdf(pdf_path)

        # Cheap textual pre-filter before paying for model inference.
        if not text or needle not in text.lower():
            continue

        try:
            # Whole-document text routinely exceeds the model's maximum
            # sequence length; truncation keeps long PDFs from failing.
            result = model(text, truncation=True)
            if any(needle in res["label"].lower() for res in result):
                matched_files.append(pdf_file)
        except Exception as e:
            # Best-effort per file: report and continue with the next PDF.
            st.error(f"Error processing {pdf_file} with the model: {e}")
    return matched_files
|
|
|
|
|
# ---------------------------------------------------------------------------
# Streamlit page: collect a folder and a keyword, then run the search when
# the button is pressed.  Validation order: folder first, then keyword.
# ---------------------------------------------------------------------------
st.title("PDF Keyword Search")

folder_path = st.text_input("Enter the folder path:")
keyword = st.text_input("Enter the keyword to search:")

if st.button("Search PDFs"):
    if not os.path.isdir(folder_path):
        st.error("Invalid folder path. Please enter a valid path.")
    elif not keyword:
        st.error("Please enter a keyword.")
    else:
        st.info("Searching... Please wait.")
        model = load_model()
        matched_files = search_keyword_in_pdfs(folder_path, keyword, model)

        if not matched_files:
            st.warning(f"No PDFs found with the keyword '{keyword}'.")
        else:
            st.success(f"Found the keyword '{keyword}' in the following PDF(s):")
            # One bullet line per matching file name.
            for matched_name in matched_files:
                st.write(f"- {matched_name}")
|
|