|
import os |
|
import streamlit as st |
|
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer |
|
from PyPDF2 import PdfReader |
|
import torch |
|
from typing import List |
|
|
|
|
|
# Hugging Face Hub identifier of the seq2seq checkpoint used to answer the keyword prompts.
model_name = "fajjos/pdf_model"


@st.cache_resource
def _load_model_and_tokenizer(name: str):
    """Load the seq2seq model and its tokenizer once per server process.

    Streamlit re-executes the whole script on every widget interaction, so an
    uncached ``from_pretrained`` call would reload the weights on each click.
    ``st.cache_resource`` keeps the loaded objects alive across reruns.

    Args:
        name: Model identifier or local path accepted by ``from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple; the model is placed on the GPU when
        CUDA is available and left on the CPU otherwise.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    loaded_model = AutoModelForSeq2SeqLM.from_pretrained(name).to(device)
    loaded_tokenizer = AutoTokenizer.from_pretrained(name)
    return loaded_model, loaded_tokenizer


# Module-level names kept so the rest of the script can keep using `model` / `tokenizer`.
model, tokenizer = _load_model_and_tokenizer(model_name)
|
|
|
|
|
def extract_text_from_pdf(pdf_file: str) -> str:
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: Location of the PDF; passed unchanged to ``PdfReader``.

    Returns:
        The extracted text of all pages joined with no separator. A page for
        which no text is extracted contributes an empty string.

    Raises:
        Whatever ``PdfReader`` raises when the file cannot be opened or parsed.
    """
    pdf_reader = PdfReader(pdf_file)
    # `or ""` keeps a page that yields no text (None / empty) from breaking the
    # concatenation; join builds the result in one pass instead of repeated `+=`.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
|
|
|
|
|
def search_keyword_in_pdfs(keyword: str, pdf_texts: dict) -> List[str]:
    """Ask the model, per PDF, whether the keyword appears in its text.

    For each entry a prompt containing the keyword and the full PDF text is
    sent to the module-level ``model``; the PDF counts as a match when the
    decoded response contains the keyword (case-insensitive).

    Args:
        keyword: The term to look for.
        pdf_texts: Mapping of PDF name to its extracted text.

    Returns:
        Names of the PDFs whose model response contains the keyword, in the
        iteration order of ``pdf_texts``. Empty when ``pdf_texts`` is empty.
    """
    keyword_lower = keyword.lower()  # loop-invariant, so computed once
    found_pdfs = []
    for pdf_name, pdf_text in pdf_texts.items():
        # NOTE(review): the entire document is placed in one prompt and nothing
        # bounds its length here -- confirm the model can handle long inputs.
        prompt = f"Does the keyword '{keyword}' appear in the following text? If yes, provide details.\n\n{pdf_text}"

        # Put the inputs on whatever device the model lives on; a hard-coded
        # "cuda" fails on CPU-only hosts and mismatches a CPU-resident model.
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs.get("attention_mask"),
            max_new_tokens=20000,
        )
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # NOTE(review): the prompt itself contains the keyword, so a response
        # that merely echoes it (even in a negative answer) counts as a match.
        if keyword_lower in response.lower():
            found_pdfs.append(pdf_name)
    return found_pdfs
|
|
|
|
|
def process_pdfs_in_folder(folder_path: str) -> dict:
    """Extract the text of every PDF file directly inside a folder.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A dict mapping each PDF's file name to the text returned by
        ``extract_text_from_pdf``, inserted in sorted file-name order.
        Empty when the folder contains no PDF files.

    Raises:
        OSError: If ``folder_path`` cannot be listed (missing, not a
            directory, or not readable).
    """
    pdf_texts = {}
    # Sorted so results are reported in a stable order; os.listdir's is arbitrary.
    for file_name in sorted(os.listdir(folder_path)):
        # Case-insensitive so "REPORT.PDF" is picked up as well as "report.pdf".
        if not file_name.lower().endswith(".pdf"):
            continue
        file_path = os.path.join(folder_path, file_name)
        # A sub-directory that happens to be named "*.pdf" is not a document.
        if not os.path.isfile(file_path):
            continue
        pdf_texts[file_name] = extract_text_from_pdf(file_path)
    return pdf_texts
|
|
|
|
|
# --- Streamlit page: collect a folder and a keyword, then list matching PDFs ---
st.title("PDF Keyword Search")

folder_path = st.text_input("Enter the folder path containing PDFs:").strip()
# Stripped like the folder path: a whitespace-only keyword would otherwise pass
# validation and match almost any response text.
keyword = st.text_input("Enter the keyword to search for:").strip()

if st.button("Search"):
    if not folder_path or not keyword:
        st.error("Please provide both the folder path and the keyword.")
    elif not os.path.isdir(folder_path):
        # Report a bad path in plain words instead of a raw OS error message.
        st.error(f"The folder '{folder_path}' does not exist or is not a directory.")
    else:
        try:
            pdf_texts = process_pdfs_in_folder(folder_path)

            if not pdf_texts:
                # Distinguish "nothing to search" from "searched and found nothing".
                st.warning(f"No PDF files were found in the folder '{folder_path}'.")
            else:
                found_pdfs = search_keyword_in_pdfs(keyword, pdf_texts)

                if found_pdfs:
                    st.write(f"The keyword '{keyword}' was found in the following PDF files:")
                    for pdf in found_pdfs:
                        st.write(f"- {pdf}")
                else:
                    st.write(f"The keyword '{keyword}' was not found in any PDFs in the folder '{folder_path}'.")
        except Exception as e:
            # Top-level UI boundary: show any failure (unreadable PDF, model
            # error, ...) to the user instead of crashing the page.
            st.error(f"Error: {e}")
|
|