File size: 3,014 Bytes
72c8ec0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 |
import os
import streamlit as st
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from PyPDF2 import PdfReader
import torch
from typing import List
# Load the model and tokenizer from Hugging Face
model_name = "fajjos/pdf_model"  # Replace with your model name


@st.cache_resource
def _load_model_and_tokenizer(name: str):
    """Load the seq2seq model and its tokenizer, cached across script reruns.

    Streamlit re-executes the whole script on every widget interaction;
    without caching the weights would be loaded again on each rerun.

    Args:
        name: Hugging Face model identifier passed to ``from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is placed on CUDA when a
        GPU is available and on the CPU otherwise.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    loaded_model = AutoModelForSeq2SeqLM.from_pretrained(name).to(device)
    loaded_tokenizer = AutoTokenizer.from_pretrained(name)
    return loaded_model, loaded_tokenizer


# Keep the original module-level names so the rest of the script is unchanged.
model, tokenizer = _load_model_and_tokenizer(model_name)
# Function to extract text from a single PDF
def extract_text_from_pdf(pdf_file: str) -> str:
    """Return the concatenated text of every page of a PDF, read with PyPDF2.

    Args:
        pdf_file: Location of the PDF; handed unchanged to ``PdfReader``.

    Returns:
        The text of all pages joined in page order with no separator.
        Pages that yield no text contribute an empty string.
    """
    pdf_reader = PdfReader(pdf_file)
    # ``or ""`` is defensive: a page whose extraction comes back as None
    # must not break the concatenation. ``join`` builds the result in one
    # pass instead of re-allocating the string for every page.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
# Function to search for a keyword in the extracted PDF texts
def search_keyword_in_pdfs(
    keyword: str, pdf_texts: dict, *, max_new_tokens: int = 20000
) -> List[str]:
    """Ask the model, per PDF, whether the keyword appears in its text.

    For every entry of ``pdf_texts`` a prompt containing the keyword and the
    full PDF text is sent through the module-level ``tokenizer`` and
    ``model``. A PDF counts as a match when the keyword occurs
    (case-insensitively) in the decoded model response.

    NOTE(review): the match is decided on the model's answer, not on the PDF
    text itself, so an answer that merely repeats the keyword would also
    count as a hit -- confirm this is the intended behaviour.

    Args:
        keyword: Term to look for.
        pdf_texts: Mapping of PDF name to its extracted text.
        max_new_tokens: Upper bound on generated tokens per PDF. The default
            keeps the previously hard-coded value.

    Returns:
        Names of the PDFs whose model response contains the keyword, in the
        iteration order of ``pdf_texts``.
    """
    needle = keyword.lower()
    found_pdfs = []
    for pdf_name, pdf_text in pdf_texts.items():
        prompt = f"Does the keyword '{keyword}' appear in the following text? If yes, provide details.\n\n{pdf_text}"
        # Put the inputs on whatever device the model lives on instead of a
        # hard-coded "cuda", which fails on CPU-only hosts and whenever the
        # model itself was left on the CPU.
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(inputs.input_ids, max_new_tokens=max_new_tokens)
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # If keyword is found in the response
        if needle in response.lower():
            found_pdfs.append(pdf_name)
    return found_pdfs
# Function to process all PDFs in a specified folder
def process_pdfs_in_folder(folder_path: str) -> dict:
    """Extract the text of every PDF file directly inside a folder.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        Mapping of file name to the text returned by
        ``extract_text_from_pdf``, filled in sorted file-name order.

    Raises:
        OSError: Propagated from ``os.listdir`` when the folder cannot be
            listed (e.g. it does not exist).
    """
    pdf_texts = {}
    # Sorted so the result order does not depend on the file system.
    for file_name in sorted(os.listdir(folder_path)):
        # Compare the extension case-insensitively so "REPORT.PDF" is found.
        if not file_name.lower().endswith(".pdf"):
            continue
        file_path = os.path.join(folder_path, file_name)
        # A directory that merely has a ".pdf" name is not a document.
        if not os.path.isfile(file_path):
            continue
        pdf_texts[file_name] = extract_text_from_pdf(file_path)
    return pdf_texts
# Streamlit UI for folder path and keyword input
st.title("PDF Keyword Search")
folder_path = st.text_input("Enter the folder path containing PDFs:").strip()
# Strip the keyword too, so a whitespace-only entry is rejected below.
keyword = st.text_input("Enter the keyword to search for:").strip()
if st.button("Search"):
    if not folder_path or not keyword:
        st.error("Please provide both the folder path and the keyword.")
    elif not os.path.isdir(folder_path):
        # Report a bad path in plain words instead of a raw OSError message.
        st.error(f"Error: '{folder_path}' is not an existing folder.")
    else:
        try:
            # Process all PDFs in the folder
            pdf_texts = process_pdfs_in_folder(folder_path)
            if not pdf_texts:
                # Distinguish "nothing to search" from "keyword not found".
                st.warning(f"No PDF files were found in the folder '{folder_path}'.")
            else:
                # Perform keyword search in the extracted texts
                found_pdfs = search_keyword_in_pdfs(keyword, pdf_texts)
                # Display results
                if found_pdfs:
                    st.write(f"The keyword '{keyword}' was found in the following PDF files:")
                    for pdf in found_pdfs:
                        st.write(f"- {pdf}")
                else:
                    st.write(f"The keyword '{keyword}' was not found in any PDFs in the folder '{folder_path}'.")
        except Exception as e:
            # Top-level UI boundary: show any failure to the user rather than
            # letting the app crash with a traceback.
            st.error(f"Error: {e}")
|