pdf_model / app.py
fajjos's picture
Create app.py
72c8ec0 verified
raw
history blame
3.01 kB
import os
import streamlit as st
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from PyPDF2 import PdfReader
import torch
from typing import List
# Load the model and tokenizer from Hugging Face
model_name = "fajjos/pdf_model"  # Replace with your model name


@st.cache_resource
def _load_model_and_tokenizer(name: str):
    """Load the seq2seq model and its tokenizer for the given checkpoint.

    Streamlit re-executes this script from the top on every widget
    interaction; caching the loaded objects means the checkpoint is read
    once per server process instead of on every rerun.

    Args:
        name: Hugging Face model identifier (or local path) to load.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    loaded_model = AutoModelForSeq2SeqLM.from_pretrained(name)
    loaded_tokenizer = AutoTokenizer.from_pretrained(name)
    return loaded_model, loaded_tokenizer


# Module-level names used by the search function below.
model, tokenizer = _load_model_and_tokenizer(model_name)
# Function to extract text from a single PDF
def extract_text_from_pdf(pdf_file: str) -> str:
    """Extract the text of every page of a single PDF file using PyPDF2.

    Args:
        pdf_file: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order, with no separator
        inserted between pages.
    """
    pdf_reader = PdfReader(pdf_file)
    # ``or ""`` keeps the join safe should a page yield no text (None);
    # ``str.join`` avoids repeated string re-allocation from ``+=``.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
# Function to search for a keyword in the extracted PDF texts
def search_keyword_in_pdfs(
    keyword: str, pdf_texts: dict, *, max_new_tokens: int = 20000
) -> List[str]:
    """Ask the model about the keyword for each PDF and collect matching names.

    For every entry of ``pdf_texts`` a prompt containing the keyword and the
    PDF text is sent to the module-level ``model``; the PDF is reported as a
    match when the decoded response contains the keyword (case-insensitive).

    Args:
        keyword: The keyword to look for.
        pdf_texts: Mapping of PDF name to its extracted text.
        max_new_tokens: Upper bound on the number of tokens generated per
            PDF. Defaults to the value that was previously hard-coded.

    Returns:
        The names of the PDFs whose model response contains the keyword, in
        the iteration order of ``pdf_texts``.
    """
    found_pdfs = []
    # The input tensors must live on the same device as the model weights.
    # Hard-coding "cuda" fails on CPU-only hosts and mismatches a model that
    # was never moved to the GPU, so follow the model instead.
    device = model.device
    keyword_lower = keyword.lower()
    for pdf_name, pdf_text in pdf_texts.items():
        prompt = (
            f"Does the keyword '{keyword}' appear in the following text? "
            f"If yes, provide details.\n\n{pdf_text}"
        )
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        outputs = model.generate(inputs.input_ids, max_new_tokens=max_new_tokens)
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # NOTE(review): a response that merely echoes the keyword (e.g. a
        # negative answer quoting it) also counts as a match here -- confirm
        # this is the intended criterion.
        if keyword_lower in response.lower():
            found_pdfs.append(pdf_name)
    return found_pdfs
# Function to process all PDFs in a specified folder
def process_pdfs_in_folder(folder_path: str) -> dict:
    """Extract the text of every PDF directly inside ``folder_path``.

    Only regular files whose name ends in ``.pdf`` (case-insensitive) are
    read; sub-directories are not descended into.

    Args:
        folder_path: Path of the folder to scan.

    Returns:
        A dict mapping each PDF file name (not the full path) to the text
        extracted from it, in sorted file-name order.

    Raises:
        OSError: If ``folder_path`` cannot be listed (propagated from
            ``os.listdir``).
    """
    pdf_texts = {}
    # Sorted so results do not depend on the arbitrary order of os.listdir.
    for file_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)
        # Case-insensitive match so "REPORT.PDF" is not skipped; the isfile
        # check keeps a directory that happens to be named "*.pdf" out.
        if file_name.lower().endswith(".pdf") and os.path.isfile(file_path):
            pdf_texts[file_name] = extract_text_from_pdf(file_path)
    return pdf_texts
# Streamlit UI for folder path and keyword input
st.title("PDF Keyword Search")
folder_path = st.text_input("Enter the folder path containing PDFs:").strip()
# Stripped like the folder path so a whitespace-only keyword is rejected below.
keyword = st.text_input("Enter the keyword to search for:").strip()

if st.button("Search"):
    if not folder_path or not keyword:
        st.error("Please provide both the folder path and the keyword.")
    else:
        try:
            # Process all PDFs in the folder
            pdf_texts = process_pdfs_in_folder(folder_path)
            if not pdf_texts:
                # Distinguish "nothing to search" from "searched, no match".
                st.warning(f"No PDF files were found in the folder '{folder_path}'.")
            else:
                # Perform keyword search in the extracted texts
                found_pdfs = search_keyword_in_pdfs(keyword, pdf_texts)
                # Display results
                if found_pdfs:
                    st.write(f"The keyword '{keyword}' was found in the following PDF files:")
                    for pdf in found_pdfs:
                        st.write(f"- {pdf}")
                else:
                    st.write(f"The keyword '{keyword}' was not found in any PDFs in the folder '{folder_path}'.")
        except Exception as e:
            # Top-level UI boundary: surface any failure (bad path, unreadable
            # PDF, model error) to the user instead of crashing the app.
            st.error(f"Error: {e}")