pdf_model / app.py
fajjos's picture
Update app.py
a491234 verified
raw
history blame
2.39 kB
import os
import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM
from PyPDF2 import PdfReader
import torch
import bitsandbytes as bnb # For 4-bit quantization
# Device configuration: use the GPU when torch reports one, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Identifier of the quantized LLaMA checkpoint handed to ``from_pretrained``.
model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit"


@st.cache_resource
def _load_tokenizer_and_model(name: str, target_device: str):
    """Load the tokenizer and the 4-bit quantized model for *name*.

    Streamlit re-executes this script from the top on every widget
    interaction. Caching the loaded objects as a shared resource means the
    checkpoint is loaded once per server process instead of once per rerun.

    Args:
        name: Model identifier passed to ``from_pretrained``.
        target_device: ``"cuda"`` or ``"cpu"``; selects the device map.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(name)
    loaded_model = AutoModelForCausalLM.from_pretrained(
        name,
        load_in_4bit=True,  # Enable 4-bit quantization
        # Let the library place the weights on GPU; otherwise pin everything to CPU.
        device_map="auto" if target_device == "cuda" else {"": "cpu"},
    )
    return loaded_tokenizer, loaded_model


# Module-level names used by the rest of the script.
tokenizer, model = _load_tokenizer_and_model(model_name, device)
# Extract text from a PDF
def extract_text_from_pdf(pdf_file: str) -> str:
    """Return the text of every page of *pdf_file*, concatenated in page order.

    Page texts are joined directly, with no separator inserted between pages.
    A document with no pages yields an empty string.
    """
    reader = PdfReader(pdf_file)
    return "".join(page.extract_text() for page in reader.pages)
# Function to search for a keyword in PDFs
def search_keyword_in_pdfs(keyword: str, folder_path: str) -> list:
    """Return the names of PDFs in *folder_path* that the model flags for *keyword*.

    For each ``.pdf`` file directly inside the folder (extension matched
    case-insensitively), the first 1000 characters of its extracted text are
    placed in a prompt asking whether the keyword appears. A file counts as a
    match when the model's reply starts with "yes".

    Args:
        keyword: The keyword to look for.
        folder_path: Directory whose PDF files are checked (not recursive).

    Returns:
        File names (not full paths) of the matching PDFs, in ``os.listdir``
        order.

    Raises:
        OSError: If *folder_path* cannot be listed (propagated from
            ``os.listdir``).
    """
    found_pdfs = []
    for file_name in os.listdir(folder_path):
        if not file_name.lower().endswith(".pdf"):
            continue

        file_path = os.path.join(folder_path, file_name)
        pdf_text = extract_text_from_pdf(file_path)

        # Only the first 1000 characters are sent, limiting input size for
        # performance. The explicit Yes/No instruction gives an answer that
        # can be checked without looking for the keyword itself.
        prompt = (
            f"Check if the keyword '{keyword}' appears in this text:\n"
            f"{pdf_text[:1000]}\n"
            "Answer with exactly one word, Yes or No.\n"
            "Answer:"
        )
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        with torch.no_grad():
            # A one-word verdict needs only a few tokens; greedy decoding
            # keeps the verdict reproducible for the same input.
            output = model.generate(**inputs, max_new_tokens=8, do_sample=False)

        # generate() returns the prompt tokens followed by the new tokens.
        # Decoding the whole sequence would always contain the keyword (it is
        # part of the prompt), so every PDF would be reported as a match.
        prompt_length = inputs["input_ids"].shape[1]
        response = tokenizer.decode(
            output[0][prompt_length:], skip_special_tokens=True
        )

        if response.strip().lower().startswith("yes"):
            found_pdfs.append(file_name)
    return found_pdfs
# Streamlit interface: collect a folder path and a keyword, then list the
# PDF files reported by search_keyword_in_pdfs when "Search" is pressed.
st.title("PDF Keyword Search with LLaMA 4-bit Model")
folder_path = st.text_input("Enter the folder path containing PDFs:")
keyword = st.text_input("Enter the keyword to search for:")

if st.button("Search"):
    if not (folder_path and keyword):
        st.error("Please provide both the folder path and the keyword.")
    elif not os.path.isdir(folder_path):
        # Without this guard the search would call os.listdir on a missing or
        # non-directory path and the page would show an unhandled exception.
        st.error(f"'{folder_path}' is not an existing folder.")
    else:
        found_pdfs = search_keyword_in_pdfs(keyword, folder_path)
        if found_pdfs:
            st.write(f"The keyword '{keyword}' was found in the following PDF files:")
            for pdf in found_pdfs:
                st.write(f"- {pdf}")
        else:
            st.write(f"The keyword '{keyword}' was not found in any PDFs.")