File size: 2,387 Bytes
72c8ec0 a491234 72c8ec0 a491234 72c8ec0 a491234 72c8ec0 a491234 72c8ec0 a491234 72c8ec0 a491234 72c8ec0 a491234 72c8ec0 a491234 72c8ec0 a491234 72c8ec0 a491234 72c8ec0 a491234 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
import os
import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM
from PyPDF2 import PdfReader
import torch
import bitsandbytes as bnb # For 4-bit quantization
# Device configuration: use the GPU when PyTorch reports one, else the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Hub identifier of the quantized LLaMA checkpoint loaded below.
model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit"


@st.cache_resource
def _load_tokenizer_and_model(name: str, target_device: str):
    """Load the tokenizer and the 4-bit model, cached across script reruns.

    Streamlit re-executes this script from the top on every widget
    interaction; without caching, each button click would load the
    tokenizer and model again.

    Args:
        name: Model identifier passed to ``from_pretrained``.
        target_device: ``"cuda"`` or ``"cpu"``; selects the ``device_map``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(name)
    loaded_model = AutoModelForCausalLM.from_pretrained(
        name,
        load_in_4bit=True,  # Enable 4-bit quantization
        device_map="auto" if target_device == "cuda" else {"": "cpu"},
    )
    return loaded_tokenizer, loaded_model


# Keep the original module-level names so the rest of the file is unaffected.
tokenizer, model = _load_tokenizer_and_model(model_name, device)
# Extract text from a PDF
def extract_text_from_pdf(pdf_file: str) -> str:
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: Path of the PDF; passed unchanged to ``PdfReader``.

    Returns:
        The extracted page texts joined with no separator. A page whose
        extraction yields nothing contributes an empty string.
    """
    pdf_reader = PdfReader(pdf_file)
    # ``or ""`` keeps a falsy extraction result (e.g. None) from breaking
    # the join; ``str.join`` avoids repeated string reallocation.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
# Function to search for a keyword in PDFs
def search_keyword_in_pdfs(keyword: str, folder_path: str) -> list:
    """Return the names of the PDFs in a folder whose text contains a keyword.

    The match is a case-insensitive substring test against the full text
    extracted from each PDF. The check is deliberately not delegated to the
    language model: decoding the generated sequence of a decoder-only model
    reproduces the prompt, and the prompt itself contains the keyword, so
    testing for the keyword in that output reports every file as a match.

    Args:
        keyword: Text to look for.
        folder_path: Directory whose ``.pdf`` files (extension matched
            case-insensitively) are searched. Sub-directories are not
            descended into.

    Returns:
        File names (not full paths) of the matching PDFs, in sorted order.
        An empty keyword yields an empty list.

    Raises:
        OSError: If ``folder_path`` cannot be listed.
    """
    needle = keyword.casefold()
    if not needle:
        # An empty string is a substring of everything; treat it as no match
        # instead of reporting every PDF.
        return []

    found_pdfs = []
    # Sorted so the result order does not depend on the filesystem.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.lower().endswith(".pdf"):
            continue
        file_path = os.path.join(folder_path, file_name)
        pdf_text = extract_text_from_pdf(file_path)
        if needle in pdf_text.casefold():
            found_pdfs.append(file_name)
    return found_pdfs
# Streamlit interface: collect a folder and a keyword, then list matching PDFs.
st.title("PDF Keyword Search with LLaMA 4-bit Model")
# Strip surrounding whitespace so blank-looking input counts as missing.
folder_path = st.text_input("Enter the folder path containing PDFs:").strip()
keyword = st.text_input("Enter the keyword to search for:").strip()

if st.button("Search"):
    if not (folder_path and keyword):
        st.error("Please provide both the folder path and the keyword.")
    elif not os.path.isdir(folder_path):
        # Report a bad path in the page instead of letting os.listdir raise.
        st.error(f"The folder '{folder_path}' does not exist or is not a directory.")
    else:
        found_pdfs = search_keyword_in_pdfs(keyword, folder_path)
        if found_pdfs:
            st.write(f"The keyword '{keyword}' was found in the following PDF files:")
            for pdf in found_pdfs:
                st.write(f"- {pdf}")
        else:
            st.write(f"The keyword '{keyword}' was not found in any PDFs.")
|