|
import os |
|
import streamlit as st |
|
from transformers import AutoTokenizer, AutoModelForCausalLM |
|
from PyPDF2 import PdfReader |
|
import torch |
|
import bitsandbytes as bnb |
|
|
|
|
|
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Checkpoint identifier handed to ``from_pretrained`` for tokenizer and model.
model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit"


@st.cache_resource
def _load_tokenizer_and_model(name: str, target_device: str):
    """Load the tokenizer and the 4-bit causal LM, cached across reruns.

    Streamlit re-executes this script from top to bottom on every widget
    interaction; ``st.cache_resource`` keeps a single loaded copy per
    server process instead of reloading the checkpoint each time.

    Args:
        name: Checkpoint identifier passed to ``from_pretrained``.
        target_device: ``"cuda"`` to let ``device_map="auto"`` place the
            weights, anything else to pin the whole model to the CPU.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(name)
    loaded_model = AutoModelForCausalLM.from_pretrained(
        name,
        load_in_4bit=True,
        device_map="auto" if target_device == "cuda" else {"": "cpu"},
    )
    return loaded_tokenizer, loaded_model


tokenizer, model = _load_tokenizer_and_model(model_name, device)
|
|
|
|
|
def extract_text_from_pdf(pdf_file: str) -> str:
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: Path of the PDF to read; passed directly to ``PdfReader``.

    Returns:
        The per-page ``extract_text()`` results joined with no separator.
        Pages that yield no text contribute an empty string.
    """
    pdf_reader = PdfReader(pdf_file)
    # ``or ""`` is defensive: if a page yields a falsy result (e.g. None for
    # an image-only page -- depends on the installed PyPDF2, unverified here)
    # it must not abort extraction of the remaining pages.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
|
|
|
|
|
def search_keyword_in_pdfs(keyword: str, folder_path: str) -> list:
    """Ask the language model which PDFs in a folder mention a keyword.

    For each ``.pdf`` file directly inside ``folder_path`` the first 1000
    characters of its extracted text are shown to the model together with a
    YES/NO question; the file is reported when the model answers YES.

    Args:
        keyword: Keyword the model is asked to look for.
        folder_path: Directory whose PDF files are inspected (not recursive).

    Returns:
        File names (not full paths) of the matching PDFs, in sorted order.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``folder_path`` cannot
            be listed.
    """
    # NOTE(review): the verdict comes from the model and only covers the
    # first 1000 characters; a plain case-insensitive substring test on the
    # full text would be exact if the LLM round-trip is not a requirement.
    found_pdfs = []
    # Sorted so the result order does not depend on the OS listing order.
    for file_name in sorted(os.listdir(folder_path)):
        # Case-insensitive so files named "REPORT.PDF" are not skipped.
        if not file_name.lower().endswith(".pdf"):
            continue

        pdf_text = extract_text_from_pdf(os.path.join(folder_path, file_name))
        excerpt = pdf_text[:1000]
        if not excerpt.strip():
            # Nothing extractable, so there is nothing for the model to read.
            continue

        prompt = (
            f"Does the keyword '{keyword}' appear in the following text? "
            "Answer with a single word, YES or NO.\n"
            f"Text:\n{excerpt}\n"
            "Answer:"
        )
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        prompt_length = inputs["input_ids"].shape[1]
        with torch.no_grad():
            # Greedy decoding keeps the verdict reproducible; only a few
            # tokens are needed for a one-word answer.
            output = model.generate(**inputs, max_new_tokens=8, do_sample=False)

        # generate() returns prompt + continuation. The prompt itself contains
        # the keyword, so only the newly generated tokens may be inspected;
        # otherwise every file would be reported as a match.
        answer = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)
        if answer.strip().lower().startswith("yes"):
            found_pdfs.append(file_name)
    return found_pdfs
|
|
|
|
|
# --- Streamlit page: collect a folder and a keyword, then list matching PDFs.
st.title("PDF Keyword Search with LLaMA 4-bit Model")

folder_path = st.text_input("Enter the folder path containing PDFs:")
keyword = st.text_input("Enter the keyword to search for:")

if st.button("Search"):
    if not (folder_path and keyword):
        st.error("Please provide both the folder path and the keyword.")
    elif not os.path.isdir(folder_path):
        # Without this guard os.listdir() inside the search raises and the
        # user sees a raw traceback instead of an actionable message.
        st.error(f"The folder '{folder_path}' does not exist or is not a directory.")
    else:
        found_pdfs = search_keyword_in_pdfs(keyword, folder_path)
        if found_pdfs:
            st.write(f"The keyword '{keyword}' was found in the following PDF files:")
            for pdf in found_pdfs:
                st.write(f"- {pdf}")
        else:
            st.write(f"The keyword '{keyword}' was not found in any PDFs.")
|
|