File size: 2,387 Bytes
72c8ec0
 
a491234
72c8ec0
 
a491234
72c8ec0
a491234
 
 
 
 
72c8ec0
 
a491234
 
 
 
 
 
 
72c8ec0
 
 
 
 
 
 
a491234
 
72c8ec0
 
a491234
72c8ec0
a491234
 
 
 
 
 
 
 
 
 
72c8ec0
a491234
 
72c8ec0
a491234
72c8ec0
 
 
a491234
 
 
 
 
 
 
 
72c8ec0
a491234
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import os
import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM
from PyPDF2 import PdfReader
import torch
import bitsandbytes as bnb  # For 4-bit quantization

# Device configuration
device = "cuda" if torch.cuda.is_available() else "cpu"

# Name of the pre-quantized LLaMA checkpoint to load
model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit"


@st.cache_resource
def _load_tokenizer_and_model(name: str, target_device: str):
    """Load the tokenizer and 4-bit model once per server process.

    Streamlit re-executes this script from top to bottom on every widget
    interaction; caching the loaded objects as a resource avoids reloading
    the model weights on each rerun.

    Args:
        name: Checkpoint identifier passed to ``from_pretrained``.
        target_device: ``"cuda"`` or ``"cpu"``; selects the device map.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(name)
    loaded_model = AutoModelForCausalLM.from_pretrained(
        name,
        load_in_4bit=True,    # Enable 4-bit quantization
        device_map="auto" if target_device == "cuda" else {"": "cpu"},
    )
    return loaded_tokenizer, loaded_model


# Module-level names used by the rest of the script.
tokenizer, model = _load_tokenizer_and_model(model_name, device)

# Extract text from a PDF
def extract_text_from_pdf(pdf_file: str) -> str:
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_file: Path (or file-like object) handed to ``PdfReader``.

    Returns:
        The text of all pages joined in page order, with no separator
        between pages. An empty string if the PDF has no pages.
    """
    pdf_reader = PdfReader(pdf_file)
    # Join once instead of growing a string with ``+=`` per page.
    # ``or ""`` is defensive: a page that yields no text contributes
    # nothing rather than breaking the concatenation.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)

# Function to search for a keyword in PDFs
def search_keyword_in_pdfs(keyword: str, folder_path: str) -> list:
    """Ask the model, per PDF, whether ``keyword`` appears in its text.

    Only the first 1000 characters of each PDF's extracted text are shown
    to the model, so a keyword that occurs later in a document is not seen.

    Args:
        keyword: The keyword to look for.
        folder_path: Directory whose ``.pdf`` files (any letter case in the
            extension) are examined. Sub-directories are not traversed.

    Returns:
        File names (not full paths), in sorted order, of the PDFs for which
        the model answered "yes".

    Raises:
        OSError: Propagated from ``os.listdir`` when ``folder_path`` cannot
            be listed (e.g. it does not exist or is not a directory).
    """
    found_pdfs = []
    # Sorted so the result order does not depend on the filesystem.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.lower().endswith(".pdf"):
            continue
        file_path = os.path.join(folder_path, file_name)
        pdf_text = extract_text_from_pdf(file_path)
        # Prepare prompt for model; input is truncated for performance.
        # The model is asked for a yes/no verdict so the reply can be parsed
        # without looking for the keyword itself in the reply.
        prompt = (
            f"Check if the keyword '{keyword}' appears in this text:\n"
            f"{pdf_text[:1000]}\n"
            "Answer with a single word, 'yes' or 'no'.\n"
            "Answer:"
        )
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        prompt_length = inputs["input_ids"].shape[1]
        with torch.no_grad():
            # A one-word verdict needs only a few new tokens.
            output = model.generate(**inputs, max_new_tokens=5)
        # The generated sequence starts with the prompt tokens. The prompt
        # always contains the keyword, so decoding the whole sequence and
        # searching it for the keyword would match every file. Decode only
        # the tokens produced after the prompt.
        answer = tokenizer.decode(
            output[0][prompt_length:], skip_special_tokens=True
        )
        verdict = answer.strip().lstrip("*'\"`").lower()
        if verdict.startswith("yes"):
            found_pdfs.append(file_name)
    return found_pdfs

# Streamlit interface
st.title("PDF Keyword Search with LLaMA 4-bit Model")

folder_path = st.text_input("Enter the folder path containing PDFs:")
keyword = st.text_input("Enter the keyword to search for:")

if st.button("Search"):
    if not (folder_path and keyword):
        st.error("Please provide both the folder path and the keyword.")
    elif not os.path.isdir(folder_path):
        # Report a bad path as a readable message instead of letting the
        # directory listing inside the search raise and show a traceback.
        st.error(f"'{folder_path}' is not an existing folder.")
    else:
        found_pdfs = search_keyword_in_pdfs(keyword, folder_path)
        if found_pdfs:
            st.write(f"The keyword '{keyword}' was found in the following PDF files:")
            for pdf in found_pdfs:
                st.write(f"- {pdf}")
        else:
            st.write(f"The keyword '{keyword}' was not found in any PDFs.")