fajjos committed on
Commit
c4accb1
·
verified ·
1 Parent(s): a491234

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -52
app.py CHANGED
@@ -1,62 +1,66 @@
1
  import os
 
2
  import streamlit as st
3
- from transformers import AutoTokenizer, AutoModelForCausalLM
4
  from PyPDF2 import PdfReader
5
- import torch
6
- import bitsandbytes as bnb # For 4-bit quantization
7
 
8
# Run on the GPU when torch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Checkpoint id shared by the tokenizer and the quantized LLaMA model.
model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# On CPU every module is pinned to "cpu"; on GPU placement is left to the loader.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,  # request 4-bit quantized weights
    device_map={"": "cpu"} if device != "cuda" else "auto",
)
20
-
21
- # Extract text from a PDF
22
def extract_text_from_pdf(pdf_file: str) -> str:
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_file: Path of the PDF to read (passed straight to ``PdfReader``).

    Returns:
        The text of all pages joined in page order. Pages for which the
        extractor yields nothing contribute an empty string.
    """
    pdf_reader = PdfReader(pdf_file)
    # `or ""` guards against a page whose extraction yields None, which would
    # otherwise make the concatenation raise TypeError.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
28
 
29
- # Function to search for a keyword in PDFs
30
def search_keyword_in_pdfs(keyword: str, folder_path: str) -> list:
    """Ask the language model whether each PDF in a folder mentions a keyword.

    Args:
        keyword: Term to look for.
        folder_path: Directory whose ``.pdf`` files are inspected.

    Returns:
        Names of the PDF files for which the model's reply contains the
        keyword (case-insensitive).
    """
    found_pdfs = []
    needle = keyword.lower()
    for file_name in os.listdir(folder_path):
        if not file_name.endswith(".pdf"):
            continue
        file_path = os.path.join(folder_path, file_name)
        pdf_text = extract_text_from_pdf(file_path)
        # Only the first 1000 characters are sent, to keep generation fast.
        prompt = f"Check if the keyword '{keyword}' appears in this text:\n{pdf_text[:1000]}"
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        with torch.no_grad():
            output = model.generate(**inputs, max_new_tokens=200)
        # generate() returns prompt tokens followed by the continuation. The
        # prompt itself contains the keyword, so decoding the whole sequence
        # would make every PDF match; decode only the newly generated part.
        prompt_length = inputs["input_ids"].shape[1]
        response = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)
        # NOTE(review): the reply may still echo the keyword when answering
        # "no"; a direct substring check on pdf_text would be more reliable.
        if needle in response.lower():
            found_pdfs.append(file_name)
    return found_pdfs
45
-
46
- # Streamlit interface
47
# Streamlit interface: collect a folder and a keyword, then list matching PDFs.
st.title("PDF Keyword Search with LLaMA 4-bit Model")

folder_path = st.text_input("Enter the folder path containing PDFs:")
keyword = st.text_input("Enter the keyword to search for:")

if st.button("Search"):
    if not (folder_path and keyword):
        st.error("Please provide both the folder path and the keyword.")
    elif not os.path.isdir(folder_path):
        # Without this check a mistyped path reaches os.listdir and raises,
        # which surfaces to the user as a raw traceback.
        st.error("Invalid folder path. Please enter a valid path.")
    else:
        found_pdfs = search_keyword_in_pdfs(keyword, folder_path)
        if found_pdfs:
            st.write(f"The keyword '{keyword}' was found in the following PDF files:")
            for pdf in found_pdfs:
                st.write(f"- {pdf}")
        else:
            st.write(f"The keyword '{keyword}' was not found in any PDFs.")
 
1
  import os
2
+ from transformers import pipeline
3
  import streamlit as st
 
4
  from PyPDF2 import PdfReader
 
 
5
 
6
+ # Initialize the Hugging Face model pipeline
7
# st.cache is deprecated, and its hash_funcs argument maps *types* to hashers,
# so keying it on the `pipeline` factory function had no effect.
# st.cache_resource is the documented way to keep one shared model instance.
@st.cache_resource
def load_model():
    """Return the text-classification pipeline for ``fajjos/pdf_model``.

    The decorator keeps the created pipeline cached so that Streamlit reruns
    reuse it instead of loading the model again.
    """
    return pipeline("text-classification", model="fajjos/pdf_model")
10
 
11
+ # Extract text from a PDF file
12
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF.

    Reading is best-effort: if opening the file or extracting a page raises,
    the error is shown with ``st.error`` and whatever text was collected
    before the failure is returned (possibly an empty string).

    Args:
        pdf_path: Path of the PDF to read (passed to ``PdfReader``).

    Returns:
        The text of all successfully read pages, joined in page order.
    """
    page_texts = []
    try:
        reader = PdfReader(pdf_path)
        for page in reader.pages:
            # Extract once per page; extraction is the expensive step and the
            # result may be empty/None for pages without a text layer.
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text)
    except Exception as e:
        st.error(f"Error reading {pdf_path}: {e}")
    return "".join(page_texts)
22
 
23
+ # Search for the keyword in PDF files
24
def search_keyword_in_pdfs(folder_path, keyword, model):
    """Return the PDF files in a folder that match a keyword.

    A file is reported when (1) the keyword occurs in its extracted text,
    case-insensitively, and (2) at least one label returned by ``model`` for
    that text contains the keyword. Model failures are shown with
    ``st.error`` and the affected file is skipped.

    Args:
        folder_path: Directory to scan (non-recursive).
        keyword: Term to search for.
        model: Callable classifier returning an iterable of dicts with a
            ``"label"`` key; it is called as ``model(text, truncation=True)``.

    Returns:
        List of matching PDF file names (not full paths).
    """
    needle = keyword.lower()
    # Compare the extension case-insensitively so "REPORT.PDF" is not skipped.
    pdf_files = [f for f in os.listdir(folder_path) if f.lower().endswith(".pdf")]
    matched_files = []

    for pdf_file in pdf_files:
        pdf_path = os.path.join(folder_path, pdf_file)
        text = extract_text_from_pdf(pdf_path)

        if not text or needle not in text.lower():
            continue

        # Use the Hugging Face model for additional validation or relevance.
        try:
            # A whole document usually exceeds the model's maximum input
            # length; truncation keeps the call from failing on long PDFs.
            result = model(text, truncation=True)
            # NOTE(review): this requires the keyword to appear in the
            # classifier's label name; confirm that is the intended filter.
            if any(needle in res["label"].lower() for res in result):
                matched_files.append(pdf_file)
        except Exception as e:
            st.error(f"Error processing {pdf_file} with the model: {e}")
    return matched_files
41
+
42
+ # Streamlit App UI
43
# Page heading.
st.title("PDF Keyword Search")

# The two inputs the search needs: where to look and what to look for.
folder_path = st.text_input("Enter the folder path:")
keyword = st.text_input("Enter the keyword to search:")

# Everything below runs only on the rerun triggered by the button click.
if st.button("Search PDFs"):
    if not os.path.isdir(folder_path):
        st.error("Invalid folder path. Please enter a valid path.")
    elif not keyword:
        st.error("Please enter a keyword.")
    else:
        st.info("Searching... Please wait.")
        model = load_model()  # obtain the classifier pipeline
        matched_files = search_keyword_in_pdfs(folder_path, keyword, model)

        if not matched_files:
            st.warning(f"No PDFs found with the keyword '{keyword}'.")
        else:
            st.success(f"Found the keyword '{keyword}' in the following PDF(s):")
            for pdf_name in matched_files:
                st.write(f"- {pdf_name}")