fajjos committed
Commit a491234 · verified · 1 Parent(s): 72c8ec0

Update app.py

Files changed (1):
  1. app.py (+39 -55)
app.py CHANGED
@@ -1,78 +1,62 @@
  import os
  import streamlit as st
- from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
  from PyPDF2 import PdfReader
  import torch
- from typing import List

- # Load the model and tokenizer from Hugging Face
- model_name = "fajjos/pdf_model" # Replace with your model name
- model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
  tokenizer = AutoTokenizer.from_pretrained(model_name)

- # Function to extract text from a single PDF
  def extract_text_from_pdf(pdf_file: str) -> str:
-     """
-     Extracts text from a single PDF file using PyPDF2.
-     """
      pdf_reader = PdfReader(pdf_file)
      text = ""
      for page in pdf_reader.pages:
          text += page.extract_text()
      return text

- # Function to search for a keyword in the extracted PDF texts
- def search_keyword_in_pdfs(keyword: str, pdf_texts: dict) -> List[str]:
-     """
-     Search for the keyword in the uploaded PDFs and return the list of PDF names.
-     """
      found_pdfs = []
-     for pdf_name, pdf_text in pdf_texts.items():
-         prompt = f"Does the keyword '{keyword}' appear in the following text? If yes, provide details.\n\n{pdf_text}"
-         inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
-         outputs = model.generate(inputs.input_ids, max_new_tokens=20000)
-         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-         # If keyword is found in the response
-         if keyword.lower() in response.lower():
-             found_pdfs.append(pdf_name)
-     return found_pdfs
-
- # Function to process all PDFs in a specified folder
- def process_pdfs_in_folder(folder_path: str) -> dict:
-     """
-     Extracts text from all PDFs in the specified folder and stores it in a dictionary.
-     """
-     pdf_texts = {}
      for file_name in os.listdir(folder_path):
-         if file_name.endswith(".pdf"): # Check if the file is a PDF
              file_path = os.path.join(folder_path, file_name)
-             pdf_texts[file_name] = extract_text_from_pdf(file_path)
-     return pdf_texts

- # Streamlit UI for folder path and keyword input
- st.title("PDF Keyword Search")

- folder_path = st.text_input("Enter the folder path containing PDFs:").strip()
  keyword = st.text_input("Enter the keyword to search for:")

  if st.button("Search"):
-     if not folder_path or not keyword:
-         st.error("Please provide both the folder path and the keyword.")
      else:
-         try:
-             # Process all PDFs in the folder
-             pdf_texts = process_pdfs_in_folder(folder_path)
-
-             # Perform keyword search in the extracted texts
-             found_pdfs = search_keyword_in_pdfs(keyword, pdf_texts)
-
-             # Display results
-             if found_pdfs:
-                 st.write(f"The keyword '{keyword}' was found in the following PDF files:")
-                 for pdf in found_pdfs:
-                     st.write(f"- {pdf}")
-             else:
-                 st.write(f"The keyword '{keyword}' was not found in any PDFs in the folder '{folder_path}'.")
-         except Exception as e:
-             st.error(f"Error: {e}")
  import os
  import streamlit as st
+ from transformers import AutoTokenizer, AutoModelForCausalLM
  from PyPDF2 import PdfReader
  import torch
+ import bitsandbytes as bnb # For 4-bit quantization

+ # Device configuration
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # Load the tokenizer and the quantized LLaMA model
+ model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit"
  tokenizer = AutoTokenizer.from_pretrained(model_name)

+ model = AutoModelForCausalLM.from_pretrained(
+     model_name,
+     load_in_4bit=True, # Enable 4-bit quantization
+     device_map="auto" if device == "cuda" else {"": "cpu"}
+ )
+
+ # Extract text from a PDF
  def extract_text_from_pdf(pdf_file: str) -> str:
      pdf_reader = PdfReader(pdf_file)
      text = ""
      for page in pdf_reader.pages:
          text += page.extract_text()
      return text

+ # Function to search for a keyword in PDFs
+ def search_keyword_in_pdfs(keyword: str, folder_path: str) -> list:
      found_pdfs = []
      for file_name in os.listdir(folder_path):
+         if file_name.endswith(".pdf"):
              file_path = os.path.join(folder_path, file_name)
+             pdf_text = extract_text_from_pdf(file_path)
+             # Prepare prompt for model
+             prompt = f"Check if the keyword '{keyword}' appears in this text:\n{pdf_text[:1000]}" # Limiting input size for performance
+             inputs = tokenizer(prompt, return_tensors="pt").to(device)
+             with torch.no_grad():
+                 output = model.generate(**inputs, max_new_tokens=200)
+             response = tokenizer.decode(output[0], skip_special_tokens=True)
+             if keyword.lower() in response.lower():
+                 found_pdfs.append(file_name)
+     return found_pdfs

+ # Streamlit interface
+ st.title("PDF Keyword Search with LLaMA 4-bit Model")

+ folder_path = st.text_input("Enter the folder path containing PDFs:")
  keyword = st.text_input("Enter the keyword to search for:")

  if st.button("Search"):
+     if folder_path and keyword:
+         found_pdfs = search_keyword_in_pdfs(keyword, folder_path)
+         if found_pdfs:
+             st.write(f"The keyword '{keyword}' was found in the following PDF files:")
+             for pdf in found_pdfs:
+                 st.write(f"- {pdf}")
+         else:
+             st.write(f"The keyword '{keyword}' was not found in any PDFs.")
      else:
+         st.error("Please provide both the folder path and the keyword.")
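
Recent transformers releases warn that passing load_in_4bit directly to from_pretrained is deprecated in favor of an explicit BitsAndBytesConfig passed as quantization_config. Below is a minimal sketch of equivalent loading for the same model id, assuming a CUDA GPU, the accelerate package for device_map="auto", and a float16 compute dtype (the dtype choice is an assumption, not part of the commit):

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

    model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit"

    # 4-bit weights via bitsandbytes; compute dtype chosen here as float16
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",  # requires accelerate; 4-bit bitsandbytes loading typically expects a CUDA device
    )

Separately, since the search function only tests whether the keyword appears case-insensitively in the model's response, a direct scan of the extracted text answers the same question deterministically and without a generate call. A sketch; keyword_in_text is a hypothetical helper name, not part of the commit:

    def keyword_in_text(keyword: str, pdf_text: str) -> bool:
        # Case-insensitive substring test on the raw extracted PDF text
        return keyword.lower() in pdf_text.lower()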