Spaces:

fajjos
/

pdf_model

Sleeping

App Files Files Community

fajjos committed on Dec 9, 2024

Commit

a491234

verified ·

1 Parent(s): 72c8ec0

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -55

app.py CHANGED Viewed

@@ -1,78 +1,62 @@
 import os
 import streamlit as st
-from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 from PyPDF2 import PdfReader
 import torch
-from typing import List
# Load the model and tokenizer from Hugging Face
model_name = "fajjos/pdf_model"  # Replace with your model name


@st.cache_resource
def _load_model_and_tokenizer(name: str):
    """Load the seq2seq model and its tokenizer once per server process.

    Streamlit re-executes the whole script on every widget interaction, so
    loading at module top level would reload the weights on each rerun.
    ``st.cache_resource`` keeps the loaded objects alive across reruns.

    Args:
        name: Hugging Face model identifier or local path.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    loaded_model = AutoModelForSeq2SeqLM.from_pretrained(name)
    loaded_tokenizer = AutoTokenizer.from_pretrained(name)
    return loaded_model, loaded_tokenizer


# Module-level names are kept so the rest of the script can keep using them.
model, tokenizer = _load_model_and_tokenizer(model_name)
# Function to extract text from a single PDF
def extract_text_from_pdf(pdf_file: str) -> str:
    """Extract the text of every page of a single PDF file using PyPDF2.

    Args:
        pdf_file: Path to the PDF file; handed directly to ``PdfReader``.

    Returns:
        The text of all pages concatenated in page order, with no separator
        inserted between pages.
    """
    pdf_reader = PdfReader(pdf_file)
    # Defensive: treat a falsy extract_text() result as empty text so a single
    # page without extractable text cannot break the concatenation.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
# Function to search for a keyword in the extracted PDF texts
def search_keyword_in_pdfs(keyword: str, pdf_texts: dict) -> List[str]:
    """Ask the model whether ``keyword`` appears in each PDF's text.

    Args:
        keyword: The keyword to look for.
        pdf_texts: Mapping of PDF file name to its extracted text.

    Returns:
        The names of the PDFs whose model response mentions the keyword
        (compared case-insensitively), in the iteration order of ``pdf_texts``.
    """
    found_pdfs = []
    keyword_lower = keyword.lower()
    for pdf_name, pdf_text in pdf_texts.items():
        prompt = f"Does the keyword '{keyword}' appear in the following text? If yes, provide details.\n\n{pdf_text}"
        # Send the inputs to whichever device the model actually lives on
        # instead of hard-coding "cuda" (the model is never moved to the GPU
        # in this file), and truncate so long PDFs do not exceed the
        # tokenizer's configured maximum input length.
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)
        # Inference only: no gradients needed. Passing **inputs also forwards
        # the attention mask. A short answer suffices, so cap generation well
        # below the previous 20000 new tokens.
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=200)
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # NOTE(review): a negative answer that echoes the keyword (e.g.
        # "No, 'x' does not appear") would still count as a match here.
        if keyword_lower in response.lower():
            found_pdfs.append(pdf_name)
    return found_pdfs
# Function to process all PDFs in a specified folder
def process_pdfs_in_folder(folder_path: str) -> dict:
    """Extract the text of every PDF directly inside ``folder_path``.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A dict mapping each PDF file name to its extracted text. Entries are
        inserted in sorted file-name order so results are deterministic.

    Raises:
        OSError: Propagated from ``os.listdir`` when the folder cannot be read.
    """
    pdf_texts = {}
    # os.listdir order is arbitrary; sort for stable, reproducible output.
    for file_name in sorted(os.listdir(folder_path)):
        # Case-insensitive match so "REPORT.PDF" is not silently skipped.
        if not file_name.lower().endswith(".pdf"):
            continue
        file_path = os.path.join(folder_path, file_name)
        pdf_texts[file_name] = extract_text_from_pdf(file_path)
    return pdf_texts
# Streamlit UI: collect the folder path and keyword, then run the search.
st.title("PDF Keyword Search")
folder_path = st.text_input("Enter the folder path containing PDFs:").strip()
keyword = st.text_input("Enter the keyword to search for:")

search_clicked = st.button("Search")
if search_clicked and not (folder_path and keyword):
    # Both inputs are required before any PDF is touched.
    st.error("Please provide both the folder path and the keyword.")
elif search_clicked:
    try:
        # Extract text from every PDF in the folder, then query the model.
        pdf_texts = process_pdfs_in_folder(folder_path)
        found_pdfs = search_keyword_in_pdfs(keyword, pdf_texts)
        if not found_pdfs:
            st.write(f"The keyword '{keyword}' was not found in any PDFs in the folder '{folder_path}'.")
        else:
            st.write(f"The keyword '{keyword}' was found in the following PDF files:")
            for pdf in found_pdfs:
                st.write(f"- {pdf}")
    except Exception as e:
        # Top-level UI boundary: show the failure in the page.
        st.error(f"Error: {e}")

 import os
 import streamlit as st
+from transformers import AutoTokenizer, AutoModelForCausalLM
 from PyPDF2 import PdfReader
 import torch
+import bitsandbytes as bnb  # For 4-bit quantization
# Device configuration
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the tokenizer and the quantized LLaMA model
model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit"


@st.cache_resource
def _load_tokenizer_and_model(name: str, target_device: str):
    """Load the tokenizer and the 4-bit model once per server process.

    Streamlit re-executes the whole script on every widget interaction, so
    loading at module top level would reload the weights on each rerun.
    ``st.cache_resource`` keeps the loaded objects alive across reruns.

    Args:
        name: Hugging Face model identifier or local path.
        target_device: ``"cuda"`` to let the library place the model
            automatically, anything else to pin the whole model to the CPU.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(name)
    loaded_model = AutoModelForCausalLM.from_pretrained(
        name,
        load_in_4bit=True,    # Enable 4-bit quantization
        # NOTE(review): 4-bit bitsandbytes loading on a CPU-only host may not
        # be supported by the installed versions - confirm the CPU fallback.
        device_map="auto" if target_device == "cuda" else {"": "cpu"},
    )
    return loaded_tokenizer, loaded_model


# Module-level names are kept so the rest of the script can keep using them.
tokenizer, model = _load_tokenizer_and_model(model_name, device)
# Extract text from a PDF
def extract_text_from_pdf(pdf_file: str) -> str:
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_file: Path to the PDF file; handed directly to ``PdfReader``.

    Returns:
        The text of all pages joined in page order, with no separator
        inserted between pages.
    """
    pdf_reader = PdfReader(pdf_file)
    # Defensive: treat a falsy extract_text() result as empty text so a single
    # page without extractable text cannot break the concatenation.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
# Function to search for a keyword in PDFs
def search_keyword_in_pdfs(keyword: str, folder_path: str) -> list:
    """Ask the model whether ``keyword`` appears in each PDF in a folder.

    Only the first 1000 characters of each PDF's text are sent to the model,
    to limit the input size.

    Args:
        keyword: The keyword to look for.
        folder_path: Directory containing the PDFs (not scanned recursively).

    Returns:
        File names of the PDFs for which the model answered "yes", in sorted
        file-name order.

    Raises:
        OSError: Propagated from ``os.listdir`` when the folder cannot be read.
    """
    found_pdfs = []
    # os.listdir order is arbitrary; sort for stable, reproducible output.
    for file_name in sorted(os.listdir(folder_path)):
        # Case-insensitive match so "REPORT.PDF" is not silently skipped.
        if not file_name.lower().endswith(".pdf"):
            continue
        file_path = os.path.join(folder_path, file_name)
        pdf_text = extract_text_from_pdf(file_path)
        # Prepare prompt for model. A one-word yes/no answer is requested so
        # the reply can be parsed without depending on the keyword being
        # echoed back.
        prompt = (
            f"Check if the keyword '{keyword}' appears in this text:\n"
            f"{pdf_text[:1000]}\n\n"  # Limiting input size for performance
            "Answer with a single word, yes or no.\n"
            "Answer:"
        )
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        with torch.no_grad():
            # A one-word answer needs only a handful of new tokens.
            output = model.generate(**inputs, max_new_tokens=8)
        # For a decoder-only model the generated sequence starts with the
        # prompt tokens. The prompt always contains the keyword, so decoding
        # the full sequence made every PDF look like a match. Decode only the
        # newly generated tokens.
        prompt_length = inputs["input_ids"].shape[1]
        answer = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)
        if answer.strip().lower().startswith("yes"):
            found_pdfs.append(file_name)
    return found_pdfs
# Streamlit interface: collect the folder path and keyword, then run the search.
st.title("PDF Keyword Search with LLaMA 4-bit Model")
# Strip stray whitespace (e.g. from a pasted path) before using the folder.
folder_path = st.text_input("Enter the folder path containing PDFs:").strip()
keyword = st.text_input("Enter the keyword to search for:")
if st.button("Search"):
    if not folder_path or not keyword.strip():
        st.error("Please provide both the folder path and the keyword.")
    elif not os.path.isdir(folder_path):
        # Fail with a readable message instead of a raw traceback.
        st.error(f"The folder '{folder_path}' does not exist or is not a directory.")
    else:
        try:
            found_pdfs = search_keyword_in_pdfs(keyword, folder_path)
        except Exception as e:
            # Top-level UI boundary: unreadable PDFs or model failures are
            # shown in the page rather than crashing the app.
            st.error(f"Error: {e}")
        else:
            if found_pdfs:
                st.write(f"The keyword '{keyword}' was found in the following PDF files:")
                for pdf in found_pdfs:
                    st.write(f"- {pdf}")
            else:
                st.write(f"The keyword '{keyword}' was not found in any PDFs.")