Spaces:

fajjos
/

pdf_model

Sleeping

App Files Community

fajjos committed on Dec 10, 2024

Commit

c4accb1

verified ·

1 Parent(s): a491234

Update app.py

Browse files

Files changed (1) hide show

app.py +56 -52

app.py CHANGED Viewed

@@ -1,62 +1,66 @@
 import os
 import streamlit as st
-from transformers import AutoTokenizer, AutoModelForCausalLM
 from PyPDF2 import PdfReader
-import torch
-import bitsandbytes as bnb  # For 4-bit quantization
-# Device configuration
-device = "cuda" if torch.cuda.is_available() else "cpu"
-# Load the tokenizer and the quantized LLaMA model
-model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit"
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    load_in_4bit=True,    # Enable 4-bit quantization
-    device_map="auto" if device == "cuda" else {"": "cpu"}
-)
-# Extract text from a PDF
-def extract_text_from_pdf(pdf_file: str) -> str:
-    pdf_reader = PdfReader(pdf_file)
     text = ""
-    for page in pdf_reader.pages:
-        text += page.extract_text()
     return text
-# Function to search for a keyword in PDFs
-def search_keyword_in_pdfs(keyword: str, folder_path: str) -> list:
-    found_pdfs = []
-    for file_name in os.listdir(folder_path):
-        if file_name.endswith(".pdf"):
-            file_path = os.path.join(folder_path, file_name)
-            pdf_text = extract_text_from_pdf(file_path)
-            # Prepare prompt for model
-            prompt = f"Check if the keyword '{keyword}' appears in this text:\n{pdf_text[:1000]}"  # Limiting input size for performance
-            inputs = tokenizer(prompt, return_tensors="pt").to(device)
-            with torch.no_grad():
-                output = model.generate(**inputs, max_new_tokens=200)
-            response = tokenizer.decode(output[0], skip_special_tokens=True)
-            if keyword.lower() in response.lower():
-                found_pdfs.append(file_name)
-    return found_pdfs
-# Streamlit interface
-st.title("PDF Keyword Search with LLaMA 4-bit Model")
-folder_path = st.text_input("Enter the folder path containing PDFs:")
-keyword = st.text_input("Enter the keyword to search for:")
-if st.button("Search"):
-    if folder_path and keyword:
-        found_pdfs = search_keyword_in_pdfs(keyword, folder_path)
-        if found_pdfs:
-            st.write(f"The keyword '{keyword}' was found in the following PDF files:")
-            for pdf in found_pdfs:
-                st.write(f"- {pdf}")
         else:
-            st.write(f"The keyword '{keyword}' was not found in any PDFs.")
     else:
-        st.error("Please provide both the folder path and the keyword.")

 import os
+from transformers import pipeline
 import streamlit as st
 from PyPDF2 import PdfReader
+# Initialize the Hugging Face model pipeline
+@st.cache_resource  # Share one loaded pipeline across reruns instead of reloading it
+def load_model():
+    """Load the ``fajjos/pdf_model`` text-classification pipeline.
+
+    ``st.cache_resource`` replaces the deprecated ``st.cache``. It stores the
+    returned object as-is without hashing or copying it, so the previous
+    ``hash_funcs`` workaround (which was keyed on the ``pipeline`` factory
+    function rather than on a type) is no longer needed.
+
+    Returns:
+        The object returned by ``pipeline("text-classification", ...)``.
+    """
+    return pipeline("text-classification", model="fajjos/pdf_model")
+# Extract text from a PDF file
+def extract_text_from_pdf(pdf_path):
+    """Return the concatenated text of every page in the PDF at *pdf_path*.
+
+    Pages for which ``extract_text()`` yields nothing are skipped. Any
+    exception raised while reading is reported through ``st.error`` and the
+    text gathered up to that point (possibly an empty string) is returned,
+    so a single unreadable file does not abort the caller.
+
+    Args:
+        pdf_path: Value handed to ``PdfReader`` to open the document.
+
+    Returns:
+        The extracted text as a single string.
+    """
     text = ""
+    try:
+        reader = PdfReader(pdf_path)
+        for page in reader.pages:
+            # Extract once per page: the call re-parses the page content each
+            # time, so testing and then appending doubled the work.
+            page_text = page.extract_text()
+            if page_text:  # Skip None / empty results
+                text += page_text
+    except Exception as e:
+        st.error(f"Error reading {pdf_path}: {e}")
     return text
+# Search for the keyword in PDF files
+def search_keyword_in_pdfs(folder_path, keyword, model):
+    """Return the names of PDFs in *folder_path* that match *keyword*.
+
+    A file is reported when its extracted text contains the keyword
+    (case-insensitively) and at least one ``"label"`` in the model's output
+    for that text also contains the keyword.
+
+    Args:
+        folder_path: Directory to scan (not recursive).
+        keyword: Text to look for.
+        model: Callable invoked as ``model(text, truncation=True)`` that
+            returns an iterable of mappings with a ``"label"`` entry.
+
+    Returns:
+        A list of matching file names (not full paths), in sorted order.
+    """
+    needle = keyword.lower()  # Lower-case once rather than per file and per label
+    # Compare the extension case-insensitively so "REPORT.PDF" is not skipped,
+    # and sort because os.listdir() returns entries in arbitrary order.
+    pdf_files = sorted(f for f in os.listdir(folder_path) if f.lower().endswith(".pdf"))
+    matched_files = []
+    for pdf_file in pdf_files:
+        pdf_path = os.path.join(folder_path, pdf_file)
+        text = extract_text_from_pdf(pdf_path)
+        if not text or needle not in text.lower():  # Case-insensitive search
+            continue
+        # Use the Hugging Face model for additional validation or relevance
+        # NOTE(review): this keeps a file only if the keyword occurs inside a
+        # classifier label; confirm that is the intended relevance check.
+        try:
+            # truncation=True clips long documents to the model's maximum
+            # input length instead of failing on every multi-page PDF.
+            result = model(text, truncation=True)
+            if any(needle in res["label"].lower() for res in result):
+                matched_files.append(pdf_file)
+        except Exception as e:
+            st.error(f"Error processing {pdf_file} with the model: {e}")
+    return matched_files
+# Streamlit App UI
+# Top-level script: draws the page, reads the two inputs and, when the
+# button branch runs, validates them and lists the matching PDF files.
+st.title("PDF Keyword Search")
+# User Inputs
+folder_path = st.text_input("Enter the folder path:")
+keyword = st.text_input("Enter the keyword to search:")
+# Button to perform the search
+if st.button("Search PDFs"):
+    # Validate the folder first, then the keyword; each failure has its own
+    # error message in the matching else branch below.
+    if os.path.isdir(folder_path):
+        if keyword:
+            st.info("Searching... Please wait.")
+            model = load_model()  # Load the model
+            matched_files = search_keyword_in_pdfs(folder_path, keyword, model)
+            # An empty result list is reported as a warning, not an error.
+            if matched_files:
+                st.success(f"Found the keyword '{keyword}' in the following PDF(s):")
+                for file in matched_files:
+                    st.write(f"- {file}")
+            else:
+                st.warning(f"No PDFs found with the keyword '{keyword}'.")
         else:
+            st.error("Please enter a keyword.")
     else:
+        st.error("Invalid folder path. Please enter a valid path.")