fajjos committed on
Commit
72c8ec0
·
verified ·
1 Parent(s): be4b0cf

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +78 -0
app.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import streamlit as st
3
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
4
+ from PyPDF2 import PdfReader
5
+ import torch
6
+ from typing import List
7
+
8
# Load the model and tokenizer from Hugging Face
model_name = "fajjos/pdf_model"  # Replace with your model name


@st.cache_resource
def _load_model_and_tokenizer(name: str):
    """Load the seq2seq model and its tokenizer for *name*.

    Streamlit re-executes this script on every widget interaction, so
    the load is wrapped in ``st.cache_resource`` to keep it from being
    repeated on each rerun.

    Args:
        name: Model identifier passed to ``from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    return (
        AutoModelForSeq2SeqLM.from_pretrained(name),
        AutoTokenizer.from_pretrained(name),
    )


# Module-level names are kept because the search function reads them.
model, tokenizer = _load_model_and_tokenizer(model_name)
12
+
13
+ # Function to extract text from a single PDF
14
def extract_text_from_pdf(pdf_file: str) -> str:
    """
    Extracts text from a single PDF file using PyPDF2.

    Args:
        pdf_file: Path of the PDF; handed directly to ``PdfReader``.

    Returns:
        The extracted text of every page, concatenated in page order.
        A page that yields no text contributes an empty string.
    """
    pdf_reader = PdfReader(pdf_file)
    # `or ""` guards against a page yielding a falsy non-string (e.g. None
    # for an image-only page), which would otherwise raise a TypeError.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
23
+
24
+ # Function to search for a keyword in the extracted PDF texts
25
def search_keyword_in_pdfs(
    keyword: str, pdf_texts: dict, *, max_new_tokens: int = 20000
) -> List[str]:
    """
    Search for the keyword in the uploaded PDFs and return the list of PDF names.

    Each PDF's text is embedded in a prompt and sent to the module-level
    ``model``; a PDF counts as a match when the decoded response contains
    the keyword (case-insensitive).

    Args:
        keyword: The term to look for.
        pdf_texts: Mapping of PDF name to its extracted text.
        max_new_tokens: Generation limit passed to ``model.generate``.

    Returns:
        Names of the PDFs whose model response mentions the keyword, in
        the iteration order of ``pdf_texts``.
    """
    keyword_lower = keyword.lower()  # loop-invariant
    found_pdfs = []
    for pdf_name, pdf_text in pdf_texts.items():
        # A PDF with no extracted text cannot contain the keyword, so do
        # not spend a model call on it.
        if not pdf_text:
            continue

        prompt = f"Does the keyword '{keyword}' appear in the following text? If yes, provide details.\n\n{pdf_text}"
        # Put the inputs on whatever device the model lives on instead of a
        # hard-coded "cuda": the model is never moved to the GPU in this
        # file, so "cuda" fails both with and without a GPU present.
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(inputs.input_ids, max_new_tokens=max_new_tokens)
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # NOTE(review): the keyword is also part of the prompt, so a model
        # that echoes the question would match here too -- confirm that the
        # model's output format makes this check reliable.
        if keyword_lower in response.lower():
            found_pdfs.append(pdf_name)
    return found_pdfs
40
+
41
+ # Function to process all PDFs in a specified folder
42
def process_pdfs_in_folder(folder_path: str) -> dict:
    """
    Extracts text from all PDFs in the specified folder and stores it in a dictionary.

    Only regular files directly inside ``folder_path`` whose name ends in
    ``.pdf`` (any letter case) are read; sub-folders are not traversed.

    Args:
        folder_path: Directory to scan.

    Returns:
        Mapping of file name to extracted text, in sorted file-name order.

    Raises:
        OSError: If ``folder_path`` cannot be listed (e.g. it does not exist).
    """
    pdf_texts = {}
    # Sort so the result order does not depend on the arbitrary order
    # returned by os.listdir.
    for file_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)
        # Match ".PDF" as well as ".pdf", and skip directories that merely
        # carry a .pdf suffix.
        if file_name.lower().endswith(".pdf") and os.path.isfile(file_path):
            pdf_texts[file_name] = extract_text_from_pdf(file_path)
    return pdf_texts
52
+
53
# Streamlit UI for folder path and keyword input
st.title("PDF Keyword Search")

folder_path = st.text_input("Enter the folder path containing PDFs:").strip()
# Strip the keyword too, so a whitespace-only entry is rejected by the
# emptiness check below instead of being searched for.
keyword = st.text_input("Enter the keyword to search for:").strip()

if st.button("Search"):
    if not folder_path or not keyword:
        st.error("Please provide both the folder path and the keyword.")
    elif not os.path.isdir(folder_path):
        # Report a bad path explicitly instead of surfacing the raw OSError text.
        st.error(f"Error: '{folder_path}' is not an existing folder.")
    else:
        try:
            # Process all PDFs in the folder
            pdf_texts = process_pdfs_in_folder(folder_path)

            # Perform keyword search in the extracted texts
            found_pdfs = search_keyword_in_pdfs(keyword, pdf_texts)

            # Display results
            if found_pdfs:
                st.write(f"The keyword '{keyword}' was found in the following PDF files:")
                for pdf in found_pdfs:
                    st.write(f"- {pdf}")
            else:
                st.write(f"The keyword '{keyword}' was not found in any PDFs in the folder '{folder_path}'.")
        except Exception as e:
            # Top-level UI boundary: show any failure to the user rather
            # than crashing the app.
            st.error(f"Error: {e}")