Spaces:

fajjos
/

pdf_model

Sleeping

App Files Files Community

pdf_model / app.py

fajjos

Create app.py

72c8ec0 verified 7 months ago

raw

history blame

3.01 kB

	import os
	import streamlit as st
	from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
	from PyPDF2 import PdfReader
	import torch
	from typing import List

	# Load the model and tokenizer from Hugging Face
	model_name = "fajjos/pdf_model"  # Replace with your model name


	@st.cache_resource
	def _load_model_and_tokenizer(name: str):
	    """Load the seq2seq model and its tokenizer once per server process.

	    Streamlit re-executes this script on every widget interaction, so the
	    loaded objects are cached instead of being re-instantiated on each rerun.

	    Args:
	        name: Hugging Face Hub repo id (or local path) of the checkpoint.

	    Returns:
	        A ``(model, tokenizer)`` tuple. The model is placed on the GPU when
	        CUDA is available and left on the CPU otherwise.
	    """
	    device = "cuda" if torch.cuda.is_available() else "cpu"
	    loaded_model = AutoModelForSeq2SeqLM.from_pretrained(name).to(device)
	    loaded_tokenizer = AutoTokenizer.from_pretrained(name)
	    return loaded_model, loaded_tokenizer


	model, tokenizer = _load_model_and_tokenizer(model_name)

	# Function to extract text from a single PDF
	def extract_text_from_pdf(pdf_file: str) -> str:
	    """Extract the text of every page of a single PDF using PyPDF2.

	    Args:
	        pdf_file: Path of the PDF file; passed straight to ``PdfReader``.

	    Returns:
	        The text of all pages concatenated in page order, with no separator
	        inserted between pages.
	    """
	    pdf_reader = PdfReader(pdf_file)
	    # `or ""` keeps the join safe if extract_text() returns None for a page
	    # instead of a string.
	    return "".join(page.extract_text() or "" for page in pdf_reader.pages)

	# Function to search for a keyword in the extracted PDF texts
	def search_keyword_in_pdfs(
	    keyword: str, pdf_texts: dict, max_new_tokens: int = 20000
	) -> List[str]:
	    """Ask the model about each PDF's text and collect the PDFs that match.

	    For every entry in ``pdf_texts`` a prompt containing the keyword and the
	    PDF text is sent to the module-level ``model``. A PDF counts as a match
	    when the decoded response contains the keyword (case-insensitive).

	    Args:
	        keyword: The keyword to look for.
	        pdf_texts: Mapping of PDF name to its extracted text.
	        max_new_tokens: Upper bound on generated tokens per PDF.

	    Returns:
	        Names of the matching PDFs, in the iteration order of ``pdf_texts``.
	        An empty keyword yields an empty list.
	    """
	    if not keyword:
	        # An empty string is a substring of every response, so without this
	        # guard every PDF would be reported as a match.
	        return []

	    needle = keyword.lower()
	    found_pdfs = []
	    for pdf_name, pdf_text in pdf_texts.items():
	        # NOTE(review): the whole PDF text goes into a single prompt with no
	        # truncation or chunking; long documents may exceed what the model
	        # can handle - confirm the intended limit.
	        prompt = f"Does the keyword '{keyword}' appear in the following text? If yes, provide details.\n\n{pdf_text}"
	        # Send the inputs to whichever device the model is on; a hard-coded
	        # "cuda" fails when the model is on the CPU or no GPU is present.
	        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
	        # Unpacking passes the attention mask along with the input ids.
	        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
	        response = tokenizer.decode(outputs[0], skip_special_tokens=True)

	        if needle in response.lower():
	            found_pdfs.append(pdf_name)
	    return found_pdfs

	# Function to process all PDFs in a specified folder
	def process_pdfs_in_folder(folder_path: str) -> dict:
	    """Extract the text of every PDF file directly inside a folder.

	    Args:
	        folder_path: Directory to scan (not recursive).

	    Returns:
	        Dictionary mapping each PDF's file name to its extracted text.
	        Empty when the folder contains no PDF files.

	    Raises:
	        OSError: If ``folder_path`` cannot be listed (for example it does
	            not exist or is not a directory).
	    """
	    pdf_texts = {}
	    # Sorted so results come back in a stable order; os.listdir order is
	    # arbitrary.
	    for file_name in sorted(os.listdir(folder_path)):
	        # Case-insensitive so files such as "REPORT.PDF" are included.
	        if not file_name.lower().endswith(".pdf"):
	            continue
	        file_path = os.path.join(folder_path, file_name)
	        # Skip directories whose name happens to end in ".pdf".
	        if not os.path.isfile(file_path):
	            continue
	        pdf_texts[file_name] = extract_text_from_pdf(file_path)
	    return pdf_texts

	# Streamlit front end: collect a folder path and a keyword, then list the
	# PDFs in that folder for which the keyword search reports a match.
	st.title("PDF Keyword Search")

	folder_path = st.text_input("Enter the folder path containing PDFs:").strip()
	keyword = st.text_input("Enter the keyword to search for:")

	if st.button("Search"):
	    if folder_path and keyword:
	        try:
	            # Extract text from every PDF in the folder, then query each one.
	            pdf_texts = process_pdfs_in_folder(folder_path)
	            found_pdfs = search_keyword_in_pdfs(keyword, pdf_texts)

	            if not found_pdfs:
	                st.write(f"The keyword '{keyword}' was not found in any PDFs in the folder '{folder_path}'.")
	            else:
	                st.write(f"The keyword '{keyword}' was found in the following PDF files:")
	                for matched_name in found_pdfs:
	                    st.write(f"- {matched_name}")
	        except Exception as err:
	            # Show any failure in the page instead of letting the script crash.
	            st.error(f"Error: {err}")
	    else:
	        # Both inputs are required before a search can run.
	        st.error("Please provide both the folder path and the keyword.")