import gradio as gr
import fitz  # PyMuPDF
from transformers import AutoTokenizer, AutoModelForCausalLM
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
# Load environment variables (e.g. a Hugging Face token for gated models)
load_dotenv()

# Initialize the model and tokenizer
model_name = "openai-community/gpt2"
# model_name = "google/gemma-2-9b"  # gated model; requires an HF token and much more memory
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)  # pass token=<your HF token> for gated models
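# GPT-2 is a small base language model with a 1024-token context window, so answers
# will be rough; a larger instruction-tuned model gives far better Q&A quality.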
def get_llm_response(input_prompt, content, prompt):
    combined_input = f"{input_prompt}\nContent: {content}\nQuestion: {prompt}\nAnswer:"
    # Truncate so the prompt plus the generated tokens fit in GPT-2's 1024-token context
    inputs = tokenizer(combined_input, return_tensors="pt", truncation=True, max_length=900)
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Extract the answer part from the response; fall back to the full text if the
    # "Answer:" marker was lost to truncation
    marker = response.find("Answer:")
    answer = response[marker + len("Answer:"):].strip() if marker != -1 else response.strip()
    return answer
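# Illustrative call (toy content, not from a real PDF):
# get_llm_response("You are a helpful assistant.",
#                  "The Eiffel Tower is 330 metres tall.",
#                  "How tall is the Eiffel Tower?")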
# Function to extract text from a PDF file (Gradio passes the upload as a file path)
def extract_text_from_pdf(file_path):
    try:
        doc = fitz.open(file_path)
        text = ""
        for page in doc:
            text += page.get_text()
        doc.close()
        return text
    except Exception as e:
        return f"Error occurred while reading PDF file: {e}"
def process_pdf_and_answer_question(pdf_file, question):
    # Extract text from the uploaded PDF file
    pdf_text = extract_text_from_pdf(pdf_file)
    if not pdf_text or "Error occurred" in pdf_text:
        return pdf_text or "The PDF appears to contain no extractable text."
    try:
        # Create embeddings (defaults to sentence-transformers/all-mpnet-base-v2)
        embeddings = HuggingFaceEmbeddings()

        # Split text into overlapping chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=20,
            length_function=len,
            is_separator_regex=False,
        )
        chunks = text_splitter.create_documents([pdf_text])

        # Store chunks in ChromaDB; with a persist_directory set, recent Chroma
        # versions persist automatically, so no explicit persist() call is needed
        persist_directory = "pdf_embeddings"
        vectordb = Chroma.from_documents(
            documents=chunks, embedding=embeddings, persist_directory=persist_directory
        )

        # Reload the persisted Chroma database from disk
        vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

        # Perform question answering over the most relevant chunk
        if question:
            docs = vectordb.similarity_search(question)
            if not docs:
                return "No relevant content found in the PDF."
            text = docs[0].page_content
            input_prompt = (
                "You are an expert in understanding text contents. You will receive an "
                "input PDF file and you will have to answer questions based on the input file."
            )
            response = get_llm_response(input_prompt, text, question)
            return response
        else:
            return "Please provide a valid question."
    except Exception as e:
        return f"Error occurred during text processing: {e}"
# Create the Gradio interface (Gradio 4.x component API)
iface = gr.Interface(
    fn=process_pdf_and_answer_question,
    inputs=[
        gr.File(type="filepath", label="Upload PDF File"),
        gr.Textbox(lines=2, placeholder="Ask a Question"),
    ],
    outputs="text",
    title="PDF Chatbot",
    description="Upload a PDF file and ask questions about its content.",
)

if __name__ == "__main__":
    iface.launch()
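# Run locally with `python app.py`; pass share=True to launch() for a temporary
# public link. On Hugging Face Spaces, app.py is executed and served automatically.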