# File size: 3,449 Bytes
# c29df11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import os
import uuid
from functools import lru_cache

import fitz  # PyMuPDF
import gradio as gr
import torch
from dotenv import load_dotenv
from langchain.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load environment variables (python-dotenv reads them from a local .env file, if one exists).
load_dotenv()

# Initialize the model and tokenizer once, at import time.
# get_llm_response() reads `tokenizer` and `model` as module-level globals,
# so these names must stay defined before any request is served.
model_name = "openai-community/gpt2"
# Alternative checkpoint, currently disabled:
# model_name = "google/gemma-2-9b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)  # , use_auth_token=hf_api_key

def get_llm_response(input_prompt, content, prompt, max_new_tokens=200):
    """Generate an answer to ``prompt`` based on ``content``.

    The instruction, the context passage and the question are joined into one
    text prompt that ends with ``Answer:``. The prompt is passed to the
    module-level ``model`` and only the text generated after it is returned.

    Args:
        input_prompt: Instruction text placed at the start of the prompt.
        content: Context passage the answer should be based on.
        prompt: The user's question.
        max_new_tokens: Upper bound on the number of tokens generated after
            the prompt.

    Returns:
        The generated continuation, with special tokens removed and
        surrounding whitespace stripped.
    """
    combined_input = f"{input_prompt}\nContent: {content}\nQuestion: {prompt}\nAnswer:"
    inputs = tokenizer(combined_input, return_tensors="pt")
    prompt_length = inputs["input_ids"].shape[1]

    # Tokenizers without a dedicated pad token (GPT-2's, for example) need an
    # explicit fallback, otherwise generate() has to guess one and warns.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    # `max_length` would count the prompt tokens too, so a long context chunk
    # could leave no room at all for the answer. `max_new_tokens` budgets the
    # answer independently of the prompt size.
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        pad_token_id=pad_token_id,
    )

    # The returned sequence starts with the prompt tokens. Slicing them off is
    # more reliable than searching the decoded text for "Answer:", which may
    # also occur inside the PDF content or the question.
    generated_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

# Function to extract text from PDF file
def extract_text_from_pdf(file):
    """Return the concatenated text of every page of a PDF.

    Args:
        file: Either a binary file-like object exposing ``read()`` or a
            filesystem path to the PDF.

    Returns:
        The extracted text. On any failure (including ``file`` being
        ``None``) a message starting with
        ``"Error occurred while reading PDF file:"`` is returned instead of
        raising, which is the contract callers check for.
    """
    try:
        if file is None:
            # Without this guard PyMuPDF would treat None as a request for a
            # new, empty document and the failure would go unnoticed.
            raise ValueError("no file was provided")

        if hasattr(file, "read"):
            doc = fitz.open(stream=file.read(), filetype="pdf")
        else:
            doc = fitz.open(file)

        # The context manager closes the document even if a page fails.
        with doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        return f"Error occurred while reading PDF file: {e}"

@lru_cache(maxsize=1)
def _get_embeddings():
    """Build the embedding model once and reuse it for every request."""
    return HuggingFaceEmbeddings()


def process_pdf_and_answer_question(pdf_file, question):
    """Answer ``question`` using the most relevant passage of ``pdf_file``.

    The PDF text is split into chunks and indexed in a throw-away Chroma
    collection. The chunk most similar to the question is passed to
    ``get_llm_response`` as context.

    Args:
        pdf_file: The uploaded PDF, in whatever form
            ``extract_text_from_pdf`` accepts.
        question: The user's question.

    Returns:
        The generated answer, or a human-readable message when the PDF cannot
        be read, contains no text, the question is empty, or processing fails.
    """
    # Extract text from uploaded PDF file
    pdf_text = extract_text_from_pdf(pdf_file)

    if not pdf_text or not pdf_text.strip():
        return "No extractable text was found in the PDF file."
    # Match the exact error prefix at the start of the string: a plain
    # substring test would misfire on PDFs that contain "Error occurred".
    if pdf_text.startswith("Error occurred while reading PDF file:"):
        return pdf_text

    # Validate the question before doing any expensive embedding work.
    if not question or not question.strip():
        return "Please provide a valid question."

    vectordb = None
    try:
        # Split text into chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=20,
            length_function=len,
            is_separator_regex=False,
        )
        chunks = text_splitter.create_documents([pdf_text])

        # Each request gets its own uniquely named collection. A shared,
        # persisted collection would keep the chunks of every PDF uploaded
        # so far, and the search could return text from an earlier document.
        vectordb = Chroma.from_documents(
            documents=chunks,
            embedding=_get_embeddings(),
            collection_name=f"pdf_{uuid.uuid4().hex}",
        )

        # Perform question answering
        docs = vectordb.similarity_search(question)
        if not docs:
            return "No relevant content was found in the PDF file."

        input_prompt = "You are an expert in understanding text contents. You will receive an input PDF file and you will have to answer questions based on the input file."
        return get_llm_response(input_prompt, docs[0].page_content, question)
    except Exception as e:
        return f"Error occurred during text processing: {e}"
    finally:
        if vectordb is not None:
            try:
                vectordb.delete_collection()
            except Exception:
                # Cleanup is best effort; a failure here must not replace the
                # answer (or error message) already being returned.
                pass

# Create Gradio interface.
# Components come from the top-level `gr` namespace: the legacy `gr.inputs`
# module is deprecated in Gradio 3.x and no longer exists in Gradio 4.x.
# The File component keeps its default `type`, because the accepted values
# differ between those major versions.
iface = gr.Interface(
    fn=process_pdf_and_answer_question,
    inputs=[
        gr.File(label="Upload PDF File"),
        gr.Textbox(lines=2, placeholder="Ask a Question"),
    ],
    outputs="text",
    title="PDF Chatbot",
    description="Upload a PDF file and ask questions about its content."
)

# Start the web server only when the file is run as a script, not on import.
if __name__ == "__main__":
    iface.launch()