pratikshahp committed
Commit c29df11 · verified · 1 Parent(s): 7d2ccd7

Create app.py

Files changed (1)
  app.py +93 -0
app.py ADDED
@@ -0,0 +1,93 @@
+ import gradio as gr
+ import fitz  # PyMuPDF
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ from langchain_community.vectorstores import Chroma
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+ from dotenv import load_dotenv
+
+ # Load environment variables (e.g. an HF token for gated models)
+ load_dotenv()
+
+ # Initialize the model and tokenizer
+ model_name = "openai-community/gpt2"
+ # model_name = "google/gemma-2-9b"  # gated; requires an auth token
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModelForCausalLM.from_pretrained(model_name)
+
+ def get_llm_response(input_prompt, content, prompt):
+     combined_input = f"{input_prompt}\nContent: {content}\nQuestion: {prompt}\nAnswer:"
+     # Truncate so prompt plus new tokens stay inside GPT-2's 1024-token context
+     inputs = tokenizer(combined_input, return_tensors="pt", truncation=True, max_length=900)
+     outputs = model.generate(
+         **inputs,
+         max_new_tokens=100,
+         num_return_sequences=1,
+         pad_token_id=tokenizer.eos_token_id,
+     )
+     # Decode only the newly generated tokens, skipping the echoed prompt
+     generated = outputs[0][inputs["input_ids"].shape[1]:]
+     return tokenizer.decode(generated, skip_special_tokens=True).strip()
+
+ # Extract text from a PDF file on disk
+ def extract_text_from_pdf(file_path):
+     try:
+         doc = fitz.open(file_path)
+         text = ""
+         for page in doc:
+             text += page.get_text()
+         return text
+     except Exception as e:
+         return f"Error occurred while reading PDF file: {e}"
+
+ def process_pdf_and_answer_question(pdf_file, question):
+     # Extract text from the uploaded PDF file (Gradio passes a file path)
+     pdf_text = extract_text_from_pdf(pdf_file)
+
+     if not pdf_text or "Error occurred" in pdf_text:
+         return pdf_text or "The PDF appears to contain no extractable text."
+
+     try:
+         # Create embeddings
+         embeddings = HuggingFaceEmbeddings()
+
+         # Split text into chunks
+         text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=1000,
+             chunk_overlap=20,
+             length_function=len,
+             is_separator_regex=False,
+         )
+         chunks = text_splitter.create_documents([pdf_text])
+
+         # Build an in-memory Chroma store for this upload; reusing one
+         # persist_directory across uploads would mix chunks from earlier
+         # PDFs into later answers
+         vectordb = Chroma.from_documents(documents=chunks, embedding=embeddings)
+
+         # Answer the question from the most relevant chunk
+         if question:
+             docs = vectordb.similarity_search(question)
+             if not docs:
+                 return "No relevant passage found in the PDF."
+             text = docs[0].page_content
+             input_prompt = "You are an expert in understanding text contents. You will receive an input PDF file and you will have to answer questions based on the input file."
+             return get_llm_response(input_prompt, text, question)
+         else:
+             return "Please provide a valid question."
+     except Exception as e:
+         return f"Error occurred during text processing: {e}"
+
+ # Create Gradio interface
+ iface = gr.Interface(
+     fn=process_pdf_and_answer_question,
+     inputs=[
+         gr.File(type="filepath", label="Upload PDF File"),
+         gr.Textbox(lines=2, placeholder="Ask a Question"),
+     ],
+     outputs="text",
+     title="PDF Chatbot",
+     description="Upload a PDF file and ask questions about its content.",
+ )
+
+ if __name__ == "__main__":
+     iface.launch()
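
To sanity-check the retrieval step of this commit in isolation, here is a minimal sketch that exercises the same chunk → embed → similarity-search path outside of Gradio. The module names follow the imports above; the sample text, query string, and chunk sizes are arbitrary choices for illustration, not taken from the repo.

# retrieval_check.py - standalone sketch of the chunk -> embed -> search path
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

sample_text = "Chroma is a vector store. It indexes embedded text chunks for similarity search."

splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
chunks = splitter.create_documents([sample_text])

# Same in-memory construction as in app.py
vectordb = Chroma.from_documents(documents=chunks, embedding=HuggingFaceEmbeddings())

# The top match should be the chunk mentioning similarity search
docs = vectordb.similarity_search("What does Chroma index?", k=1)
print(docs[0].page_content)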
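The Space will also need its dependencies declared. A plausible requirements.txt for this app follows; the package list is inferred from the imports (torch and sentence-transformers are pulled in by transformers and HuggingFaceEmbeddings respectively), and it is an assumption, not part of this commit. No version pins are suggested.

gradio
PyMuPDF
transformers
torch
langchain-community
langchain-text-splitters
chromadb
sentence-transformers
python-dotenv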