RAG-Vereine

Sleeping

App Files Files Community

pratikshahp committed on Jul 10, 2024

Commit

d189514

verified ·

1 Parent(s): d46d62b

Update app.py

Browse files

Files changed (1) hide show

app.py +64 -53

app.py CHANGED Viewed

@@ -5,30 +5,33 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 from langchain.vectorstores import Chroma
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_text_splitters import RecursiveCharacterTextSplitter
-import os
 from dotenv import load_dotenv
 # Load environment variables
 load_dotenv()
-# Initialize the model and tokenizer
 model_name = "openai-community/gpt2"
 # model_name = "google/gemma-2-9b"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForCausalLM.from_pretrained(model_name)  # , use_auth_token=hf_api_key
 def get_llm_response(input_prompt, content, prompt):
     combined_input = f"{input_prompt}\nContent: {content}\nQuestion: {prompt}\nAnswer:"
     inputs = tokenizer(combined_input, return_tensors="pt")
     outputs = model.generate(**inputs, max_length=400, num_return_sequences=1)
     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
     # Extract the answer part from the response
     answer_start = response.find("Answer:") + len("Answer:")
     answer = response[answer_start:].strip()
     return answer
 # Function to extract text from PDF file
 def extract_text_from_pdf(file):
     try:
@@ -40,54 +43,62 @@ def extract_text_from_pdf(file):
     except Exception as e:
         return f"Error occurred while reading PDF file: {e}"
-def process_pdf_and_answer_question(pdf_file, question):
-    # Extract text from uploaded PDF file
-    pdf_text = extract_text_from_pdf(pdf_file)
-    if not pdf_text or "Error occurred" in pdf_text:
-        return pdf_text
-    try:
-        # Create embeddings
-        embeddings = HuggingFaceEmbeddings()
-        # Split text into chunks
-        text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=1000,
-            chunk_overlap=20,
-            length_function=len,
-            is_separator_regex=False,
-        )
-        chunks = text_splitter.create_documents([pdf_text])
-        # Store chunks in ChromaDB
-        persist_directory = 'pdf_embeddings'
-        vectordb = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=persist_directory)
-        vectordb.persist()  # Persist ChromaDB
-        # Load persisted Chroma database
-        vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
-        # Perform question answering
-        if question:
-            docs = vectordb.similarity_search(question)
-            text = docs[0].page_content
-            input_prompt = "You are an expert in understanding text contents. You will receive an input PDF file and you will have to answer questions based on the input file."
-            response = get_llm_response(input_prompt, text, question)
-            return response
-        else:
-            return "Please provide a valid question."
-    except Exception as e:
-        return f"Error occurred during text processing: {e}"
-# Create Gradio interface
-iface = gr.Interface(
-    fn=process_pdf_and_answer_question,
-    inputs=[gr.inputs.File(type="file", label="Upload PDF File"), gr.inputs.Textbox(lines=2, placeholder="Ask a Question")],
-    outputs="text",
-    title="PDF Chatbot",
-    description="Upload a PDF file and ask questions about its content."
-)
 if __name__ == "__main__":
-    iface.launch()

 from langchain.vectorstores import Chroma
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_text_splitters import RecursiveCharacterTextSplitter
+import os
 from dotenv import load_dotenv
 # Load environment variables
 load_dotenv()
+# hf_api_key = os.getenv("HF_TOKEN")
 model_name = "openai-community/gpt2"
 # model_name = "google/gemma-2-9b"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(model_name)  # ,use_auth_token=hf_api_key)
 def get_llm_response(input_prompt, content, prompt):
     """Generate an answer to ``prompt`` from ``content`` with the loaded model.

     Args:
         input_prompt: Instruction text placed at the start of the model input.
         content: Context text (the retrieved PDF chunk) to answer from.
         prompt: The user's question.

     Returns:
         The text generated after the final "Answer:" marker, stripped of
         surrounding whitespace.
     """
     combined_input = f"{input_prompt}\nContent: {content}\nQuestion: {prompt}\nAnswer:"
     inputs = tokenizer(combined_input, return_tensors="pt")
     # A causal LM returns the prompt tokens followed by the continuation.
     # Remember the prompt length so that only the continuation is decoded;
     # searching the decoded text for "Answer:" would match too early when
     # the content or the question itself contains that marker.
     prompt_length = inputs["input_ids"].shape[1]
     # NOTE(review): max_length counts the prompt tokens as well, so a long
     # chunk leaves little or no room for the answer - confirm the budget.
     outputs = model.generate(**inputs, max_length=400, num_return_sequences=1)
     generated_ids = outputs[0][prompt_length:]
     answer = tokenizer.decode(generated_ids, skip_special_tokens=True)
     return answer.strip()
 # Function to extract text from PDF file
 def extract_text_from_pdf(file):
     try:
     except Exception as e:
         return f"Error occurred while reading PDF file: {e}"
+def process_pdf(uploaded_file, prompt):
+    if uploaded_file is not None:
+        # Extract text from uploaded PDF file
+        pdf_text = extract_text_from_pdf(uploaded_file)
+        if pdf_text:
+            try:
+                # Create embeddings
+                embeddings = HuggingFaceEmbeddings()
+                # Split text into chunks
+                text_splitter = RecursiveCharacterTextSplitter(
+                    chunk_size=1000,
+                    chunk_overlap=20,
+                    length_function=len,
+                    is_separator_regex=False,
+                )
+                chunks = text_splitter.create_documents([pdf_text])
+                # Store chunks in ChromaDB
+                persist_directory = 'pdf_embeddings'
+                vectordb = Chroma.from_documents(documents=chunks, embedding=embeddings,
+                                                 persist_directory=persist_directory)
+                vectordb.persist()  # Persist ChromaDB
+                # Load persisted Chroma database
+                vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
+                # Perform question answering
+                if prompt:
+                    docs = vectordb.similarity_search(prompt)
+                    if docs:
+                        text = docs[0].page_content
+                        input_prompt = "You are an expert in understanding text contents. You will receive an input PDF file and you will have to answer questions based on the input file."
+                        response = get_llm_response(input_prompt, text, prompt)
+                        return response
+                    else:
+                        return "No relevant documents found."
+                else:
+                    return "Please enter a question."
+            except Exception as e:
+                return f"Error occurred during text processing: {e}"
+    else:
+        return "Please upload a PDF file."
+def main():
+    gr.Interface(
+        fn=process_pdf,
+        inputs=[gr.components.File(type="file", label="Upload PDF File"),
+                gr.components.Textbox(lines=2, placeholder="Ask a Question")],
+        outputs="text",
+        title="PDF Chatbot",
+        description="Upload a PDF file and ask questions about its content."
+    ).launch()
 if __name__ == "__main__":
+    main()