Spaces:

NaimaAqeel
/

Chatbot

Runtime error

App Files Files Community

NaimaAqeel committed on Jun 5, 2024

Commit

a42468d

verified ·

1 Parent(s): 85852c7

Update app.py

Browse files

Files changed (1) hide show

app.py +10 -27

app.py CHANGED Viewed

@@ -3,7 +3,6 @@ import io
 import fitz  # PyMuPDF
 import PyPDF2
 from docx import Document
-from dotenv import load_dotenv
 import streamlit as st
 from sentence_transformers import SentenceTransformer
 from langchain.prompts import PromptTemplate
@@ -13,16 +12,13 @@ from langchain_community.vectorstores.faiss import FAISS
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.llms import HuggingFaceEndpoint
-# Load environment variables from .env file
-load_dotenv()
 # Initialize the embedding model
 embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
 # Initialize the HuggingFace LLM
 llm = HuggingFaceEndpoint(
     endpoint_url="https://api-inference.huggingface.co/models/gpt-3.5-turbo",
-    model_kwargs={"api_key": os.getenv('HUGGINGFACEHUB_API_TOKEN')}
 )
 # Initialize the HuggingFace embeddings
@@ -32,14 +28,6 @@ embedding = HuggingFaceEmbeddings()
 st.set_page_config(layout="centered")
 st.markdown("<h1 style='font-size:24px;'>PDF and DOCX ChatBot</h1>", unsafe_allow_html=True)
-# Retrieve API key from environment variable
-google_api_key = os.getenv("GOOGLE_API_KEY")
-# Check if the API key is available
-if google_api_key is None:
-    st.warning("API key not found. Please set the google_api_key environment variable.")
-    st.stop()
 # File Upload
 uploaded_file = st.file_uploader("Upload your PDF or DOCX file", type=["pdf", "docx"])
@@ -82,21 +70,21 @@ Question:\n{question}\n
 Answer:
 """
-def extract_text_from_docx(docx_path):
     text = ""
     try:
-        doc = Document(docx_path)
         text = "\n".join([para.text for para in doc.paragraphs])
     except Exception as e:
         print(f"Error extracting text from DOCX: {e}")
     return text
-def extract_text_from_pdf(pdf_path):
     text = ""
     try:
-        pdf_document = fitz.open(pdf_path)
-        for page_num in range(pdf_document.page_count):
-            page = pdf_document.load_page(page_num)
             text += page.get_text()
     except Exception as e:
         print(f"Error extracting text from PDF: {e}")
@@ -109,18 +97,13 @@ if uploaded_file is not None:
     # Process the uploaded file
     if uploaded_file.name.endswith('.pdf'):
-        pdf_data = uploaded_file.read()
-        pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_data))
-        pdf_pages = pdf_reader.pages
-        context = "\n\n".join(page.extract_text() for page in pdf_pages)
     elif uploaded_file.name.endswith('.docx'):
-        docx_data = uploaded_file.read()
-        context = extract_text_from_docx(io.BytesIO(docx_data))
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=200)
     texts = text_splitter.split_text(context)
-    embeddings = HuggingFaceEmbeddings()
-    vector_index = FAISS.from_texts(texts, embeddings).as_retriever()
     user_question = st.text_input("Ask Anything from the Document:", "")

 import fitz  # PyMuPDF
 import PyPDF2
 from docx import Document
 import streamlit as st
 from sentence_transformers import SentenceTransformer
 from langchain.prompts import PromptTemplate
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.llms import HuggingFaceEndpoint
 # Initialize the embedding model
 embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
 # Initialize the HuggingFace LLM
 llm = HuggingFaceEndpoint(
     endpoint_url="https://api-inference.huggingface.co/models/gpt-3.5-turbo",
+    model_kwargs={"api_key": "YOUR_HUGGINGFACE_API_KEY"}
 )
 # Initialize the HuggingFace embeddings
 st.set_page_config(layout="centered")
 st.markdown("<h1 style='font-size:24px;'>PDF and DOCX ChatBot</h1>", unsafe_allow_html=True)
 # File Upload
 uploaded_file = st.file_uploader("Upload your PDF or DOCX file", type=["pdf", "docx"])
 Answer:
 """
+def extract_text_from_docx(docx_file):
     text = ""
     try:
+        doc = Document(docx_file)
         text = "\n".join([para.text for para in doc.paragraphs])
     except Exception as e:
         print(f"Error extracting text from DOCX: {e}")
     return text
+def extract_text_from_pdf(pdf_file):
     text = ""
     try:
+        pdf_document = fitz.open(stream=pdf_file, filetype="pdf")
+        for page_num in range(len(pdf_document)):
+            page = pdf_document[page_num]
             text += page.get_text()
     except Exception as e:
         print(f"Error extracting text from PDF: {e}")
     # Process the uploaded file
     if uploaded_file.name.endswith('.pdf'):
+        context = extract_text_from_pdf(uploaded_file)
     elif uploaded_file.name.endswith('.docx'):
+        context = extract_text_from_docx(uploaded_file)
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=200)
     texts = text_splitter.split_text(context)
+    vector_index = FAISS.from_texts(texts, embedding).as_retriever()
     user_question = st.text_input("Ask Anything from the Document:", "")