NaimaAqeel committed
Commit a42468d · verified · 1 Parent(s): 85852c7

Update app.py

Files changed (1)
  1. app.py +10 -27
app.py CHANGED
@@ -3,7 +3,6 @@ import io
 import fitz  # PyMuPDF
 import PyPDF2
 from docx import Document
-from dotenv import load_dotenv
 import streamlit as st
 from sentence_transformers import SentenceTransformer
 from langchain.prompts import PromptTemplate
@@ -13,16 +12,13 @@ from langchain_community.vectorstores.faiss import FAISS
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.llms import HuggingFaceEndpoint
 
-# Load environment variables from .env file
-load_dotenv()
-
 # Initialize the embedding model
 embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
 
 # Initialize the HuggingFace LLM
 llm = HuggingFaceEndpoint(
     endpoint_url="https://api-inference.huggingface.co/models/gpt-3.5-turbo",
-    model_kwargs={"api_key": os.getenv('HUGGINGFACEHUB_API_TOKEN')}
+    model_kwargs={"api_key": "YOUR_HUGGINGFACE_API_KEY"}
 )
 
 # Initialize the HuggingFace embeddings
@@ -32,14 +28,6 @@ embedding = HuggingFaceEmbeddings()
 st.set_page_config(layout="centered")
 st.markdown("<h1 style='font-size:24px;'>PDF and DOCX ChatBot</h1>", unsafe_allow_html=True)
 
-# Retrieve API key from environment variable
-google_api_key = os.getenv("GOOGLE_API_KEY")
-
-# Check if the API key is available
-if google_api_key is None:
-    st.warning("API key not found. Please set the google_api_key environment variable.")
-    st.stop()
-
 # File Upload
 uploaded_file = st.file_uploader("Upload your PDF or DOCX file", type=["pdf", "docx"])
 
@@ -82,21 +70,21 @@ Question:\n{question}\n
 Answer:
 """
 
-def extract_text_from_docx(docx_path):
+def extract_text_from_docx(docx_file):
     text = ""
     try:
-        doc = Document(docx_path)
+        doc = Document(docx_file)
         text = "\n".join([para.text for para in doc.paragraphs])
     except Exception as e:
         print(f"Error extracting text from DOCX: {e}")
     return text
 
-def extract_text_from_pdf(pdf_path):
+def extract_text_from_pdf(pdf_file):
     text = ""
     try:
-        pdf_document = fitz.open(pdf_path)
-        for page_num in range(pdf_document.page_count):
-            page = pdf_document.load_page(page_num)
+        pdf_document = fitz.open(stream=pdf_file, filetype="pdf")
+        for page_num in range(len(pdf_document)):
+            page = pdf_document[page_num]
             text += page.get_text()
     except Exception as e:
         print(f"Error extracting text from PDF: {e}")
@@ -109,18 +97,13 @@ if uploaded_file is not None:
 
     # Process the uploaded file
     if uploaded_file.name.endswith('.pdf'):
-        pdf_data = uploaded_file.read()
-        pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_data))
-        pdf_pages = pdf_reader.pages
-        context = "\n\n".join(page.extract_text() for page in pdf_pages)
+        context = extract_text_from_pdf(uploaded_file)
     elif uploaded_file.name.endswith('.docx'):
-        docx_data = uploaded_file.read()
-        context = extract_text_from_docx(io.BytesIO(docx_data))
+        context = extract_text_from_docx(uploaded_file)
 
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=200)
     texts = text_splitter.split_text(context)
-    embeddings = HuggingFaceEmbeddings()
-    vector_index = FAISS.from_texts(texts, embeddings).as_retriever()
+    vector_index = FAISS.from_texts(texts, embedding).as_retriever()
 
     user_question = st.text_input("Ask Anything from the Document:", "")
 
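The refactored extract_text_from_pdf hands the upload to PyMuPDF as an in-memory stream rather than a filesystem path. A minimal standalone sketch of the same pattern, assuming a hypothetical sample.pdf in the working directory (not part of this Space):

import fitz  # PyMuPDF

# Hypothetical local file, used only to exercise the stream-based path.
with open("sample.pdf", "rb") as f:
    pdf_bytes = f.read()

# fitz.open(stream=...) accepts raw bytes or a BytesIO-like object, which is
# why the Streamlit upload can now be passed to the helper without a temp file.
doc = fitz.open(stream=pdf_bytes, filetype="pdf")
text = "".join(page.get_text() for page in doc)
print(f"{doc.page_count} pages, {len(text)} characters extracted")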