rohan13 committed on
Commit
e0086ee
·
1 Parent(s): 9d48540

add pdf files

Browse files
Files changed (1) hide show
  1. utils.py +6 -1
utils.py CHANGED
@@ -7,7 +7,7 @@ from langchain import HuggingFaceHub
7
  from langchain.cache import InMemoryCache
8
  from langchain.chains import ConversationalRetrievalChain
9
  from langchain.chat_models import ChatOpenAI
10
- from langchain.document_loaders import DirectoryLoader, TextLoader, UnstructuredHTMLLoader
11
  from langchain.embeddings import OpenAIEmbeddings, HuggingFaceHubEmbeddings
12
  from langchain.memory import ConversationBufferWindowMemory
13
  from langchain.prompts.chat import (
@@ -151,6 +151,10 @@ def search_index_from_docs(source_chunks):
151
  return search_index
152
 
153
 
 
 
 
 
154
  def get_html_files():
155
  loader = DirectoryLoader('docs', glob="**/*.html", loader_cls=UnstructuredHTMLLoader, recursive=True)
156
  document_list = loader.load()
@@ -160,6 +164,7 @@ def get_html_files():
160
  def fetch_data_for_embeddings():
161
  document_list = get_text_files()
162
  document_list.extend(get_html_files())
 
163
 
164
  # use file_url_mapping to set metadata of document to url which has been set as the source
165
  for document in document_list:
 
7
  from langchain.cache import InMemoryCache
8
  from langchain.chains import ConversationalRetrievalChain
9
  from langchain.chat_models import ChatOpenAI
10
+ from langchain.document_loaders import DirectoryLoader, TextLoader, UnstructuredHTMLLoader, PyPDFDirectoryLoader
11
  from langchain.embeddings import OpenAIEmbeddings, HuggingFaceHubEmbeddings
12
  from langchain.memory import ConversationBufferWindowMemory
13
  from langchain.prompts.chat import (
 
151
  return search_index
152
 
153
 
154
+ def get_pdf_files():
155
+ loader = PyPDFDirectoryLoader('docs', glob="**/*.pdf", recursive=True)
156
+ document_list = loader.load()
157
+ return document_list
158
  def get_html_files():
159
  loader = DirectoryLoader('docs', glob="**/*.html", loader_cls=UnstructuredHTMLLoader, recursive=True)
160
  document_list = loader.load()
 
164
  def fetch_data_for_embeddings():
165
  document_list = get_text_files()
166
  document_list.extend(get_html_files())
167
+ document_list.extend(get_pdf_files())
168
 
169
  # use file_url_mapping to set metadata of document to url which has been set as the source
170
  for document in document_list: