add pdf files
Browse files
utils.py
CHANGED
@@ -7,7 +7,7 @@ from langchain import HuggingFaceHub
|
|
7 |
from langchain.cache import InMemoryCache
|
8 |
from langchain.chains import ConversationalRetrievalChain
|
9 |
from langchain.chat_models import ChatOpenAI
|
10 |
-
from langchain.document_loaders import DirectoryLoader, TextLoader, UnstructuredHTMLLoader
|
11 |
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceHubEmbeddings
|
12 |
from langchain.memory import ConversationBufferWindowMemory
|
13 |
from langchain.prompts.chat import (
|
@@ -151,6 +151,10 @@ def search_index_from_docs(source_chunks):
|
|
151 |
return search_index
|
152 |
|
153 |
|
|
|
|
|
|
|
|
|
154 |
def get_html_files():
|
155 |
loader = DirectoryLoader('docs', glob="**/*.html", loader_cls=UnstructuredHTMLLoader, recursive=True)
|
156 |
document_list = loader.load()
|
@@ -160,6 +164,7 @@ def get_html_files():
|
|
160 |
def fetch_data_for_embeddings():
|
161 |
document_list = get_text_files()
|
162 |
document_list.extend(get_html_files())
|
|
|
163 |
|
164 |
# use file_url_mapping to set metadata of document to url which has been set as the source
|
165 |
for document in document_list:
|
|
|
7 |
from langchain.cache import InMemoryCache
|
8 |
from langchain.chains import ConversationalRetrievalChain
|
9 |
from langchain.chat_models import ChatOpenAI
|
10 |
+
from langchain.document_loaders import DirectoryLoader, TextLoader, UnstructuredHTMLLoader, PyPDFDirectoryLoader
|
11 |
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceHubEmbeddings
|
12 |
from langchain.memory import ConversationBufferWindowMemory
|
13 |
from langchain.prompts.chat import (
|
|
|
151 |
return search_index
|
152 |
|
153 |
|
154 |
+
def get_pdf_files():
    """Load the PDF files found under the ``docs`` directory.

    Builds a ``PyPDFDirectoryLoader`` rooted at ``docs`` with the glob
    ``**/*.pdf`` and ``recursive=True``, and returns whatever its
    ``load()`` call produces.
    """
    return PyPDFDirectoryLoader('docs', glob="**/*.pdf", recursive=True).load()
|
158 |
def get_html_files():
|
159 |
loader = DirectoryLoader('docs', glob="**/*.html", loader_cls=UnstructuredHTMLLoader, recursive=True)
|
160 |
document_list = loader.load()
|
|
|
164 |
def fetch_data_for_embeddings():
|
165 |
document_list = get_text_files()
|
166 |
document_list.extend(get_html_files())
|
167 |
+
document_list.extend(get_pdf_files())
|
168 |
|
169 |
# use file_url_mapping to set metadata of document to url which has been set as the source
|
170 |
for document in document_list:
|