pdf files
Browse files
- mapping.py +3 -1
- models/openai_vs.pkl +2 -2
- requirements.txt +2 -1
- utils.py +6 -1
mapping.py
CHANGED
@@ -116,4 +116,6 @@ FILE_URL_MAPPING = {
|
|
116 |
'https://www.coursera.org/learn/3d-printing-revolution/supplement/HZXB5/module-1-overview',
|
117 |
|
118 |
'docs/02_module-1-what-is-3d-printing/02_3d-printing-insights/07_what-would-you-make-exercise_peer_assignment_instructions.html':
|
119 |
-
'https://www.coursera.org/learn/3d-printing-revolution/peer/t8bqq/what-would-you-make-exercise'
|
|
|
|
|
|
116 |
'https://www.coursera.org/learn/3d-printing-revolution/supplement/HZXB5/module-1-overview',
|
117 |
|
118 |
'docs/02_module-1-what-is-3d-printing/02_3d-printing-insights/07_what-would-you-make-exercise_peer_assignment_instructions.html':
|
119 |
+
'https://www.coursera.org/learn/3d-printing-revolution/peer/t8bqq/what-would-you-make-exercise',
|
120 |
+
'docs/020 3DP Trend report 2023_DEF_BB.pdf':'https://www.hubs.com/get/trends/'
|
121 |
+
}
|
models/openai_vs.pkl
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fba431f53906d67789b21e6b7646f8cb526818db6270f29f427e6ed03e4c42b9
|
3 |
+
size 3029176
|
requirements.txt
CHANGED
@@ -8,4 +8,5 @@ gtts
|
|
8 |
torch
|
9 |
tiktoken
|
10 |
huggingface-hub
|
11 |
-
pymongo
|
|
|
|
8 |
torch
|
9 |
tiktoken
|
10 |
huggingface-hub
|
11 |
+
pymongo
|
12 |
+
pypdf
|
utils.py
CHANGED
@@ -7,7 +7,7 @@ from langchain import HuggingFaceHub
|
|
7 |
from langchain.cache import InMemoryCache
|
8 |
from langchain.chains import ConversationalRetrievalChain
|
9 |
from langchain.chat_models import ChatOpenAI
|
10 |
-
from langchain.document_loaders import DirectoryLoader, TextLoader, UnstructuredHTMLLoader
|
11 |
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceHubEmbeddings
|
12 |
from langchain.memory import ConversationBufferWindowMemory
|
13 |
from langchain.prompts.chat import (
|
@@ -149,6 +149,10 @@ def search_index_from_docs(source_chunks):
|
|
149 |
return search_index
|
150 |
|
151 |
|
|
|
|
|
|
|
|
|
152 |
def get_html_files():
|
153 |
loader = DirectoryLoader('docs', glob="**/*.html", loader_cls=UnstructuredHTMLLoader, recursive=True)
|
154 |
document_list = loader.load()
|
@@ -158,6 +162,7 @@ def get_html_files():
|
|
158 |
def fetch_data_for_embeddings():
|
159 |
document_list = get_text_files()
|
160 |
document_list.extend(get_html_files())
|
|
|
161 |
|
162 |
# use file_url_mapping to set metadata of document to url which has been set as the source
|
163 |
for document in document_list:
|
|
|
7 |
from langchain.cache import InMemoryCache
|
8 |
from langchain.chains import ConversationalRetrievalChain
|
9 |
from langchain.chat_models import ChatOpenAI
|
10 |
+
from langchain.document_loaders import DirectoryLoader, TextLoader, UnstructuredHTMLLoader, PyPDFDirectoryLoader
|
11 |
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceHubEmbeddings
|
12 |
from langchain.memory import ConversationBufferWindowMemory
|
13 |
from langchain.prompts.chat import (
|
|
|
149 |
return search_index
|
150 |
|
151 |
|
152 |
+
def get_pdf_files():
    """Load the PDF files found under the ``docs`` directory.

    Builds a ``PyPDFDirectoryLoader`` rooted at ``docs`` that matches
    ``**/*.pdf`` with ``recursive=True`` and returns the result of its
    ``load()`` call unchanged.
    """
    # NOTE(review): the loader presumably returns a list of documents (the
    # caller passes the result to ``list.extend``) -- confirm against the
    # installed langchain version.
    return PyPDFDirectoryLoader('docs', glob="**/*.pdf", recursive=True).load()
|
156 |
def get_html_files():
|
157 |
loader = DirectoryLoader('docs', glob="**/*.html", loader_cls=UnstructuredHTMLLoader, recursive=True)
|
158 |
document_list = loader.load()
|
|
|
162 |
def fetch_data_for_embeddings():
|
163 |
document_list = get_text_files()
|
164 |
document_list.extend(get_html_files())
|
165 |
+
document_list.extend(get_pdf_files())
|
166 |
|
167 |
# use file_url_mapping to set metadata of document to url which has been set as the source
|
168 |
for document in document_list:
|