rohan13 committed on
Commit
6e37923
1 Parent(s): 9eb0222
Files changed (4) hide show
  1. mapping.py +3 -1
  2. models/openai_vs.pkl +2 -2
  3. requirements.txt +2 -1
  4. utils.py +6 -1
mapping.py CHANGED
@@ -116,4 +116,6 @@ FILE_URL_MAPPING = {
116
  'https://www.coursera.org/learn/3d-printing-revolution/supplement/HZXB5/module-1-overview',
117
 
118
  'docs/02_module-1-what-is-3d-printing/02_3d-printing-insights/07_what-would-you-make-exercise_peer_assignment_instructions.html':
119
- 'https://www.coursera.org/learn/3d-printing-revolution/peer/t8bqq/what-would-you-make-exercise'}
 
 
 
116
  'https://www.coursera.org/learn/3d-printing-revolution/supplement/HZXB5/module-1-overview',
117
 
118
  'docs/02_module-1-what-is-3d-printing/02_3d-printing-insights/07_what-would-you-make-exercise_peer_assignment_instructions.html':
119
+ 'https://www.coursera.org/learn/3d-printing-revolution/peer/t8bqq/what-would-you-make-exercise',
120
+ 'docs/020 3DP Trend report 2023_DEF_BB.pdf':'https://www.hubs.com/get/trends/'
121
+ }
models/openai_vs.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:807e1e7a285df094ec49ec67f44438d8a300e017fc1290073dd4c60432c9473b
3
- size 2513084
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fba431f53906d67789b21e6b7646f8cb526818db6270f29f427e6ed03e4c42b9
3
+ size 3029176
requirements.txt CHANGED
@@ -8,4 +8,5 @@ gtts
8
  torch
9
  tiktoken
10
  huggingface-hub
11
- pymongo
 
 
8
  torch
9
  tiktoken
10
  huggingface-hub
11
+ pymongo
12
+ pypdf
utils.py CHANGED
@@ -7,7 +7,7 @@ from langchain import HuggingFaceHub
7
  from langchain.cache import InMemoryCache
8
  from langchain.chains import ConversationalRetrievalChain
9
  from langchain.chat_models import ChatOpenAI
10
- from langchain.document_loaders import DirectoryLoader, TextLoader, UnstructuredHTMLLoader
11
  from langchain.embeddings import OpenAIEmbeddings, HuggingFaceHubEmbeddings
12
  from langchain.memory import ConversationBufferWindowMemory
13
  from langchain.prompts.chat import (
@@ -149,6 +149,10 @@ def search_index_from_docs(source_chunks):
149
  return search_index
150
 
151
 
 
 
 
 
152
  def get_html_files():
153
  loader = DirectoryLoader('docs', glob="**/*.html", loader_cls=UnstructuredHTMLLoader, recursive=True)
154
  document_list = loader.load()
@@ -158,6 +162,7 @@ def get_html_files():
158
  def fetch_data_for_embeddings():
159
  document_list = get_text_files()
160
  document_list.extend(get_html_files())
 
161
 
162
  # use file_url_mapping to set metadata of document to url which has been set as the source
163
  for document in document_list:
 
7
  from langchain.cache import InMemoryCache
8
  from langchain.chains import ConversationalRetrievalChain
9
  from langchain.chat_models import ChatOpenAI
10
+ from langchain.document_loaders import DirectoryLoader, TextLoader, UnstructuredHTMLLoader, PyPDFDirectoryLoader
11
  from langchain.embeddings import OpenAIEmbeddings, HuggingFaceHubEmbeddings
12
  from langchain.memory import ConversationBufferWindowMemory
13
  from langchain.prompts.chat import (
 
149
  return search_index
150
 
151
 
152
+ def get_pdf_files():
153
+ loader = PyPDFDirectoryLoader('docs', glob="**/*.pdf", recursive=True)
154
+ document_list = loader.load()
155
+ return document_list
156
  def get_html_files():
157
  loader = DirectoryLoader('docs', glob="**/*.html", loader_cls=UnstructuredHTMLLoader, recursive=True)
158
  document_list = loader.load()
 
162
  def fetch_data_for_embeddings():
163
  document_list = get_text_files()
164
  document_list.extend(get_html_files())
165
+ document_list.extend(get_pdf_files())
166
 
167
  # use file_url_mapping to set metadata of document to url which has been set as the source
168
  for document in document_list: