bstraehle committed on
Commit
340e058
·
1 Parent(s): 88eb4f9

Update rag.py

Browse files
Files changed (1) hide show
  1. rag.py +25 -19
rag.py CHANGED
@@ -21,10 +21,9 @@ PDF_URL = "https://arxiv.org/pdf/2303.08774.pdf"
21
  WEB_URL = "https://openai.com/research/gpt-4"
22
  YOUTUBE_URL_1 = "https://www.youtube.com/watch?v=--khbXchTeE"
23
  YOUTUBE_URL_2 = "https://www.youtube.com/watch?v=hdhZwyf24mE"
24
- YOUTUBE_URL_3 = "https://www.youtube.com/watch?v=vw-KWfKwvTQ"
25
 
26
- YOUTUBE_DIR = "/data/youtube"
27
- CHROMA_DIR = "/data/chroma"
28
 
29
  MONGODB_ATLAS_CLUSTER_URI = os.environ["MONGODB_ATLAS_CLUSTER_URI"]
30
  MONGODB_DB_NAME = "langchain_db"
@@ -37,45 +36,52 @@ RAG_CHAIN_PROMPT = PromptTemplate(input_variables = ["context", "question"], tem
37
  client = MongoClient(MONGODB_ATLAS_CLUSTER_URI)
38
  collection = client[MONGODB_DB_NAME][MONGODB_COLLECTION_NAME]
39
 
40
- def document_loading_splitting():
41
- # Document loading
42
  docs = []
43
 
44
- # Load PDF
45
  loader = PyPDFLoader(PDF_URL)
46
  docs.extend(loader.load())
47
 
48
- # Load Web
49
  loader = WebBaseLoader(WEB_URL)
50
  docs.extend(loader.load())
51
 
52
- # Load YouTube
53
  loader = GenericLoader(YoutubeAudioLoader([YOUTUBE_URL_1,
54
- YOUTUBE_URL_2,
55
- YOUTUBE_URL_3], YOUTUBE_DIR),
56
  OpenAIWhisperParser())
57
  docs.extend(loader.load())
58
 
59
- # Document splitting
 
 
60
  text_splitter = RecursiveCharacterTextSplitter(chunk_overlap = config["chunk_overlap"],
61
  chunk_size = config["chunk_size"])
62
- split_documents = text_splitter.split_documents(docs)
63
 
64
- return split_documents
65
-
66
- def document_storage_chroma(documents):
67
- Chroma.from_documents(documents = documents,
68
  embedding = OpenAIEmbeddings(disallowed_special = ()),
69
  persist_directory = CHROMA_DIR)
70
 
71
- def document_storage_mongodb(documents):
72
- MongoDBAtlasVectorSearch.from_documents(documents = documents,
73
  embedding = OpenAIEmbeddings(disallowed_special = ()),
74
  collection = collection,
75
  index_name = MONGODB_INDEX_NAME)
76
 
 
 
 
 
 
 
 
 
77
  def document_retrieval_chroma():
78
- return Chroma(embedding_function = OpenAIEmbeddings(),
79
  persist_directory = CHROMA_DIR)
80
 
81
  def document_retrieval_mongodb():
 
21
  WEB_URL = "https://openai.com/research/gpt-4"
22
  YOUTUBE_URL_1 = "https://www.youtube.com/watch?v=--khbXchTeE"
23
  YOUTUBE_URL_2 = "https://www.youtube.com/watch?v=hdhZwyf24mE"
 
24
 
25
+ YOUTUBE_DIR = "/data/yt"
26
+ CHROMA_DIR = "/data/db"
27
 
28
  MONGODB_ATLAS_CLUSTER_URI = os.environ["MONGODB_ATLAS_CLUSTER_URI"]
29
  MONGODB_DB_NAME = "langchain_db"
 
36
  client = MongoClient(MONGODB_ATLAS_CLUSTER_URI)
37
  collection = client[MONGODB_DB_NAME][MONGODB_COLLECTION_NAME]
38
 
39
+ def document_loading():
 
40
  docs = []
41
 
42
+ # PDF
43
  loader = PyPDFLoader(PDF_URL)
44
  docs.extend(loader.load())
45
 
46
+ # Web
47
  loader = WebBaseLoader(WEB_URL)
48
  docs.extend(loader.load())
49
 
50
+ # YouTube
51
  loader = GenericLoader(YoutubeAudioLoader([YOUTUBE_URL_1,
52
+ YOUTUBE_URL_2], YOUTUBE_DIR),
 
53
  OpenAIWhisperParser())
54
  docs.extend(loader.load())
55
 
56
+ return docs
57
+
58
+ def document_splitting(config, docs):
59
  text_splitter = RecursiveCharacterTextSplitter(chunk_overlap = config["chunk_overlap"],
60
  chunk_size = config["chunk_size"])
 
61
 
62
+ return text_splitter.split_documents(docs)
63
+
64
+ def document_storage_chroma(chunks):
65
+ Chroma.from_documents(documents = chunks,
66
  embedding = OpenAIEmbeddings(disallowed_special = ()),
67
  persist_directory = CHROMA_DIR)
68
 
69
+ def document_storage_mongodb(chunks):
70
+ MongoDBAtlasVectorSearch.from_documents(documents = chunks,
71
  embedding = OpenAIEmbeddings(disallowed_special = ()),
72
  collection = collection,
73
  index_name = MONGODB_INDEX_NAME)
74
 
75
+ def rag_batch(config):
76
+ docs = document_loading()
77
+
78
+ chunks = document_splitting(config, docs)
79
+
80
+ document_storage_chroma(chunks)
81
+ document_storage_mongodb(chunks)
82
+
83
  def document_retrieval_chroma():
84
+ return Chroma(embedding_function = OpenAIEmbeddings(disallowed_special = ()),
85
  persist_directory = CHROMA_DIR)
86
 
87
  def document_retrieval_mongodb():