import logging
import os
import sys
from pathlib import Path

import requests

from llama_hub.youtube_transcript import YoutubeTranscriptReader
from llama_index import download_loader, PromptTemplate
from llama_index.indices.vector_store.base import VectorStoreIndex
from llama_index.storage.storage_context import StorageContext
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
from pymongo import MongoClient

PDF_URL       = "https://arxiv.org/pdf/2303.08774.pdf"
WEB_URL       = "https://openai.com/research/gpt-4"
YOUTUBE_URL_1 = "https://www.youtube.com/watch?v=--khbXchTeE"
YOUTUBE_URL_2 = "https://www.youtube.com/watch?v=hdhZwyf24mE"

MONGODB_ATLAS_CLUSTER_URI = os.environ["MONGODB_ATLAS_CLUSTER_URI"]
MONGODB_DB_NAME           = "llamaindex_db"
MONGODB_COLLECTION_NAME   = "gpt-4"
MONGODB_INDEX_NAME        = "default"

logging.basicConfig(stream = sys.stdout, level = logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream = sys.stdout))

def load_documents():
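    """Load the source documents: the GPT-4 technical report PDF, the OpenAI
    GPT-4 research page, and the transcripts of two YouTube videos."""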
    docs = []
    
    # PDF
    PDFReader = download_loader("PDFReader")
    loader = PDFReader()
    out_dir = Path("data")
    
    if not out_dir.exists():
        os.makedirs(out_dir)
    
    out_path = out_dir / "gpt-4.pdf"
    
    if not out_path.exists():
        r = requests.get(PDF_URL)
        with open(out_path, "wb") as f:
            f.write(r.content)

    docs.extend(loader.load_data(file = Path(out_path)))
    #print("docs = " + str(len(docs)))
    
    # Web
    SimpleWebPageReader = download_loader("SimpleWebPageReader")
    loader = SimpleWebPageReader()
    docs.extend(loader.load_data(urls = [WEB_URL]))
    #print("docs = " + str(len(docs)))

    # YouTube
    loader = YoutubeTranscriptReader()
    docs.extend(loader.load_data(ytlinks = [YOUTUBE_URL_1,
                                            YOUTUBE_URL_2]))
    #print("docs = " + str(len(docs)))
    
    return docs

def store_documents(config, docs):
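    """Embed docs and write them to the MongoDB Atlas collection by building
    a VectorStoreIndex on top of the Atlas vector store."""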
    storage_context = StorageContext.from_defaults(
        vector_store = get_vector_store())
    
    VectorStoreIndex.from_documents(
        docs,
        storage_context = storage_context
    )

def get_vector_store():
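    """Return a MongoDBAtlasVectorSearch handle for the configured cluster,
    database, collection and search index."""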
    return MongoDBAtlasVectorSearch(
        MongoClient(MONGODB_ATLAS_CLUSTER_URI),
        db_name = MONGODB_DB_NAME,
        collection_name = MONGODB_COLLECTION_NAME,
        index_name = MONGODB_INDEX_NAME
    )

def rag_ingestion_llamaindex(config):
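    """Ingestion pipeline: load the source documents and store them in Atlas."""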
    docs = load_documents()
    
    store_documents(config, docs)

def rag_retrieval(config, prompt):
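    """Answer prompt with a query engine that retrieves the config["k"] most
    similar chunks from the existing Atlas vector store."""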
    index = VectorStoreIndex.from_vector_store(
        vector_store = get_vector_store())

    query_engine = index.as_query_engine(
        similarity_top_k = config["k"]
    )
 
    return query_engine.query(prompt)
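
# Illustrative entry point, not part of the original module: it assumes the
# MONGODB_ATLAS_CLUSTER_URI environment variable is set and that an Atlas
# vector search index named "default" already exists on llamaindex_db.gpt-4.
# The config layout ({"k": ...}) is inferred from rag_retrieval() above.
if __name__ == "__main__":
    config = {"k": 5}                    # retrieval depth used by rag_retrieval()

    rag_ingestion_llamaindex(config)     # one-off ingestion of PDF, web and YouTube sources
    print(rag_retrieval(config, "What are the main capabilities of GPT-4?"))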