Spaces:
Build error
Build error
File size: 2,959 Bytes
5f04412 c947c47 5f04412 44a693f e946a29 0ddb69a 5f04412 0ddb69a 5f04412 0ddb69a 5f04412 0ddb69a 5f04412 0ddb69a 5f04412 0ddb69a 5f04412 0ddb69a 5f04412 0ddb69a 5f04412 0ddb69a 5f04412 e946a29 0ddb69a 5f04412 0ddb69a 5f04412 e946a29 0ddb69a 5f04412 e946a29 0ddb69a 5f04412 0ddb69a 5f04412 e946a29 0ddb69a 5f04412 0ddb69a 5f04412 0ddb69a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 |
import logging, os, sys
from llama_hub.youtube_transcript import YoutubeTranscriptReader
from llama_index import download_loader, PromptTemplate
from llama_index.indices.vector_store.base import VectorStoreIndex
from llama_index.storage.storage_context import StorageContext
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
from pathlib import Path
from pymongo import MongoClient
# --- Source documents -------------------------------------------------------
# PDF that LlamaIndexRAG.load_documents downloads and parses.
PDF_URL = "https://arxiv.org/pdf/2303.08774.pdf"
# Web page passed to the SimpleWebPageReader loader.
WEB_URL = "https://openai.com/research/gpt-4"
# YouTube videos whose transcripts are loaded via YoutubeTranscriptReader.
YOUTUBE_URL_1 = "https://www.youtube.com/watch?v=--khbXchTeE"
YOUTUBE_URL_2 = "https://www.youtube.com/watch?v=hdhZwyf24mE"
# --- MongoDB Atlas vector store ---------------------------------------------
# Connection string is read from the environment at import time; importing this
# module raises KeyError if MONGODB_ATLAS_CLUSTER_URI is not set.
MONGODB_ATLAS_CLUSTER_URI = os.environ["MONGODB_ATLAS_CLUSTER_URI"]
# Database, collection and index names handed to MongoDBAtlasVectorSearch.
MONGODB_DB_NAME = "llamaindex_db"
MONGODB_COLLECTION_NAME = "gpt-4"
MONGODB_INDEX_NAME = "default"
# --- Logging ----------------------------------------------------------------
# Configure the root logger to write INFO-and-above records to stdout.
logging.basicConfig(stream = sys.stdout, level = logging.INFO)
# NOTE(review): basicConfig above already attaches a stdout StreamHandler when
# the root logger had no handlers, so this second handler likely makes every
# record print twice -- confirm whether that is intended.
logging.getLogger().addHandler(logging.StreamHandler(stream = sys.stdout))
class LlamaIndexRAG:
    """RAG pipeline that indexes a fixed set of sources into MongoDB Atlas.

    ``ingestion`` loads one PDF, one web page and two YouTube transcripts and
    builds a ``VectorStoreIndex`` on top of the Atlas vector store;
    ``retrieval`` queries that same store.
    """

    def load_documents(self):
        """Load every configured source and return the combined documents.

        The PDF at ``PDF_URL`` is downloaded to ``data/gpt-4.pdf`` the first
        time it is needed and reused from disk afterwards.

        Returns:
            list: Documents produced by the PDF, web page and YouTube
            loaders, in that order.

        Raises:
            urllib.error.URLError: If the PDF download fails (this includes
                ``HTTPError`` for non-2xx responses).
        """
        # Imported locally so this method does not depend on names that the
        # module header does not provide.
        import shutil
        from urllib.request import urlopen

        docs = []

        # PDF
        PDFReader = download_loader("PDFReader")
        pdf_dir = Path("data")
        # exist_ok avoids the check-then-create race of a separate exists().
        pdf_dir.mkdir(parents=True, exist_ok=True)
        pdf_path = pdf_dir / "gpt-4.pdf"
        if not pdf_path.exists():
            # Download to a temporary name and rename on success, so an
            # interrupted download is never mistaken for a cached PDF.
            # urlopen raises HTTPError on non-2xx responses, so an error page
            # is never saved as the PDF either.
            partial_path = pdf_dir / "gpt-4.pdf.part"
            with urlopen(PDF_URL, timeout=60) as response, \
                    open(partial_path, "wb") as pdf_file:
                shutil.copyfileobj(response, pdf_file)
            partial_path.replace(pdf_path)
        docs.extend(PDFReader().load_data(file=pdf_path))

        # Web
        SimpleWebPageReader = download_loader("SimpleWebPageReader")
        docs.extend(SimpleWebPageReader().load_data(urls=[WEB_URL]))

        # YouTube
        docs.extend(
            YoutubeTranscriptReader().load_data(
                ytlinks=[YOUTUBE_URL_1, YOUTUBE_URL_2],
            )
        )

        return docs

    def store_documents(self, config, docs):
        """Build a ``VectorStoreIndex`` from ``docs`` on the Atlas vector store.

        Args:
            config: Currently unused by this method.
            docs: Documents to index, as returned by ``load_documents``.
        """
        storage_context = StorageContext.from_defaults(
            vector_store=self.get_vector_store(),
        )
        VectorStoreIndex.from_documents(
            docs,
            storage_context=storage_context,
        )

    def get_vector_store(self):
        """Return a ``MongoDBAtlasVectorSearch`` bound to the configured cluster.

        A new ``MongoClient`` is created on every call, using the module-level
        URI, database, collection and index names.
        """
        return MongoDBAtlasVectorSearch(
            MongoClient(MONGODB_ATLAS_CLUSTER_URI),
            db_name=MONGODB_DB_NAME,
            collection_name=MONGODB_COLLECTION_NAME,
            index_name=MONGODB_INDEX_NAME,
        )

    def ingestion(self, config):
        """Load all sources and store them in the vector store.

        Args:
            config: Passed through unchanged to ``store_documents``.
        """
        docs = self.load_documents()
        self.store_documents(config, docs)

    def retrieval(self, config, prompt):
        """Answer ``prompt`` using the documents already in the vector store.

        Args:
            config: Mapping that must contain ``"k"``, used as the query
                engine's ``similarity_top_k``.
            prompt: Query passed to the query engine.

        Returns:
            The result of ``query_engine.query(prompt)``.

        Raises:
            KeyError: If ``config`` has no ``"k"`` entry.
        """
        index = VectorStoreIndex.from_vector_store(
            vector_store=self.get_vector_store(),
        )
        query_engine = index.as_query_engine(
            similarity_top_k=config["k"],
        )
        return query_engine.query(prompt)