# RAG ingestion script: parse a PDF user guide with LlamaParse and build a
# local Qdrant vector database of its contents.
import os
import pickle
from typing import List

import nest_asyncio
import nltk
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders.directory import DirectoryLoader
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.vectorstores.qdrant import Qdrant
from llama_parse import LlamaParse

# One-time setup: download the 'punkt' tokenizer data used by downstream
# loaders, and patch the event loop so LlamaParse's async client can run
# inside an already-running loop (e.g. in notebooks).
nltk.download('punkt')
nest_asyncio.apply()

# Load environment variables from a local .env file.
load_dotenv()

# API keys (None when unset -- the client libraries validate them lazily).
llamaparse_api_key = os.getenv("LLAMA_CLOUD_API_KEY")
groq_api_key = os.getenv("GROQ_API_KEY")

# Paths and vector-store settings.
parsed_data_file = os.path.join("data", "parsed_data.pkl")  # pickle cache of parsed PDF
output_md = os.path.join("data", "output.md")               # Markdown dump of parsed text
md_directory = "data"
collection_name = "rag"
# Helper: Load or parse PDF | |
def load_or_parse_data(pdf_path):
    """Return LlamaParse documents for *pdf_path*, caching the result on disk.

    On the first run the PDF is parsed via the LlamaParse cloud API and the
    result is pickled to ``parsed_data_file``; subsequent runs load the
    pickle instead of re-parsing (and re-billing the API).

    Args:
        pdf_path: Path to the PDF file to parse.

    Returns:
        The parsed documents as produced by ``LlamaParse.load_data``.
    """
    if os.path.exists(parsed_data_file):
        # Cache hit. NOTE(security): pickle.load executes arbitrary code
        # from the file -- only ever load caches this script wrote itself.
        with open(parsed_data_file, "rb") as f:
            parsed_data = pickle.load(f)
    else:
        parsing_instruction = """The provided document is a user guide or manual.
It contains many images and tables. Be precise while answering questions."""
        parser = LlamaParse(
            api_key=llamaparse_api_key,
            result_type="markdown",
            parsing_instruction=parsing_instruction,
        )  # type: ignore
        parsed_data = parser.load_data(pdf_path)
        # Bug fix: create the cache directory before writing. The original
        # relied on create_vector_database() calling os.makedirs() *after*
        # this function had already run, so a fresh checkout crashed here.
        os.makedirs(os.path.dirname(parsed_data_file), exist_ok=True)
        with open(parsed_data_file, "wb") as f:
            pickle.dump(parsed_data, f)
    return parsed_data
# Main vector DB builder | |
def create_vector_database(pdf_path):
    """Build a local, on-disk Qdrant vector store from a PDF user guide.

    Pipeline: parse the PDF (cached via ``load_or_parse_data``), dump the
    parsed text to a Markdown file, load and chunk the Markdown, embed the
    chunks with FastEmbed, and persist them in a Qdrant collection under
    ``data/local_qdrant``.

    Args:
        pdf_path: Path to the source PDF.

    Returns:
        The populated ``Qdrant`` vector store.

    Raises:
        ValueError: If LlamaParse returned no documents.
        RuntimeError: If the Markdown dump is missing/empty or no documents
            could be loaded from it.
    """
    print("🔧 Starting vector DB creation...")
    parsed_docs = load_or_parse_data(pdf_path)
    if not parsed_docs:
        raise ValueError("❌ No parsed documents returned from LlamaParse!")

    os.makedirs(md_directory, exist_ok=True)

    # Concatenate every non-empty page text into one Markdown file (overwrite).
    with open(output_md, 'w', encoding='utf-8') as f:
        for doc in parsed_docs:
            if hasattr(doc, "text") and doc.text.strip():
                f.write(doc.text.strip() + "\n\n")

    # Sanity check: the Markdown dump must exist and be non-empty.
    if not os.path.exists(output_md) or os.path.getsize(output_md) == 0:
        raise RuntimeError("❌ Markdown file was not created or is empty!")

    # Load documents; DirectoryLoader needs optional parser dependencies
    # (unstructured), so fall back to a plain TextLoader if it fails.
    try:
        loader = DirectoryLoader(md_directory, glob="**/*.md", show_progress=True)
        documents = loader.load()
    except Exception as e:
        # Report the failure reason (the original discarded it silently).
        print(f"⚠️ DirectoryLoader failed ({e}), falling back to TextLoader...")
        documents = TextLoader(output_md, encoding='utf-8').load()
    if not documents:
        raise RuntimeError("❌ No documents loaded from markdown!")

    # Chunk for retrieval: ~2000 chars per chunk with 100-char overlap so
    # context isn't cut mid-sentence at chunk boundaries.
    splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
    docs = splitter.split_documents(documents)
    print(f"✅ Loaded and split {len(docs)} chunks.")

    # Local embedding model (no API key needed).
    embeddings = FastEmbedEmbeddings()  # type: ignore

    print("📦 Creating Qdrant vector DB...")
    qdrant = Qdrant.from_documents(
        documents=docs,
        embedding=embeddings,
        path=os.path.join("data", "local_qdrant"),
        collection_name=collection_name,
    )
    print("✅ Vector DB created successfully.")
    return qdrant