"""Build a local Qdrant vector DB from a PDF parsed via LlamaParse.

Pipeline: PDF -> LlamaParse (cached as pickle) -> markdown file ->
langchain document loading -> chunking -> FastEmbed embeddings -> Qdrant.
"""
import os
import pickle
from typing import List

from llama_parse import LlamaParse
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.directory import DirectoryLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores.qdrant import Qdrant
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
import nltk
import nest_asyncio
from dotenv import load_dotenv

# One-time setup: punkt tokenizer data for nltk, and nested event-loop
# support (LlamaParse drives asyncio internally, which otherwise fails
# inside notebooks / already-running loops).
nltk.download('punkt')
nest_asyncio.apply()

# Load .env BEFORE reading the API keys below.
load_dotenv()

# Environment keys
llamaparse_api_key = os.getenv("LLAMA_CLOUD_API_KEY")
groq_api_key = os.getenv("GROQ_API_KEY")

# Paths / constants
parsed_data_file = os.path.join("data", "parsed_data.pkl")
output_md = os.path.join("data", "output.md")
md_directory = "data"
collection_name = "rag"


def load_or_parse_data(pdf_path):
    """Parse *pdf_path* with LlamaParse, caching the result as a pickle.

    Returns the list of parsed document objects.

    NOTE(review): the cache is keyed only on the pickle file's existence,
    NOT on `pdf_path` — calling this with a different PDF returns the
    stale cached parse. Delete `parsed_data_file` to force a re-parse.
    """
    if os.path.exists(parsed_data_file):
        with open(parsed_data_file, "rb") as f:
            parsed_data = pickle.load(f)
    else:
        parsing_instruction = """The provided document is a user guide or manual. It contains many images and tables. 
Be precise while answering questions."""
        parser = LlamaParse(
            api_key=llamaparse_api_key,
            result_type="markdown",
            parsing_instruction=parsing_instruction,
        )  # type: ignore
        parsed_data = parser.load_data(pdf_path)
        # BUGFIX: the cache directory must exist before we write the pickle.
        # Previously os.makedirs() only ran later in create_vector_database(),
        # so a fresh checkout crashed here with FileNotFoundError.
        os.makedirs(os.path.dirname(parsed_data_file), exist_ok=True)
        with open(parsed_data_file, "wb") as f:
            pickle.dump(parsed_data, f)
    return parsed_data


def create_vector_database(pdf_path):
    """Build and return a local Qdrant vector store from *pdf_path*.

    Steps: parse (or load cached) PDF, dump the text to a markdown file,
    load + chunk the markdown, embed with FastEmbed, persist to an
    on-disk Qdrant collection under data/local_qdrant.

    Raises:
        ValueError: if LlamaParse returned no documents.
        RuntimeError: if the markdown file is missing/empty, or no
            documents could be loaded from it.
    """
    print("🧠 Starting vector DB creation...")

    parsed_docs = load_or_parse_data(pdf_path)
    if not parsed_docs:
        raise ValueError("❌ No parsed documents returned from LlamaParse!")

    os.makedirs(md_directory, exist_ok=True)

    # Write Markdown content to file (overwrite), skipping empty pages.
    with open(output_md, 'w', encoding='utf-8') as f:
        for doc in parsed_docs:
            if hasattr(doc, "text") and doc.text.strip():
                f.write(doc.text.strip() + "\n\n")

    # Ensure .md file was written
    if not os.path.exists(output_md) or os.path.getsize(output_md) == 0:
        raise RuntimeError("❌ Markdown file was not created or is empty!")

    # Load documents; DirectoryLoader needs the optional `unstructured`
    # dependency, so fall back to the plain TextLoader when it fails.
    try:
        loader = DirectoryLoader(md_directory, glob="**/*.md", show_progress=True)
        documents = loader.load()
    except Exception:
        print("⚠️ DirectoryLoader failed, falling back to TextLoader...")
        documents = TextLoader(output_md, encoding='utf-8').load()

    if not documents:
        raise RuntimeError("❌ No documents loaded from markdown!")

    # Split documents into overlapping chunks for retrieval.
    splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
    docs = splitter.split_documents(documents)
    print(f"✅ Loaded and split {len(docs)} chunks.")

    # Embedding (downloads the FastEmbed model on first use).
    embeddings = FastEmbedEmbeddings()  # type: ignore

    # Create vector store persisted on local disk.
    print("📦 Creating Qdrant vector DB...")
    qdrant = Qdrant.from_documents(
        documents=docs,
        embedding=embeddings,
        path=os.path.join("data", "local_qdrant"),
        collection_name=collection_name,
    )
    print("✅ Vector DB created successfully.")
    return qdrant