# multimodal/src/utils/ingest_text.py
import os
import pickle
from typing import List

from llama_parse import LlamaParse
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.directory import DirectoryLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores.qdrant import Qdrant
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
import nltk
import nest_asyncio
from dotenv import load_dotenv

# Setup
nltk.download('punkt')
nest_asyncio.apply()

# Load environment variables
load_dotenv()
# Environment keys
llamaparse_api_key = os.getenv("LLAMA_CLOUD_API_KEY")
groq_api_key = os.getenv("GROQ_API_KEY")
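# Example .env file this module expects (placeholder values, not real keys):
#   LLAMA_CLOUD_API_KEY=<your LlamaCloud API key>
#   GROQ_API_KEY=<your Groq API key>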
# Paths
parsed_data_file = os.path.join("data", "parsed_data.pkl")
output_md = os.path.join("data", "output.md")
md_directory = "data"
collection_name = "rag"
# Helper: Load or parse PDF
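# Note: parsed results are cached in data/parsed_data.pkl regardless of which
# pdf_path is passed; delete that file to force re-parsing a different PDF.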
def load_or_parse_data(pdf_path):
    if os.path.exists(parsed_data_file):
        with open(parsed_data_file, "rb") as f:
            parsed_data = pickle.load(f)
    else:
        parsing_instruction = """The provided document is a user guide or manual.
It contains many images and tables. Be precise while answering questions."""
        parser = LlamaParse(
            api_key=llamaparse_api_key,
            result_type="markdown",
            parsing_instruction=parsing_instruction,
        )  # type: ignore
        parsed_data = parser.load_data(pdf_path)
        with open(parsed_data_file, "wb") as f:
            pickle.dump(parsed_data, f)
    return parsed_data
# Main vector DB builder
def create_vector_database(pdf_path):
    print("🧠 Starting vector DB creation...")
    parsed_docs = load_or_parse_data(pdf_path)
    if not parsed_docs:
        raise ValueError("❌ No parsed documents returned from LlamaParse!")

    os.makedirs(md_directory, exist_ok=True)

    # Write Markdown content to file (overwrite)
    with open(output_md, 'w', encoding='utf-8') as f:
        for doc in parsed_docs:
            if hasattr(doc, "text") and doc.text.strip():
                f.write(doc.text.strip() + "\n\n")

    # Ensure .md file was written
    if not os.path.exists(output_md) or os.path.getsize(output_md) == 0:
        raise RuntimeError("❌ Markdown file was not created or is empty!")

    # Load documents
    try:
        loader = DirectoryLoader(md_directory, glob="**/*.md", show_progress=True)
        documents = loader.load()
    except Exception as e:
        print(f"⚠️ DirectoryLoader failed ({e}), falling back to TextLoader...")
        documents = TextLoader(output_md, encoding='utf-8').load()
    if not documents:
        raise RuntimeError("❌ No documents loaded from markdown!")

    # Split documents
    splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
    docs = splitter.split_documents(documents)
    print(f"✅ Loaded and split {len(docs)} chunks.")

    # Embedding
    embeddings = FastEmbedEmbeddings()  # type: ignore

    # Create vector store
    print("📦 Creating Qdrant vector DB...")
    qdrant = Qdrant.from_documents(
        documents=docs,
        embedding=embeddings,
        path=os.path.join("data", "local_qdrant"),
        collection_name=collection_name,
    )
    print("✅ Vector DB created successfully.")
    return qdrant
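

# Minimal usage sketch, assuming a PDF at the hypothetical path data/manual.pdf
# and a valid LLAMA_CLOUD_API_KEY in the environment; adjust before running.
if __name__ == "__main__":
    sample_pdf = os.path.join("data", "manual.pdf")  # hypothetical input file
    vector_store = create_vector_database(sample_pdf)

    # Quick sanity check: retrieve the chunks most similar to a test query.
    for hit in vector_store.similarity_search("How do I get started?", k=3):
        print(hit.page_content[:200], "\n---")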