# multimodal/src/utils/ingest_text.py
import os
import pickle
from typing import List

from llama_parse import LlamaParse
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.directory import DirectoryLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores.qdrant import Qdrant
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
import nltk
import nest_asyncio
from dotenv import load_dotenv

# Setup
nltk.download('punkt')
nest_asyncio.apply()

# Load environment variables
load_dotenv()
# Environment keys
llamaparse_api_key = os.getenv("LLAMA_CLOUD_API_KEY")
groq_api_key = os.getenv("GROQ_API_KEY")
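# Example .env file this module expects (placeholder values, not real keys):
#   LLAMA_CLOUD_API_KEY=<your LlamaCloud API key>
#   GROQ_API_KEY=<your Groq API key>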
# Paths
parsed_data_file = os.path.join("data", "parsed_data.pkl")
output_md = os.path.join("data", "output.md")
md_directory = "data"
collection_name = "rag"
# Helper: Load or parse PDF
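# Note: parsed results are cached in data/parsed_data.pkl regardless of which
# pdf_path is passed; delete that file to force re-parsing a different PDF.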
def load_or_parse_data(pdf_path):
    if os.path.exists(parsed_data_file):
        with open(parsed_data_file, "rb") as f:
            parsed_data = pickle.load(f)
    else:
        parsing_instruction = """The provided document is a user guide or manual.
It contains many images and tables. Be precise while answering questions."""
        parser = LlamaParse(
            api_key=llamaparse_api_key,
            result_type="markdown",
            parsing_instruction=parsing_instruction,
        )  # type: ignore
        parsed_data = parser.load_data(pdf_path)
        with open(parsed_data_file, "wb") as f:
            pickle.dump(parsed_data, f)
    return parsed_data
# Main vector DB builder
def create_vector_database(pdf_path):
    print("🧠 Starting vector DB creation...")
    parsed_docs = load_or_parse_data(pdf_path)
    if not parsed_docs:
        raise ValueError("❌ No parsed documents returned from LlamaParse!")

    os.makedirs(md_directory, exist_ok=True)

    # Write Markdown content to file (overwrite)
    with open(output_md, 'w', encoding='utf-8') as f:
        for doc in parsed_docs:
            if hasattr(doc, "text") and doc.text.strip():
                f.write(doc.text.strip() + "\n\n")

    # Ensure .md file was written
    if not os.path.exists(output_md) or os.path.getsize(output_md) == 0:
        raise RuntimeError("❌ Markdown file was not created or is empty!")

    # Load documents
    try:
        loader = DirectoryLoader(md_directory, glob="**/*.md", show_progress=True)
        documents = loader.load()
    except Exception as e:
        print(f"⚠️ DirectoryLoader failed ({e}), falling back to TextLoader...")
        documents = TextLoader(output_md, encoding='utf-8').load()
    if not documents:
        raise RuntimeError("❌ No documents loaded from markdown!")

    # Split documents
    splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
    docs = splitter.split_documents(documents)
    print(f"✅ Loaded and split {len(docs)} chunks.")

    # Embedding
    embeddings = FastEmbedEmbeddings()  # type: ignore

    # Create vector store
    print("📦 Creating Qdrant vector DB...")
    qdrant = Qdrant.from_documents(
        documents=docs,
        embedding=embeddings,
        path=os.path.join("data", "local_qdrant"),
        collection_name=collection_name,
    )
    print("✅ Vector DB created successfully.")
    return qdrant
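

# Minimal usage sketch, assuming a PDF at the hypothetical path data/manual.pdf
# and a valid LLAMA_CLOUD_API_KEY in the environment; adjust before running.
if __name__ == "__main__":
    sample_pdf = os.path.join("data", "manual.pdf")  # hypothetical input file
    vector_store = create_vector_database(sample_pdf)

    # Quick sanity check: retrieve the chunks most similar to a test query.
    for hit in vector_store.similarity_search("How do I get started?", k=3):
        print(hit.page_content[:200], "\n---")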