# Ingest text: parse PDFs, chunk the pages, and store the chunks in Chroma
import os
import pickle

import chromadb
import nest_asyncio
from llama_parse import LlamaParse
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import CharacterTextSplitter

# LlamaParse drives an async event loop internally; nest_asyncio lets it run
# inside environments (e.g. notebooks) that already have a loop running.
nest_asyncio.apply()

path = "mm_vdb2"
client = chromadb.PersistentClient(path=path)

# Read API keys from the environment rather than hardcoding secrets in source
llamaparse_api_key = os.getenv("LLAMA_CLOUD_API_KEY")
groq_api_key = os.getenv("GROQ_API_KEY")

parsed_data_file = r"parsed_data.pkl"  # cache for LlamaParse output
output_md = r"output.md"
loki = r"data"  # directory containing the source documents

# Define a function to load parsed data if available, or parse if not
def load_or_parse_data(loc):
    data_file = parsed_data_file

    if os.path.exists(data_file):
        # Load the parsed data from the file
        with open(data_file, "rb") as f:
            parsed_data = pickle.load(f)
    else:
        # Perform the parsing step and store the result in llama_parse_documents
        parsing_instruction = """The provided document is a user guide or manual.
        It contains many images and tables.
        Be precise when answering questions."""
        parser = LlamaParse(
            api_key=llamaparse_api_key,
            result_type="markdown",
            parsing_instruction=parsing_instruction,
        )
        llama_parse_documents = parser.load_data(loc)

        # Save the parsed data to a file
        with open(data_file, "wb") as f:
            pickle.dump(llama_parse_documents, f)

        # Return the freshly parsed documents
        parsed_data = llama_parse_documents

    return parsed_data
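
# Usage sketch (hedged): the file name below is hypothetical. The first call
# parses via LlamaParse and caches the result to parsed_data.pkl; subsequent
# runs load the pickle instead of re-parsing.
#
#     documents = load_or_parse_data(os.path.join(loki, "manual.pdf"))
#     print(documents[0].text[:200])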


# Create vector database
def create_vector_database(loc):
    """

    Creates a vector database using document loaders and embeddings.



    This function loads urls,

    splits the loaded documents into chunks, transforms them into embeddings using OllamaEmbeddings,

    and finally persists the embeddings into a Chroma vector database.

    """
    # loc is the path of the PDF to ingest
    loader = PyMuPDFLoader(file_path=loc)
    docs = loader.load()  # one Document per PDF page

    print(f"Number of documents: {len(docs)}")

    print("Vector DB started!")

    # Initialize a list for document content and IDs
    document_contents = []
    ids = []

    # Create the text splitter once, outside the loop (~1000-char chunks with
    # 200 characters of overlap)
    text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200)

    # Walk the pages, chunk each one, and build page-based chunk IDs
    for i, doc in enumerate(docs):
        # Print metadata to understand its structure
        print(f"Metadata for document {i+1}: {doc.metadata}")

        # PyMuPDFLoader stores the zero-based page index under the "page"
        # metadata key; fall back to the loop index if the key is absent
        page_num = doc.metadata.get('page', i)

        # Extract text from each page
        page_content = doc.page_content  # Get the content of the page

        # Split the page content into chunks
        doc_chunks = text_splitter.split_text(page_content)

        # Add chunk contents and corresponding page-based IDs
        for chunk_idx, chunk in enumerate(doc_chunks):
            document_contents.append(chunk)  # Add the chunk content
            ids.append(f"page_{page_num}_chunk_{i+1}_{chunk_idx+1}")  # Add a unique chunk ID

    # Ensure the number of ids matches the number of documents (contents)
    assert len(ids) == len(document_contents), "Mismatch between number of ids and document contents"

    # Create or get the text collection
    text_collection = client.get_or_create_collection(name="text_collection")

    # Add the chunks; since no embeddings are supplied, Chroma computes them
    # with the collection's default embedding function
    text_collection.add(
        documents=document_contents,  # All the chunk-level content
        ids=ids  # Matching IDs for each chunk content
    )

    print('Vector DB created successfully!')
    return text_collection
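

if __name__ == "__main__":
    # Minimal end-to-end sketch. The PDF file name is an assumption made for
    # illustration only; point it at any PDF you actually have.
    pdf_path = os.path.join(loki, "manual.pdf")  # hypothetical example file
    collection = create_vector_database(pdf_path)

    # Query the collection; Chroma embeds the query text with the same
    # default embedding function it used at ingest time.
    results = collection.query(
        query_texts=["How do I reset the device?"],
        n_results=3,
    )
    for chunk_id, chunk in zip(results["ids"][0], results["documents"][0]):
        print(chunk_id, "->", chunk[:80])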