# File size: 1,081 Bytes
# Revision ids shown by the file viewer (presumably commits): 826253d, 026aeba
from langchain.text_splitter import RecursiveCharacterTextSplitter
import hashlib

def chunk_documents(dataset, chunk_size: int = 1000, chunk_overlap: int = 200):
    """Split every document in *dataset* into de-duplicated text chunks.

    Args:
        dataset: Iterable of records. Each record is indexed with
            ``'documents'`` (an iterable of texts to split) and, whenever a
            chunk is kept, with ``'question'`` (used to label that chunk).
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter``.

    Returns:
        A list of ``{'text': chunk, 'source': '<question>_chunk_<n>'}`` dicts
        in the order the chunks were produced. A chunk whose text was already
        emitted earlier (for any record) is skipped. ``<n>`` counts the chunks
        of one record across all of its documents, so two kept chunks of the
        same record never share a ``source`` label.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    documents = []
    seen_hashes = set()  # SHA-256 digests of chunk texts already emitted

    for data in dataset:
        # Number chunks per record rather than per document: restarting the
        # index for each document would give different chunks of the same
        # record identical labels (e.g. several "<question>_chunk_0").
        record_chunks = (
            chunk
            for text in data['documents']
            for chunk in text_splitter.split_text(text)
        )
        for i, chunk in enumerate(record_chunks):
            chunk_hash = hashlib.sha256(chunk.encode()).hexdigest()

            # Identical text seen before: drop it. The index is still
            # consumed, so labels of the remaining chunks do not shift.
            if chunk_hash in seen_hashes:
                continue

            documents.append({'text': chunk, 'source': f"{data['question']}_chunk_{i}"})
            seen_hashes.add(chunk_hash)

    return documents