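"""Store lists of sentences in HDF5 files as fixed-size chunks of
variable-length UTF-8 strings, and merge several such files into one."""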
import os

import h5py


class HDF5TextManager:
    def __init__(self, file_name, chunk_size=1000):
        self.chunk_size = chunk_size
        self.file_name = file_name

    def save(self, sentences):
        """Write a list of sentences to the file in fixed-size chunks."""
        # Split the sentences into chunks of at most chunk_size entries.
        chunks = [
            sentences[i:i + self.chunk_size]
            for i in range(0, len(sentences), self.chunk_size)
        ]
        with h5py.File(self.file_name, 'w') as hdf:
            # Store sentences as variable-length UTF-8 strings.
            dtype = h5py.string_dtype(encoding='utf-8')
            # Unlimited first axis so more chunks can be appended later.
            max_shape = (None, self.chunk_size)
            # One row of sentences per HDF5 chunk.
            chunk_shape = (1, self.chunk_size)

            dataset = hdf.create_dataset(
                'text_data',
                shape=(len(chunks), self.chunk_size),
                dtype=dtype,
                maxshape=max_shape,
                chunks=chunk_shape,
                compression="gzip",
            )
            # Write each chunk into its own row, padding the last chunk with
            # empty strings so every row holds exactly chunk_size entries.
            for i, chunk in enumerate(chunks):
                padded_chunk = chunk + [""] * (self.chunk_size - len(chunk))
                dataset[i] = padded_chunk
        print(f"Text data has been stored in {self.file_name} with sentence chunks.")


    def combine_hdf5_files(self, hdf5_files):
        """Copy the top-level datasets of each input file into a group here."""
        dtype = h5py.string_dtype(encoding='utf-8')
        with h5py.File(self.file_name, 'a') as combined_h5f:
            for hdf5_file in hdf5_files:
                with h5py.File(hdf5_file, 'r') as individual_h5f:
                    # Name the group after the base filename without its
                    # extension, e.g. 'data/part1.h5' -> 'part1'.
                    group_name = os.path.splitext(os.path.basename(hdf5_file))[0]
                    group = combined_h5f.create_group(group_name)
                    for dataset_name in individual_h5f.keys():
                        data = individual_h5f[dataset_name][:]
                        group.create_dataset(dataset_name, data=data, dtype=dtype)
        print(f"All HDF5 files combined into {self.file_name}")