import os

import h5py


class HDF5TextManager:
    """Stores lists of sentences in an HDF5 file as fixed-size chunks,
    and merges multiple HDF5 files into one file with a group per file."""

    def __init__(self, file_name, chunk_size=1000):
        self.file_name = file_name
        self.chunk_size = chunk_size

    def save(self, sentences):
        # Split the sentence list into chunks of at most `chunk_size` items.
        chunks = [
            sentences[i:i + self.chunk_size]
            for i in range(0, len(sentences), self.chunk_size)
        ]
        with h5py.File(self.file_name, 'w') as hdf:
            # Variable-length, UTF-8-encoded strings.
            dtype = h5py.string_dtype(encoding='utf-8')
            dataset = hdf.create_dataset(
                'text_data',
                shape=(len(chunks), self.chunk_size),  # one row per chunk
                dtype=dtype,
                maxshape=(None, self.chunk_size),  # resizable along the first axis
                chunks=(1, self.chunk_size),       # one HDF5 chunk per row of sentences
                compression="gzip",
            )
            # Write each chunk as one row, padding the final chunk with empty
            # strings so every row holds exactly `chunk_size` entries.
            for i, chunk in enumerate(chunks):
                padded_chunk = chunk + [""] * (self.chunk_size - len(chunk))
                dataset[i] = padded_chunk
        print(f"Text data has been stored in {self.file_name} with sentence chunks.")

    def combine_hdf5_files(self, hdf5_files):
        with h5py.File(self.file_name, 'a') as combined_h5f:
            for hdf5_file in hdf5_files:
                with h5py.File(hdf5_file, 'r') as individual_h5f:
                    # Use the file name (without directory or extension) as the
                    # group name; os.path.splitext is safer than split('.') for
                    # paths that contain dots or directory components.
                    group_name = os.path.splitext(os.path.basename(hdf5_file))[0]
                    group = combined_h5f.create_group(group_name)
                    # Copy every top-level dataset into the new group as
                    # variable-length UTF-8 strings.
                    dt = h5py.string_dtype(encoding='utf-8')
                    for dataset_name in individual_h5f.keys():
                        data = individual_h5f[dataset_name][:]
                        group.create_dataset(dataset_name, data=data, dtype=dt)
        print(f"All HDF5 files combined into {self.file_name}")
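
# A minimal usage sketch, assuming plain Python string lists and the
# hypothetical file names "part1.h5", "part2.h5", and "combined.h5" (none of
# these come from the class itself). Note that h5py returns stored strings as
# bytes, so values are decoded and the empty-string padding is filtered out.
if __name__ == "__main__":
    sentences = [f"This is sentence number {n}." for n in range(2500)]

    # Write the sentences into one file, 1000 per chunk: three rows,
    # the last one padded with empty strings.
    manager = HDF5TextManager("part1.h5", chunk_size=1000)
    manager.save(sentences)

    # Read the chunks back, decode, and drop the padding.
    with h5py.File("part1.h5", "r") as hdf:
        rows = hdf["text_data"][:]
        restored = [s.decode("utf-8") for row in rows for s in row if s]
    assert restored == sentences

    # Merge previously written files into one, with one group per source file
    # (groups "part1" and "part2" in this example).
    manager2 = HDF5TextManager("part2.h5", chunk_size=1000)
    manager2.save([f"Another sentence {n}." for n in range(500)])
    combined = HDF5TextManager("combined.h5")
    combined.combine_hdf5_files(["part1.h5", "part2.h5"])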