import h5py
import os

class HDF5TextManager:
    def __init__(self, file_name, chunk_size=1000):
        self.file_name = file_name
        self.chunk_size = chunk_size

    def save(self, sentences):
        # Split the sentence list into fixed-size chunks
        chunks = [
            sentences[i:i + self.chunk_size]
            for i in range(0, len(sentences), self.chunk_size)
        ]
        with h5py.File(self.file_name, 'w') as hdf:
            # Variable-length UTF-8 strings for the sentences
            dtype = h5py.string_dtype(encoding='utf-8')
            max_shape = (None, self.chunk_size)  # unlimited rows for dynamic growth
            chunk_shape = (1, self.chunk_size)   # one row of sentences per HDF5 chunk
            dataset = hdf.create_dataset(
                'text_data',
                shape=(len(chunks), self.chunk_size),  # rows of chunk_size sentences
                dtype=dtype,
                maxshape=max_shape,
                chunks=chunk_shape,
                compression="gzip"
            )
            # Write each chunk, padding the last one so every row has chunk_size entries
            for i, chunk in enumerate(chunks):
                padded_chunk = chunk + [""] * (self.chunk_size - len(chunk))
                dataset[i] = padded_chunk
        print(f"Text data has been stored in {self.file_name} with sentence chunks.")

    def combine_hdf5_files(self, hdf5_files):
        with h5py.File(self.file_name, 'a') as combined_h5f:
            for hdf5_file in hdf5_files:
                with h5py.File(hdf5_file, 'r') as individual_h5f:
                    # Use the file name (without directory or extension) as the group name
                    group_name = os.path.splitext(os.path.basename(hdf5_file))[0]
                    group = combined_h5f.create_group(group_name)
                    # Copy every top-level dataset into the new group
                    for dataset_name in individual_h5f.keys():
                        data = individual_h5f[dataset_name][:]
                        dt = h5py.string_dtype(encoding='utf-8')
                        group.create_dataset(dataset_name, data=data, dtype=dt)
        print(f"All HDF5 files combined into {self.file_name}")