# am_text_summary/train/hdf5_file_manager.py
import os

import h5py


class HDF5TextManager:
    """Stores lists of sentences in an HDF5 file as fixed-size chunks."""

    def __init__(self, file_name, chunk_size=1000):
        self.chunk_size = chunk_size
        self.file_name = file_name

    def save(self, sentences):
        # Split the sentence list into fixed-size chunks.
        chunks = [
            sentences[i:i + self.chunk_size]
            for i in range(0, len(sentences), self.chunk_size)
        ]
        with h5py.File(self.file_name, 'w') as hdf:
            dtype = h5py.string_dtype(encoding='utf-8')  # Variable-length UTF-8 strings
            max_shape = (None, self.chunk_size)          # Unlimited rows for dynamic growth
            chunk_shape = (1, self.chunk_size)           # One row of sentences per HDF5 chunk
            dataset = hdf.create_dataset(
                'text_data',
                shape=(len(chunks), self.chunk_size),  # One row per sentence chunk
                dtype=dtype,
                maxshape=max_shape,
                chunks=chunk_shape,
                compression="gzip",
            )
            # Write each chunk, padding the last one with empty strings
            # so every row has exactly chunk_size entries.
            for i, chunk in enumerate(chunks):
                padded_chunk = chunk + [""] * (self.chunk_size - len(chunk))
                dataset[i] = padded_chunk
        print(f"Text data has been stored in {self.file_name} with sentence chunks.")

    def combine_hdf5_files(self, hdf5_files):
        # Copy every dataset from each input file into a group of the
        # combined file named after that input file.
        with h5py.File(self.file_name, 'a') as combined_h5f:
            for hdf5_file in hdf5_files:
                with h5py.File(hdf5_file, 'r') as individual_h5f:
                    # Use the base filename (without extension) as the group
                    # name; os.path.splitext is safer than split('.') for
                    # paths that contain dots elsewhere.
                    group_name = os.path.splitext(os.path.basename(hdf5_file))[0]
                    group = combined_h5f.create_group(group_name)
                    dt = h5py.string_dtype(encoding='utf-8')
                    for dataset_name in individual_h5f.keys():
                        data = individual_h5f[dataset_name][:]
                        group.create_dataset(dataset_name, data=data, dtype=dt)
        print(f"All HDF5 files combined into {self.file_name}")