import h5py
import os

class HDF5TextManager:
    def __init__(self, file_name, chunk_size=1000):
        self.file_name = file_name
        self.chunk_size = chunk_size

    def save(self, sentences):
        # Split the sentence list into fixed-size chunks
        chunks = [
            sentences[i:i + self.chunk_size]
            for i in range(0, len(sentences), self.chunk_size)
        ]
        with h5py.File(self.file_name, 'w') as hdf:
            # Variable-length UTF-8 strings for the sentences
            dtype = h5py.string_dtype(encoding='utf-8')
            max_shape = (None, self.chunk_size)  # unlimited rows for dynamic growth
            chunk_shape = (1, self.chunk_size)   # one row of sentences per HDF5 chunk
            dataset = hdf.create_dataset(
                'text_data',
                shape=(len(chunks), self.chunk_size),  # rows of chunk_size sentences
                dtype=dtype,
                maxshape=max_shape,
                chunks=chunk_shape,
                compression="gzip"
            )
            # Write each chunk, padding the last one so every row has chunk_size entries
            for i, chunk in enumerate(chunks):
                padded_chunk = chunk + [""] * (self.chunk_size - len(chunk))
                dataset[i] = padded_chunk
        print(f"Text data has been stored in {self.file_name} with sentence chunks.")

    def combine_hdf5_files(self, hdf5_files):
        with h5py.File(self.file_name, 'a') as combined_h5f:
            for hdf5_file in hdf5_files:
                with h5py.File(hdf5_file, 'r') as individual_h5f:
                    # Use the file name (without directory or extension) as the group name
                    group_name = os.path.splitext(os.path.basename(hdf5_file))[0]
                    group = combined_h5f.create_group(group_name)
                    # Copy every top-level dataset into the new group
                    for dataset_name in individual_h5f.keys():
                        data = individual_h5f[dataset_name][:]
                        dt = h5py.string_dtype(encoding='utf-8')
                        group.create_dataset(dataset_name, data=data, dtype=dt)
        print(f"All HDF5 files combined into {self.file_name}")