# am_text_summary/train/data_loader.py
"""Utilities for loading sentence data from plain-text and HDF5 files."""
import h5py
import re


class DataLoader:
    def __init__(self, file_name, chunk_size=10):
        self.file_name = file_name
        self.chunk_size = chunk_size

    def load_txt(self):
        """Read <file_name>.txt and split it into cleaned sentences."""
        with open(self.file_name + ".txt", 'r', encoding='utf-8') as file:
            text_data = file.read()
        # Split on sentence delimiters; '።' (Ethiopic full stop) is included
        # here since save_fast_text_file terminates sentences with it.
        delimiters = r'።|፡፡|::|:|\.|\?|!'
        sentences = re.split(delimiters, text_data)
        # Drop empty fragments and surrounding whitespace
        sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
        return sentences

    def load_hdf5(self, start_index=0):
        """Return (total_chunks, decoded_sentences, end_index) for a slice of chunks."""
        dataset_name = 'text_data'
        with h5py.File(self.file_name + '.h5', 'r') as hdf:
            dataset = hdf[dataset_name]
            if not isinstance(dataset, h5py.Dataset):
                raise TypeError(f"Dataset '{dataset_name}' is not a valid h5py.Dataset.")
            if not hasattr(dataset, 'shape'):
                raise ValueError(f"Dataset '{dataset_name}' does not have a shape attribute.")
            total_chunks = dataset.shape[0]         # number of chunks in the dataset
            sentences_per_chunk = dataset.shape[1]  # sentences per chunk (e.g., 10)
            end_index = min(start_index + self.chunk_size, total_chunks)
            chunk_data = dataset[start_index:end_index]  # read a slab of chunks
            # HDF5 stores the strings as bytes; decode each sentence back to UTF-8
            chunk_data_decoded = []
            for chunk in chunk_data:
                chunk_data_decoded.extend(
                    sentence.decode('utf-8') for sentence in chunk[:self.chunk_size]
                )
        return total_chunks, chunk_data_decoded, end_index

    def save_fast_text_file(self, file_name, sentences):
        """Write sentences to a fastText-style file, one sentence per line."""
        with open(file_name, "w", encoding="utf-8") as f:
            for sentence in sentences:
                sentence = sentence.strip()
                # Ensure each line ends with the Ethiopic full stop '።'
                if not sentence.endswith("።"):
                    sentence += "።"
                f.write(sentence + "\n")
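

# --- Usage sketch (names below are hypothetical, not from the original repo) ---
# A minimal end-to-end demo of the loader. load_hdf5 expects a 2-D bytes
# dataset named 'text_data' with shape (chunks, sentences_per_chunk), so
# build_demo_h5 writes one in that layout purely for illustration.
import numpy as np


def build_demo_h5(file_stem, sentences, sentences_per_chunk=10):
    """Pack sentences into a (chunks, sentences_per_chunk) bytes dataset."""
    # Pad so the sentence list reshapes cleanly into whole chunks
    padded = sentences + [""] * (-len(sentences) % sentences_per_chunk)
    encoded = np.array([s.encode("utf-8") for s in padded], dtype="S200")
    grid = encoded.reshape(-1, sentences_per_chunk)
    with h5py.File(file_stem + ".h5", "w") as hdf:
        hdf.create_dataset("text_data", data=grid)


if __name__ == "__main__":
    loader = DataLoader("demo_corpus", chunk_size=10)
    build_demo_h5("demo_corpus", ["ሰላም ለዓለም", "ኢትዮጵያ ታሪካዊ አገር ናት"])
    total_chunks, decoded, next_start = loader.load_hdf5(start_index=0)
    decoded = [s for s in decoded if s]  # drop the padding entries
    print(f"{total_chunks} chunk(s); sentences: {decoded}")
    loader.save_fast_text_file("fasttext_train.txt", decoded)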