import re

import h5py


class DataLoader:
    """Loads sentence data from .txt or .h5 files and writes fastText input files."""

    def __init__(self, file_name, chunk_size=10):
        self.file_name = file_name  # Path stem; ".txt" or ".h5" is appended per method.
        self.chunk_size = chunk_size  # Number of dataset rows to read per load_hdf5 call.

    def load_txt(self):
        """Read <file_name>.txt and split it into a list of stripped sentences."""
        with open(self.file_name + ".txt", "r", encoding="utf-8") as file:
            text_data = file.read()
        # Sentence delimiters: the Ethiopic full stop (።), a doubled Ethiopic
        # wordspace (፡፡), and the Latin equivalents. ። must be in the pattern
        # because save_fast_text_file uses it as the sentence terminator.
        delimiters = r"።|፡፡|::|:|\.|\?|!"
        sentences = re.split(delimiters, text_data)
        return [sentence.strip() for sentence in sentences if sentence.strip()]

    def load_hdf5(self, start_index=0):
        """Read up to chunk_size rows of sentences from <file_name>.h5.

        Returns (total_chunks, decoded_sentences, end_index) so the caller can
        resume from end_index on the next call.
        """
        dataset_name = "text_data"
        with h5py.File(self.file_name + ".h5", "r") as hdf:
            dataset = hdf[dataset_name]
            if not isinstance(dataset, h5py.Dataset):
                raise TypeError(f"'{dataset_name}' is not a valid h5py.Dataset.")
            if dataset.ndim != 2:
                raise ValueError(
                    f"Dataset '{dataset_name}' must be 2-D (chunks x sentences), "
                    f"got shape {dataset.shape}."
                )
            total_chunks = dataset.shape[0]  # Number of rows (chunks) in the dataset.
            end_index = min(start_index + self.chunk_size, total_chunks)
            chunk_data = dataset[start_index:end_index]  # Slice of up to chunk_size rows.
            # Decode every UTF-8 byte string in each row; the row length is
            # fixed by the dataset's second dimension, so no per-row slicing
            # is needed.
            chunk_data_decoded = []
            for row in chunk_data:
                chunk_data_decoded.extend(sentence.decode("utf-8") for sentence in row)
            return total_chunks, chunk_data_decoded, end_index

    def save_fast_text_file(self, file_name, sentences):
        """Write one sentence per line, each terminated with the Ethiopic full stop."""
        with open(file_name, "w", encoding="utf-8") as f:
            for sentence in sentences:
                sentence = sentence.strip()
                if not sentence.endswith("።"):
                    sentence += "።"
                f.write(sentence + "\n")
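

# Usage sketch showing how the two load paths feed save_fast_text_file.
# Assumptions: a corpus exists at "corpus.txt" / "corpus.h5" next to this
# module, and the output path "corpus_fasttext.txt" is hypothetical; only
# the file stem and paths below are illustrative, the API is as defined above.
if __name__ == "__main__":
    loader = DataLoader("corpus", chunk_size=10)

    # Plain-text path: split the raw file into sentences, then write them
    # one per line in fastText's unsupervised-training input format.
    sentences = loader.load_txt()
    loader.save_fast_text_file("corpus_fasttext.txt", sentences)

    # HDF5 path: iterate the dataset chunk by chunk, resuming each call
    # from the end_index returned by the previous one.
    all_sentences = []
    start = 0
    while True:
        total_chunks, decoded, start = loader.load_hdf5(start_index=start)
        all_sentences.extend(decoded)
        if start >= total_chunks:
            break
    loader.save_fast_text_file("corpus_fasttext.txt", all_sentences)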