# Spaces: Running  (Hugging Face Spaces status banner captured during page
# export — not code; commented out so this file parses as Python.)
import re

import h5py
class DataLoader:
    """Load sentence data from text/HDF5 files and write fastText-style output.

    The loader works with Ethiopic (e.g. Amharic) text: sentences are
    delimited by the Ethiopic full stop "፡፡" as well as "::", ":", ".",
    "?" and "!".

    Parameters
    ----------
    file_name : str
        Base path of the data file WITHOUT extension; ".txt" or ".h5" is
        appended by the load methods.
    chunk_size : int, optional
        Number of HDF5 rows fetched per `load_hdf5` call. NOTE(review): it
        is also used to cap the sentences taken from each row — presumably
        rows hold `chunk_size` sentences each; confirm against the writer
        of the .h5 file.
    """

    def __init__(self, file_name, chunk_size=10):
        self.file_name = file_name
        self.chunk_size = chunk_size

    def load_txt(self):
        """Read ``<file_name>.txt`` and split it into sentences.

        Returns
        -------
        list[str]
            Stripped, non-empty sentence fragments, split on the Ethiopic
            full stop "፡፡", "::", ":", ".", "?" or "!".
        """
        with open(self.file_name + ".txt", 'r', encoding='utf-8') as file:
            text_data = file.read()
        # "::" must precede ":" in the alternation so the double colon is
        # consumed as one delimiter instead of two.
        delimiters = r'፡፡|::|:|\.|\?|!'
        sentences = re.split(delimiters, text_data)
        return [sentence.strip() for sentence in sentences if sentence.strip()]

    def load_hdf5(self, start_index=0):
        """Read one chunk of UTF-8 sentences from ``<file_name>.h5``.

        The file is expected to contain a 2-D dataset named ``text_data``
        of byte strings: rows are chunks, columns are sentences.

        Parameters
        ----------
        start_index : int, optional
            Row index to start reading from.

        Returns
        -------
        tuple[int, list[str], int]
            ``(total_rows, decoded_sentences, end_index)`` where
            ``end_index`` is the exclusive row bound actually read —
            pass it back as the next ``start_index`` to page through.

        Raises
        ------
        KeyError
            If ``text_data`` is missing from the file.
        TypeError
            If ``text_data`` is not an ``h5py.Dataset``.
        ValueError
            If the dataset has no ``shape`` attribute.
        """
        dataset_name = 'text_data'
        with h5py.File(self.file_name + '.h5', 'r') as hdf:
            dataset = hdf[dataset_name]
            if not isinstance(dataset, h5py.Dataset):
                raise TypeError(f"Dataset '{dataset_name}' is not a valid h5py.Dataset.")
            if not hasattr(dataset, 'shape'):
                raise ValueError(f"Dataset '{dataset_name}' does not have a shape attribute.")
            total_chunks = dataset.shape[0]  # number of rows (chunks) in the dataset
            end_index = min(start_index + self.chunk_size, total_chunks)
            # Slice while the file is still open: reading from a closed
            # h5py file raises, and slicing copies the data into memory.
            chunk_data = dataset[start_index:end_index]
        chunk_data_decoded = []
        for chunk in chunk_data:
            # Take at most chunk_size sentences per row and decode the raw
            # byte strings stored by h5py back to str.
            chunk_data_decoded.extend(
                sentence.decode('utf-8') for sentence in chunk[:self.chunk_size]
            )
        return total_chunks, chunk_data_decoded, end_index

    def save_fast_text_file(self, file_name, sentences):
        """Write *sentences* to *file_name*, one per line, for fastText.

        Each sentence is stripped and terminated with the Ethiopic full
        stop "።" if it does not already end with one.

        Parameters
        ----------
        file_name : str
            Full output path (extension included — unlike the loaders,
            nothing is appended).
        sentences : iterable[str]
            Sentences to write.
        """
        with open(file_name, "w", encoding="utf-8") as f:
            for sentence in sentences:
                sentence = sentence.strip()
                if not sentence.endswith("።"):
                    sentence += "።"
                f.write(sentence + "\n")