# am_text_summary/train/data_loader.py
"""Utilities for loading sentence data from plain-text and HDF5 files."""
import h5py
import re


class DataLoader:
    def __init__(self, file_name, chunk_size=10):
        self.file_name = file_name
        self.chunk_size = chunk_size

    def load_txt(self):
        """Read <file_name>.txt and split it into cleaned sentences."""
        with open(self.file_name + ".txt", 'r', encoding='utf-8') as file:
            text_data = file.read()
        # Split on sentence delimiters; '።' (Ethiopic full stop) is included
        # here since save_fast_text_file terminates sentences with it.
        delimiters = r'።|፡፡|::|:|\.|\?|!'
        sentences = re.split(delimiters, text_data)
        # Drop empty fragments and surrounding whitespace
        sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
        return sentences

    def load_hdf5(self, start_index=0):
        """Return (total_chunks, decoded_sentences, end_index) for a slice of chunks."""
        dataset_name = 'text_data'
        with h5py.File(self.file_name + '.h5', 'r') as hdf:
            dataset = hdf[dataset_name]
            if not isinstance(dataset, h5py.Dataset):
                raise TypeError(f"Dataset '{dataset_name}' is not a valid h5py.Dataset.")
            if not hasattr(dataset, 'shape'):
                raise ValueError(f"Dataset '{dataset_name}' does not have a shape attribute.")
            total_chunks = dataset.shape[0]         # number of chunks in the dataset
            sentences_per_chunk = dataset.shape[1]  # sentences per chunk (e.g., 10)
            end_index = min(start_index + self.chunk_size, total_chunks)
            chunk_data = dataset[start_index:end_index]  # read a slab of chunks
            # HDF5 stores the strings as bytes; decode each sentence back to UTF-8
            chunk_data_decoded = []
            for chunk in chunk_data:
                chunk_data_decoded.extend(
                    sentence.decode('utf-8') for sentence in chunk[:self.chunk_size]
                )
        return total_chunks, chunk_data_decoded, end_index

    def save_fast_text_file(self, file_name, sentences):
        """Write sentences to a fastText-style file, one sentence per line."""
        with open(file_name, "w", encoding="utf-8") as f:
            for sentence in sentences:
                sentence = sentence.strip()
                # Ensure each line ends with the Ethiopic full stop '።'
                if not sentence.endswith("።"):
                    sentence += "።"
                f.write(sentence + "\n")
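

# --- Usage sketch (names below are hypothetical, not from the original repo) ---
# A minimal end-to-end demo of the loader. load_hdf5 expects a 2-D bytes
# dataset named 'text_data' with shape (chunks, sentences_per_chunk), so
# build_demo_h5 writes one in that layout purely for illustration.
import numpy as np


def build_demo_h5(file_stem, sentences, sentences_per_chunk=10):
    """Pack sentences into a (chunks, sentences_per_chunk) bytes dataset."""
    # Pad so the sentence list reshapes cleanly into whole chunks
    padded = sentences + [""] * (-len(sentences) % sentences_per_chunk)
    encoded = np.array([s.encode("utf-8") for s in padded], dtype="S200")
    grid = encoded.reshape(-1, sentences_per_chunk)
    with h5py.File(file_stem + ".h5", "w") as hdf:
        hdf.create_dataset("text_data", data=grid)


if __name__ == "__main__":
    loader = DataLoader("demo_corpus", chunk_size=10)
    build_demo_h5("demo_corpus", ["ሰላም ለዓለም", "ኢትዮጵያ ታሪካዊ አገር ናት"])
    total_chunks, decoded, next_start = loader.load_hdf5(start_index=0)
    decoded = [s for s in decoded if s]  # drop the padding entries
    print(f"{total_chunks} chunk(s); sentences: {decoded}")
    loader.save_fast_text_file("fasttext_train.txt", decoded)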