berito committed on
Commit a608bb4 · 1 Parent(s): 02b2abd

train code added

train/data_loader.py ADDED
@@ -0,0 +1,40 @@
+ import h5py
+ import re
+
+
+ class DataLoader:
+     def __init__(self, file_name, chunk_size=10):
+         self.file_name = file_name
+         self.chunk_size = chunk_size
+
+     def load_txt(self):
+         with open(self.file_name + ".txt", 'r', encoding='utf-8') as file:
+             text_data = file.read()
+             # Split into sentences on Ethiopic and Latin sentence delimiters
+             delimiters = r'፡፡|::|:|\.|\?|!'
+             sentences = re.split(delimiters, text_data)
+             sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
+             return sentences
+
+     def load_hdf5(self, start_index=0):
+         dataset_name = 'text_data'
+         with h5py.File(self.file_name + '.h5', 'r') as hdf:
+             dataset = hdf[dataset_name]
+             if isinstance(dataset, h5py.Dataset):
+                 if hasattr(dataset, 'shape'):
+                     total_chunks = dataset.shape[0]         # number of chunks in the dataset
+                     sentences_per_chunk = dataset.shape[1]  # sentences stored per chunk (e.g. 10)
+                 else:
+                     raise ValueError(f"Dataset '{dataset_name}' does not have a shape attribute.")
+             else:
+                 raise TypeError(f"Dataset '{dataset_name}' is not a valid h5py.Dataset.")
+             end_index = min(start_index + self.chunk_size, total_chunks)
+             chunk_data = dataset[start_index:end_index]  # retrieve a slice of chunks
+             chunk_data_decoded = []
+             for chunk in chunk_data:
+                 # Keep at most chunk_size sentences per chunk and decode the stored bytes
+                 chunk_data_decoded.extend([sentence.decode('utf-8') for sentence in chunk[:self.chunk_size]])
+             return total_chunks, chunk_data_decoded, end_index
+
+     def save_fast_text_file(self, file_name, sentences):
+         with open(file_name, "w", encoding="utf-8") as f:
+             for sentence in sentences:
+                 sentence = sentence.strip()
+                 # Ensure each sentence ends with the Ethiopic full stop before writing
+                 if not sentence.endswith("።"):
+                     sentence += "።"
+                 f.write(sentence + "\n")
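A minimal usage sketch for DataLoader (an editor's example, not part of the commit; it assumes GPAC.h5 was already produced by HDF5TextManager and reuses the file names from main.py):

from data_loader import DataLoader

# Assumes GPAC.h5 sits next to this script and holds the 'text_data' dataset.
loader = DataLoader('GPAC', chunk_size=10)
total_chunks, sentences, next_index = loader.load_hdf5(start_index=0)
print(f"{total_chunks} chunks on disk, {len(sentences)} sentences loaded, resume at {next_index}")

# Write the sentences in the one-sentence-per-line format FastText expects.
loader.save_fast_text_file('GPAC_fast.txt', sentences)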
train/fast_text_trainer.py ADDED
@@ -0,0 +1,30 @@
+ import fasttext
+
+
+ class FastTextTrainer:
+     def __init__(self, corpus_file):
+         self.corpus_file = corpus_file
+         self.model_file = "fasttext_model.bin"
+         self.model = None
+
+     def train_model(self, model_type="skipgram", dim=100, epoch=5, lr=0.05, thread=4):
+         print("Training FastText model...")
+         self.model = fasttext.train_unsupervised(
+             input=self.corpus_file,
+             model=model_type,
+             dim=dim,
+             epoch=epoch,
+             lr=lr,
+             thread=thread
+         )
+         self.model.save_model(self.model_file)
+         print(f"Model trained and saved to {self.model_file}")
+
+     def load_model(self):
+         print(f"Loading FastText model from {self.model_file}...")
+         self.model = fasttext.load_model(self.model_file)
+         print("Model loaded successfully.")
+
+     def get_word_vector(self, word):
+         if self.model is None:
+             raise ValueError("Model not loaded. Use `train_model` or `load_model` first.")
+         return self.model.get_word_vector(word)
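A short usage sketch for FastTextTrainer (editor's example; the corpus name mirrors main.py, the query word is an arbitrary sample, and get_nearest_neighbors is part of the fasttext Python API):

from fast_text_trainer import FastTextTrainer

trainer = FastTextTrainer('GPAC_fast.txt')   # corpus: one sentence per line, UTF-8
trainer.train_model(model_type="skipgram", dim=300, epoch=10, thread=4)

vector = trainer.get_word_vector("ኢትዮጵያ")    # 300-dimensional embedding
neighbors = trainer.model.get_nearest_neighbors("ኢትዮጵያ", k=5)
print(vector.shape, neighbors)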
train/hdf5_file_manager.py ADDED
@@ -0,0 +1,43 @@
+ import h5py
+
+
+ class HDF5TextManager:
+     def __init__(self, file_name, chunk_size=1000):
+         self.chunk_size = chunk_size
+         self.file_name = file_name
+
+     def save(self, sentences):
+         # Group the sentences into fixed-size chunks
+         chunks = [
+             sentences[i:i + self.chunk_size]
+             for i in range(0, len(sentences), self.chunk_size)
+         ]
+         with h5py.File(self.file_name, 'w') as hdf:
+             # Create a dataset for storing sentence chunks
+             dtype = h5py.string_dtype(encoding='utf-8')  # UTF-8 variable-length strings
+             max_shape = (None, self.chunk_size)          # unlimited rows for dynamic growth
+             chunk_shape = (1, self.chunk_size)           # one sentence chunk per HDF5 chunk
+
+             dataset = hdf.create_dataset('text_data',
+                                          shape=(len(chunks), self.chunk_size),  # rows = chunks, cols = sentences per chunk
+                                          dtype=dtype,
+                                          maxshape=max_shape,
+                                          chunks=chunk_shape,
+                                          compression="gzip")
+             # Write each chunk of sentences into the dataset, padding the last one
+             for i, chunk in enumerate(chunks):
+                 padded_chunk = chunk + [""] * (self.chunk_size - len(chunk))
+                 dataset[i] = padded_chunk
+         print(f"Text data has been stored in {self.file_name} with sentence chunks.")
+
+     def combine_hdf5_files(self, hdf5_files):
+         with h5py.File(self.file_name, 'a') as combined_h5f:
+             for hdf5_file in hdf5_files:
+                 with h5py.File(hdf5_file, 'r') as individual_h5f:
+                     group_name = hdf5_file.split('.')[0]  # file name without extension as group name
+                     group = combined_h5f.create_group(group_name)
+                     for dataset_name in individual_h5f.keys():
+                         data = individual_h5f[dataset_name][:]
+                         dt = h5py.special_dtype(vlen=str)
+                         group.create_dataset(dataset_name, data=data, dtype=dt)
+         print(f"All HDF5 files combined into {self.file_name}")
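The commented-out code in main.py calls manager.read_hdf5_file("combined.h5"), but HDF5TextManager does not define that method in this commit. A possible sketch of such a helper (hypothetical, assuming the group/dataset layout written by combine_hdf5_files above):

    def read_hdf5_file(self, combined_file):
        # Hypothetical helper, not part of the committed class: list every dataset
        # in a combined file and report its shape and dtype.
        with h5py.File(combined_file, 'r') as hdf:
            def show(name, obj):
                if isinstance(obj, h5py.Dataset):
                    print(f"{name}: shape={obj.shape}, dtype={obj.dtype}")
            hdf.visititems(show)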
train/main.py ADDED
@@ -0,0 +1,28 @@
+ from hdf5_file_manager import HDF5TextManager
+ from text_analyzer import TextAnalyzer
+ from data_loader import DataLoader
+ from fast_text_trainer import FastTextTrainer
+
+ filename = 'GPAC.txt'
+ total_size = 2005
+ # data_loader = DataLoader('GPAC', chunk_size=2005)
+ # total, sentences, index = data_loader.load_hdf5()
+ # print(total)
+ # print(sentences)
+ # print(f"sentence count {len(sentences)}")
+ # text_analyzer = TextAnalyzer(sentences)
+ # print(f"before space word count {len(text_analyzer.get_tokens())}")
+ # cleaned_sentences = text_analyzer.get_sentences()
+ # data_loader.save_fast_text_file('GPAC_fast.txt', cleaned_sentences)
+ # print(f"after space word count {len(text_analyzer.get_tokens())}")
+ # manager = HDF5TextManager('GPAC.h5', chunk_size=1000)
+ # manager.save(sentences)
+
+ # Read the combined HDF5 file
+ # manager.read_hdf5_file("combined.h5")
+ fast_text = FastTextTrainer('GPAC_fast.txt')
+ fast_text.train_model(dim=300, epoch=10, thread=52)
+
+ # Initialize the summarizer
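The commented-out lines above sketch the one-off preprocessing pass that produces GPAC_fast.txt. Run end to end it would look roughly like this (an editor's sketch, assuming GPAC.txt is the raw corpus in the working directory):

from data_loader import DataLoader
from text_analyzer import TextAnalyzer
from hdf5_file_manager import HDF5TextManager
from fast_text_trainer import FastTextTrainer

# Preprocessing: raw text -> sentences -> cleaned sentences -> HDF5 archive + FastText corpus
loader = DataLoader('GPAC', chunk_size=2005)
sentences = loader.load_txt()                      # split GPAC.txt into sentences
analyzer = TextAnalyzer(sentences)                 # strips ፣ , ( ) and extra whitespace
cleaned = analyzer.get_sentences()
HDF5TextManager('GPAC.h5', chunk_size=1000).save(cleaned)
loader.save_fast_text_file('GPAC_fast.txt', cleaned)

# Training, as in the live code above
FastTextTrainer('GPAC_fast.txt').train_model(dim=300, epoch=10, thread=52)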
train/text_analyzer.py ADDED
@@ -0,0 +1,23 @@
+ import re
+
+
+ class TextAnalyzer:
+     def __init__(self, sentences):
+         self.sentences = sentences
+         self.clean_sentences()
+
+     def get_tokens(self):
+         words = [word for sentence in self.sentences for word in sentence.split()]
+         return words
+
+     def get_sentences(self):
+         return self.sentences
+
+     def clean_sentences(self):
+         cleaned_sentences = []
+         for sentence in self.sentences:
+             # Remove specific punctuation marks (Ethiopic comma, Latin comma, parentheses)
+             sentence = re.sub(r'[፣,),(]', '', sentence)
+             # Collapse extra whitespace
+             sentence = re.sub(r'\s+', ' ', sentence).strip()
+             cleaned_sentences.append(sentence)
+         self.sentences = cleaned_sentences
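A tiny usage sketch for TextAnalyzer (editor's example; the sample sentences are made up for illustration):

from text_analyzer import TextAnalyzer

# Cleaning removes ፣ , ( ) and collapses extra spaces
analyzer = TextAnalyzer(["ሰላም  ፣ ዓለም (ምሳሌ)", "hello ,  world"])
print(analyzer.get_sentences())   # ['ሰላም ዓለም ምሳሌ', 'hello world']
print(analyzer.get_tokens())      # ['ሰላም', 'ዓለም', 'ምሳሌ', 'hello', 'world']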