train code added
Files changed:
- train/data_loader.py       +40 -0
- train/fast_text_trainer.py +30 -0
- train/hdf5_file_manager.py +43 -0
- train/main.py              +28 -0
- train/text_analyzer.py     +23 -0
train/data_loader.py
ADDED

import h5py
import re


class DataLoader:
    def __init__(self, file_name, chunk_size=10):
        self.file_name = file_name
        self.chunk_size = chunk_size

    def load_txt(self):
        with open(self.file_name + ".txt", 'r', encoding='utf-8') as file:
            text_data = file.read()
        # Split into sentences on Ethiopic and Latin sentence delimiters
        delimiters = r'፡፡|::|:|\.|\?|!'
        sentences = re.split(delimiters, text_data)
        sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
        return sentences

    def load_hdf5(self, start_index=0):
        dataset_name = 'text_data'
        with h5py.File(self.file_name + '.h5', 'r') as hdf:
            dataset = hdf[dataset_name]
            if not isinstance(dataset, h5py.Dataset):
                raise TypeError(f"Dataset '{dataset_name}' is not a valid h5py.Dataset.")
            total_chunks = dataset.shape[0]  # number of chunks in the dataset
            end_index = min(start_index + self.chunk_size, total_chunks)
            chunk_data = dataset[start_index:end_index]  # retrieves a window of sentence chunks
            chunk_data_decoded = []
            for chunk in chunk_data:
                # Decode each stored sentence, skipping the empty-string padding added at save time
                chunk_data_decoded.extend(sentence.decode('utf-8') for sentence in chunk if sentence)
        return total_chunks, chunk_data_decoded, end_index

    def save_fast_text_file(self, file_name, sentences):
        # Write one sentence per line, each terminated with the Ethiopic full stop
        with open(file_name, "w", encoding="utf-8") as f:
            for sentence in sentences:
                sentence = sentence.strip()
                if not sentence.endswith("።"):
                    sentence += "።"
                f.write(sentence + "\n")
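
A minimal usage sketch for the chunked reader, assuming a GPAC.h5 file written by HDF5TextManager already exists (the corpus name follows main.py below). Each call returns the index to resume from, which the loop feeds back in as the next start_index:

from data_loader import DataLoader

loader = DataLoader('GPAC', chunk_size=10)   # opens GPAC.h5
start = 0
while True:
    total, sentences, start = loader.load_hdf5(start_index=start)
    print(f"{len(sentences)} sentences; next start index {start} of {total} chunks")
    if start >= total:
        break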

train/fast_text_trainer.py
ADDED

import fasttext


class FastTextTrainer:
    def __init__(self, corpus_file):
        self.corpus_file = corpus_file
        self.model_file = "fasttext_model.bin"
        self.model = None

    def train_model(self, model_type="skipgram", dim=100, epoch=5, lr=0.05, thread=4):
        print("Training FastText model...")
        self.model = fasttext.train_unsupervised(
            input=self.corpus_file,
            model=model_type,
            dim=dim,
            epoch=epoch,
            lr=lr,
            thread=thread
        )
        self.model.save_model(self.model_file)
        print(f"Model trained and saved to {self.model_file}")

    def load_model(self):
        print(f"Loading FastText model from {self.model_file}...")
        self.model = fasttext.load_model(self.model_file)
        print("Model loaded successfully.")

    def get_word_vector(self, word):
        if self.model is None:
            raise ValueError("Model not loaded. Use `train_model` or `load_model` first.")
        return self.model.get_word_vector(word)
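
A short usage sketch, assuming GPAC_fast.txt was produced by DataLoader.save_fast_text_file; the Amharic query word is an arbitrary example:

from fast_text_trainer import FastTextTrainer

trainer = FastTextTrainer('GPAC_fast.txt')
trainer.train_model(dim=100, epoch=5)        # writes fasttext_model.bin
vec = trainer.get_word_vector('ኢትዮጵያ')       # arbitrary example word
print(vec.shape)                             # (100,) for dim=100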

train/hdf5_file_manager.py
ADDED

import h5py


class HDF5TextManager:
    def __init__(self, file_name, chunk_size=1000):
        self.chunk_size = chunk_size
        self.file_name = file_name

    def save(self, sentences):
        # Group the sentences into fixed-size chunks
        chunks = [
            sentences[i:i + self.chunk_size]
            for i in range(0, len(sentences), self.chunk_size)
        ]
        with h5py.File(self.file_name, 'w') as hdf:
            # Create a dataset for storing sentence chunks
            dtype = h5py.string_dtype(encoding='utf-8')  # UTF-8 encoding for the sentences
            max_shape = (None, self.chunk_size)   # unlimited rows for dynamic growth
            chunk_shape = (1, self.chunk_size)    # one chunk of sentences per HDF5 chunk
            dataset = hdf.create_dataset(
                'text_data',
                shape=(len(chunks), self.chunk_size),  # number of chunks x sentences per chunk
                dtype=dtype,
                maxshape=max_shape,                    # unlimited size along the first axis
                chunks=chunk_shape,                    # chunking along the first axis
                compression="gzip"
            )
            # Write each chunk of sentences into the dataset, padding the last one
            for i, chunk in enumerate(chunks):
                padded_chunk = chunk + [""] * (self.chunk_size - len(chunk))
                dataset[i] = padded_chunk
        print(f"Text data has been stored in {self.file_name} with sentence chunks.")

    def combine_hdf5_files(self, hdf5_files):
        with h5py.File(self.file_name, 'a') as combined_h5f:
            for hdf5_file in hdf5_files:
                with h5py.File(hdf5_file, 'r') as individual_h5f:
                    # Use the file name (without extension) as the group name
                    group_name = hdf5_file.split('.')[0]
                    group = combined_h5f.create_group(group_name)
                    for dataset_name in individual_h5f.keys():
                        data = individual_h5f[dataset_name][:]
                        dt = h5py.string_dtype(encoding='utf-8')
                        group.create_dataset(dataset_name, data=data, dtype=dt)
        print(f"All HDF5 files combined into {self.file_name}")
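
A round-trip sketch tying the two classes together, assuming a raw GPAC.txt corpus sits next to the scripts:

from data_loader import DataLoader
from hdf5_file_manager import HDF5TextManager

loader = DataLoader('GPAC')                  # will open GPAC.txt
sentences = loader.load_txt()                # split the raw corpus into sentences

manager = HDF5TextManager('GPAC.h5', chunk_size=1000)
manager.save(sentences)                      # writes the padded 'text_data' dataset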

train/main.py
ADDED

from hdf5_file_manager import HDF5TextManager
from text_analyzer import TextAnalyzer
from data_loader import DataLoader
from fast_text_trainer import FastTextTrainer

filename = 'GPAC.txt'
total_size = 2005

# data_loader = DataLoader('GPAC', chunk_size=2005)
# total, sentences, index = data_loader.load_hdf5()
# print(total)
# print(sentences)
# print(f"sentence count {len(sentences)}")
# text_analyzer = TextAnalyzer(sentences)
# print(f"before space word count {len(text_analyzer.get_tokens())}")
# cleaned_sentences = text_analyzer.get_sentences()
# data_loader.save_fast_text_file('GPAC_fast.txt', cleaned_sentences)
# print(f"after space word count {len(text_analyzer.get_tokens())}")
# manager = HDF5TextManager('GPAC.h5', chunk_size=1000)
# manager.save(sentences)

# Read the combined HDF5 file
# manager.read_hdf5_file("combined.h5")

fast_text = FastTextTrainer('GPAC_fast.txt')
fast_text.train_model(dim=300, epoch=10, thread=52)

# Initialize the summarizer
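
After this script runs, the embeddings can be queried straight from the saved binary. A minimal sketch, with an arbitrary Amharic example word:

import fasttext

model = fasttext.load_model('fasttext_model.bin')
print(len(model.words))                      # vocabulary size learned from the corpus
print(model.get_word_vector('ሰላም')[:5])      # first dimensions of one 300-d vector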

train/text_analyzer.py
ADDED

import re


class TextAnalyzer:
    def __init__(self, sentences):
        self.sentences = sentences
        self.clean_sentences()

    def get_tokens(self):
        words = [word for sentence in self.sentences for word in sentence.split()]
        return words

    def get_sentences(self):
        return self.sentences

    def clean_sentences(self):
        cleaned_sentences = []
        for sentence in self.sentences:
            # Remove specific punctuation marks (Ethiopic comma and parentheses)
            sentence = re.sub(r'[፣,)(]', '', sentence)
            # Remove extra spaces
            sentence = re.sub(r'\s+', ' ', sentence).strip()
            cleaned_sentences.append(sentence)
        self.sentences = cleaned_sentences
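
A minimal sketch of the cleaning behaviour on hypothetical Amharic inputs:

from text_analyzer import TextAnalyzer

raw = ["ሰላም ፣ ዓለም (ሙከራ)", "ሁለተኛ    ዓረፍተ ነገር"]   # hypothetical inputs
analyzer = TextAnalyzer(raw)                      # cleaning runs in the constructor
print(analyzer.get_sentences())                   # punctuation stripped, spaces collapsed
print(analyzer.get_tokens())                      # whitespace-delimited tokens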