berito committed on
Commit a608bb4 · 1 Parent(s): 02b2abd

train code added

train/data_loader.py ADDED
@@ -0,0 +1,40 @@
+ import h5py
+ import re
+
+
+ class DataLoader:
+     def __init__(self, file_name, chunk_size=10):
+         self.file_name = file_name
+         self.chunk_size = chunk_size
+
+     def load_txt(self):
+         with open(self.file_name + ".txt", 'r', encoding='utf-8') as file:
+             text_data = file.read()
+             # Split into sentences on Ethiopic and Latin sentence delimiters
+             delimiters = r'፡፡|::|:|\.|\?|!'
+             sentences = re.split(delimiters, text_data)
+             sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
+             return sentences
+
+     def load_hdf5(self, start_index=0):
+         dataset_name = 'text_data'
+         with h5py.File(self.file_name + '.h5', 'r') as hdf:
+             dataset = hdf[dataset_name]
+             if isinstance(dataset, h5py.Dataset):
+                 if hasattr(dataset, 'shape'):
+                     total_chunks = dataset.shape[0]         # number of chunks in the dataset
+                     sentences_per_chunk = dataset.shape[1]  # sentences stored per chunk (e.g. 10)
+                 else:
+                     raise ValueError(f"Dataset '{dataset_name}' does not have a shape attribute.")
+             else:
+                 raise TypeError(f"Dataset '{dataset_name}' is not a valid h5py.Dataset.")
+             end_index = min(start_index + self.chunk_size, total_chunks)
+             chunk_data = dataset[start_index:end_index]  # retrieve a slice of chunks
+             chunk_data_decoded = []
+             for chunk in chunk_data:
+                 # Keep at most chunk_size sentences per chunk and decode the stored bytes
+                 chunk_data_decoded.extend([sentence.decode('utf-8') for sentence in chunk[:self.chunk_size]])
+             return total_chunks, chunk_data_decoded, end_index
+
+     def save_fast_text_file(self, file_name, sentences):
+         with open(file_name, "w", encoding="utf-8") as f:
+             for sentence in sentences:
+                 sentence = sentence.strip()
+                 # Ensure each sentence ends with the Ethiopic full stop before writing
+                 if not sentence.endswith("።"):
+                     sentence += "።"
+                 f.write(sentence + "\n")
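A minimal usage sketch for DataLoader (an editor's example, not part of the commit; it assumes GPAC.h5 was already produced by HDF5TextManager and reuses the file names from main.py):

from data_loader import DataLoader

# Assumes GPAC.h5 sits next to this script and holds the 'text_data' dataset.
loader = DataLoader('GPAC', chunk_size=10)
total_chunks, sentences, next_index = loader.load_hdf5(start_index=0)
print(f"{total_chunks} chunks on disk, {len(sentences)} sentences loaded, resume at {next_index}")

# Write the sentences in the one-sentence-per-line format FastText expects.
loader.save_fast_text_file('GPAC_fast.txt', sentences)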
train/fast_text_trainer.py ADDED
@@ -0,0 +1,30 @@
+ import fasttext
+
+
+ class FastTextTrainer:
+     def __init__(self, corpus_file):
+         self.corpus_file = corpus_file
+         self.model_file = "fasttext_model.bin"
+         self.model = None
+
+     def train_model(self, model_type="skipgram", dim=100, epoch=5, lr=0.05, thread=4):
+         print("Training FastText model...")
+         self.model = fasttext.train_unsupervised(
+             input=self.corpus_file,
+             model=model_type,
+             dim=dim,
+             epoch=epoch,
+             lr=lr,
+             thread=thread
+         )
+         self.model.save_model(self.model_file)
+         print(f"Model trained and saved to {self.model_file}")
+
+     def load_model(self):
+         print(f"Loading FastText model from {self.model_file}...")
+         self.model = fasttext.load_model(self.model_file)
+         print("Model loaded successfully.")
+
+     def get_word_vector(self, word):
+         if self.model is None:
+             raise ValueError("Model not loaded. Use `train_model` or `load_model` first.")
+         return self.model.get_word_vector(word)
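A short usage sketch for FastTextTrainer (editor's example; the corpus name mirrors main.py, the query word is an arbitrary sample, and get_nearest_neighbors is part of the fasttext Python API):

from fast_text_trainer import FastTextTrainer

trainer = FastTextTrainer('GPAC_fast.txt')   # corpus: one sentence per line, UTF-8
trainer.train_model(model_type="skipgram", dim=300, epoch=10, thread=4)

vector = trainer.get_word_vector("ኢትዮጵያ")    # 300-dimensional embedding
neighbors = trainer.model.get_nearest_neighbors("ኢትዮጵያ", k=5)
print(vector.shape, neighbors)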
train/hdf5_file_manager.py ADDED
@@ -0,0 +1,43 @@
+ import h5py
+
+
+ class HDF5TextManager:
+     def __init__(self, file_name, chunk_size=1000):
+         self.chunk_size = chunk_size
+         self.file_name = file_name
+
+     def save(self, sentences):
+         # Group the sentences into fixed-size chunks
+         chunks = [
+             sentences[i:i + self.chunk_size]
+             for i in range(0, len(sentences), self.chunk_size)
+         ]
+         with h5py.File(self.file_name, 'w') as hdf:
+             # Create a dataset for storing sentence chunks
+             dtype = h5py.string_dtype(encoding='utf-8')  # UTF-8 variable-length strings
+             max_shape = (None, self.chunk_size)          # unlimited rows for dynamic growth
+             chunk_shape = (1, self.chunk_size)           # one sentence chunk per HDF5 chunk
+
+             dataset = hdf.create_dataset('text_data',
+                                          shape=(len(chunks), self.chunk_size),  # rows = chunks, cols = sentences per chunk
+                                          dtype=dtype,
+                                          maxshape=max_shape,
+                                          chunks=chunk_shape,
+                                          compression="gzip")
+             # Write each chunk of sentences into the dataset, padding the last one
+             for i, chunk in enumerate(chunks):
+                 padded_chunk = chunk + [""] * (self.chunk_size - len(chunk))
+                 dataset[i] = padded_chunk
+         print(f"Text data has been stored in {self.file_name} with sentence chunks.")
+
+     def combine_hdf5_files(self, hdf5_files):
+         with h5py.File(self.file_name, 'a') as combined_h5f:
+             for hdf5_file in hdf5_files:
+                 with h5py.File(hdf5_file, 'r') as individual_h5f:
+                     group_name = hdf5_file.split('.')[0]  # file name without extension as group name
+                     group = combined_h5f.create_group(group_name)
+                     for dataset_name in individual_h5f.keys():
+                         data = individual_h5f[dataset_name][:]
+                         dt = h5py.special_dtype(vlen=str)
+                         group.create_dataset(dataset_name, data=data, dtype=dt)
+         print(f"All HDF5 files combined into {self.file_name}")
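The commented-out code in main.py calls manager.read_hdf5_file("combined.h5"), but HDF5TextManager does not define that method in this commit. A possible sketch of such a helper (hypothetical, assuming the group/dataset layout written by combine_hdf5_files above):

    def read_hdf5_file(self, combined_file):
        # Hypothetical helper, not part of the committed class: list every dataset
        # in a combined file and report its shape and dtype.
        with h5py.File(combined_file, 'r') as hdf:
            def show(name, obj):
                if isinstance(obj, h5py.Dataset):
                    print(f"{name}: shape={obj.shape}, dtype={obj.dtype}")
            hdf.visititems(show)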
train/main.py ADDED
@@ -0,0 +1,28 @@
+ from hdf5_file_manager import HDF5TextManager
+ from text_analyzer import TextAnalyzer
+ from data_loader import DataLoader
+ from fast_text_trainer import FastTextTrainer
+
+ filename = 'GPAC.txt'
+ total_size = 2005
+ # data_loader = DataLoader('GPAC', chunk_size=2005)
+ # total, sentences, index = data_loader.load_hdf5()
+ # print(total)
+ # print(sentences)
+ # print(f"sentence count {len(sentences)}")
+ # text_analyzer = TextAnalyzer(sentences)
+ # print(f"before space word count {len(text_analyzer.get_tokens())}")
+ # cleaned_sentences = text_analyzer.get_sentences()
+ # data_loader.save_fast_text_file('GPAC_fast.txt', cleaned_sentences)
+ # print(f"after space word count {len(text_analyzer.get_tokens())}")
+ # manager = HDF5TextManager('GPAC.h5', chunk_size=1000)
+ # manager.save(sentences)
+
+ # Read the combined HDF5 file
+ # manager.read_hdf5_file("combined.h5")
+ fast_text = FastTextTrainer('GPAC_fast.txt')
+ fast_text.train_model(dim=300, epoch=10, thread=52)
+
+ # Initialize the summarizer
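The commented-out lines above sketch the one-off preprocessing pass that produces GPAC_fast.txt. Run end to end it would look roughly like this (an editor's sketch, assuming GPAC.txt is the raw corpus in the working directory):

from data_loader import DataLoader
from text_analyzer import TextAnalyzer
from hdf5_file_manager import HDF5TextManager
from fast_text_trainer import FastTextTrainer

# Preprocessing: raw text -> sentences -> cleaned sentences -> HDF5 archive + FastText corpus
loader = DataLoader('GPAC', chunk_size=2005)
sentences = loader.load_txt()                      # split GPAC.txt into sentences
analyzer = TextAnalyzer(sentences)                 # strips ፣ , ( ) and extra whitespace
cleaned = analyzer.get_sentences()
HDF5TextManager('GPAC.h5', chunk_size=1000).save(cleaned)
loader.save_fast_text_file('GPAC_fast.txt', cleaned)

# Training, as in the live code above
FastTextTrainer('GPAC_fast.txt').train_model(dim=300, epoch=10, thread=52)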
train/text_analyzer.py ADDED
@@ -0,0 +1,23 @@
+ import re
+
+
+ class TextAnalyzer:
+     def __init__(self, sentences):
+         self.sentences = sentences
+         self.clean_sentences()
+
+     def get_tokens(self):
+         words = [word for sentence in self.sentences for word in sentence.split()]
+         return words
+
+     def get_sentences(self):
+         return self.sentences
+
+     def clean_sentences(self):
+         cleaned_sentences = []
+         for sentence in self.sentences:
+             # Remove specific punctuation marks (Ethiopic comma, Latin comma, parentheses)
+             sentence = re.sub(r'[፣,),(]', '', sentence)
+             # Collapse extra whitespace
+             sentence = re.sub(r'\s+', ' ', sentence).strip()
+             cleaned_sentences.append(sentence)
+         self.sentences = cleaned_sentences
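A tiny usage sketch for TextAnalyzer (editor's example; the sample sentences are made up for illustration):

from text_analyzer import TextAnalyzer

# Cleaning removes ፣ , ( ) and collapses extra spaces
analyzer = TextAnalyzer(["ሰላም  ፣ ዓለም (ምሳሌ)", "hello ,  world"])
print(analyzer.get_sentences())   # ['ሰላም ዓለም ምሳሌ', 'hello world']
print(analyzer.get_tokens())      # ['ሰላም', 'ዓለም', 'ምሳሌ', 'hello', 'world']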