berito's picture
train code added
a608bb4
raw
history blame
964 Bytes
from hdf5_file_manager import HDF5TextManager
from text_analyzer import TextAnalyzer
from data_loader import DataLoader
from fast_text_trainer import FastTextTrainer
# Corpus settings. Neither name is read by the active code below; they are
# kept as module-level values next to the disabled preprocessing steps.
filename = 'GPAC.txt'
total_size = 2005

# ---------------------------------------------------------------------------
# Disabled preprocessing pipeline (kept for reference, not executed).
# ---------------------------------------------------------------------------
# Step 1: load the corpus through DataLoader and report its size.
# data_loader = DataLoader('GPAC', chunk_size=2005)
# total, sentences, index = data_loader.load_hdf5()
# print(total)
# # print(sentences)
# print(f"sentence count {len(sentences)}")
#
# Step 2: run the sentences through TextAnalyzer and write the result to
# 'GPAC_fast.txt', the file the training step below reads.
# text_analyzer = TextAnalyzer(sentences)
# # print(f"before space word count {len(text_analyzer.get_tokens())}")
# cleaned_sentences = text_analyzer.get_sentences()
# data_loader.save_fast_text_file('GPAC_fast.txt', cleaned_sentences)
# print(f"after space word count {len(text_analyzer.get_tokens())}")
#
# Step 3: store the sentences in an HDF5 file and read the combined file back.
# manager = HDF5TextManager('GPAC.h5', chunk_size=1000)
# manager.save(sentences)
# manager.read_hdf5_file("combined.h5")

# ---------------------------------------------------------------------------
# Active step: train on the prepared text file.
# ---------------------------------------------------------------------------
# NOTE(review): 'GPAC_fast.txt' must already exist, since the step that writes
# it is commented out above. The keyword arguments are presumably the vector
# dimension, the number of epochs and the worker thread count; confirm against
# FastTextTrainer.train_model.
fast_text = FastTextTrainer('GPAC_fast.txt')
fast_text.train_model(
    dim=300,
    epoch=10,
    thread=52,
)
# NOTE: stale comment ("Initialize the summarizer") - no summarizer is created anywhere in this script.