import os from datetime import datetime from typing import List from zipfile import ZipFile import pandas as pd def partition_sentences(sentences: List[str], min_words: int): current_batch = [] word_count = 0 for sentence in sentences: # count the number of words in the sentence word_count += len(sentence.split()) # add sentence to the current batch current_batch.append(sentence) # if the word count exceeds or equals the minimum threshold, yield the current batch if word_count >= min_words: yield " ".join(current_batch) current_batch = [] # reset the batch word_count = 0 # reset the word count # yield the remaining batch if it's not empty if current_batch: yield " ".join(current_batch) def dump_all(pipeline, text_batches: List[str], knowledge_graph: pd.DataFrame, to_path: str=None): """Save all items to ZIP.""" # metadata date = str(datetime.now().date()) # convert batches to df batches_df = pd.DataFrame(text_batches, columns=["text"]) # date + hex id for local saving num = 0 while True: hex_num = format(num, 'X').zfill(3) filename = f"output-{date}-{hex_num}.zip" zip_path = os.path.join(to_path, filename) if os.path.exists(zip_path): num += 1 else: break print(f"Run ID: {date}-{hex_num}") os.makedirs(to_path, exist_ok=True) # create ZIP file with ZipFile(zip_path, 'w') as zipObj: zipObj.writestr("config.ini", pipeline.serialize()) zipObj.writestr("text_batches.csv", batches_df.to_csv(index_label="batch_id")) zipObj.writestr("kg.csv", knowledge_graph.to_csv(index=False)) print(f"Saved data to {zip_path}") return zip_path