File size: 1,869 Bytes
4b9251f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import os
from datetime import datetime
from typing import List
from zipfile import ZipFile

import pandas as pd


def partition_sentences(sentences: List[str], min_words: int):
    """Group consecutive sentences into space-joined batches.

    Sentences are accumulated in order; as soon as the accumulated
    whitespace-separated word count reaches ``min_words`` the batch is
    yielded as one string and a new batch is started. Any sentences left
    over at the end are yielded as a final batch, even if that batch is
    below the threshold.
    """
    pending: List[str] = []
    pending_words = 0

    for text in sentences:
        # words are whatever str.split() separates on whitespace
        pending_words = pending_words + len(text.split())
        pending.append(text)

        if pending_words >= min_words:
            # threshold met: emit the batch and start a fresh one
            yield " ".join(pending)
            pending, pending_words = [], 0

    # leftover sentences that never reached the threshold form a last batch
    if len(pending) > 0:
        yield " ".join(pending)


def dump_all(pipeline, text_batches: List[str], knowledge_graph: pd.DataFrame, to_path: str = None):
    """Save the pipeline config, text batches and knowledge graph to a ZIP.

    The archive is named ``output-<date>-<id>.zip``, where ``<date>`` is
    today's local date and ``<id>`` is the lowest uppercase hex counter
    (zero-padded to at least three digits) not already used in ``to_path``.

    Args:
        pipeline: Object whose ``serialize()`` result is stored as
            ``config.ini``.
        text_batches: Batch texts, written to ``text_batches.csv`` with a
            ``batch_id`` index column and a ``text`` column.
        knowledge_graph: Frame written to ``kg.csv`` without its index.
        to_path: Output directory; it is created if missing. ``None`` or an
            empty string means the current working directory.

    Returns:
        The path of the ZIP file that was written.
    """
    # Fall back to the current directory: os.path.join(None, ...) raises
    # TypeError and os.makedirs("") raises FileNotFoundError.
    if not to_path:
        to_path = os.curdir

    # make sure the target directory exists before probing for free names
    os.makedirs(to_path, exist_ok=True)

    # metadata
    run_date = str(datetime.now().date())

    # convert batches to df
    batches_df = pd.DataFrame(text_batches, columns=["text"])

    # date + hex id for local saving: take the first counter not yet on disk
    num = 0
    while True:
        hex_num = format(num, 'X').zfill(3)
        zip_path = os.path.join(to_path, f"output-{run_date}-{hex_num}.zip")
        if not os.path.exists(zip_path):
            break
        num += 1

    print(f"Run ID: {run_date}-{hex_num}")

    # create ZIP file
    with ZipFile(zip_path, 'w') as zip_obj:
        zip_obj.writestr("config.ini", pipeline.serialize())
        zip_obj.writestr("text_batches.csv", batches_df.to_csv(index_label="batch_id"))
        zip_obj.writestr("kg.csv", knowledge_graph.to_csv(index=False))

    print(f"Saved data to {zip_path}")

    return zip_path