#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Starting point for writing this script
# https://github.com/castorini/docTTTTTquery/blob/master/convert_msmarco_passages_doc_to_anserini.py
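#
# Example invocation (the script name and paths below are illustrative, not fixed
# by this file; only the flags come from the argument parser defined here):
#
#   python segment_docs.py \
#       --input /path/to/msmarco_v2_doc \
#       --output /path/to/msmarco_v2_doc_segmented \
#       --max_length 10 --stride 5 --num_workers 8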
import argparse
import glob
import gzip
import json
import os
from multiprocessing import Pool

import spacy  # currently using spaCy 2.3.5
from tqdm import tqdm

def create_segments(doc_text, max_length, stride):
    """Split a document into overlapping windows of at most `max_length` sentences,
    advancing the window start by `stride` sentences each time."""
    doc_text = doc_text.strip()
    # Sentence-split only the first 10,000 characters of the body.
    doc = nlp(doc_text[:10000])
    sentences = [sent.text.strip() for sent in doc.sents]
    segments = []

    for i in range(0, len(sentences), stride):
        segment = " ".join(sentences[i:i + max_length])
        segments.append(segment)
        # Stop once the current window has covered the last sentence.
        if i + max_length >= len(sentences):
            break
    return segments
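# Example: with the defaults (max_length=10, stride=5), a 25-sentence document
# produces overlapping windows over sentences [0:10], [5:15], [10:20], and
# [15:25]; the loop stops once a window reaches the last sentence.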

def split_document(f_ins, f_out):
    """Segment every document in the gzipped JSONL files `f_ins` and write the
    resulting passages to `f_out`, with their ids in a companion .id file."""
    print('Splitting documents...')
    output = open(f_out, 'w')
    output_id = open(f_out.replace(".json", ".id"), 'w')
    for f_in in f_ins:
        with gzip.open(f_in, 'rt', encoding='utf8') as in_fh:
            # Each line of the MS MARCO V2 doc corpus is a JSON record with
            # docid, url, title, headings, and body fields.
            for json_string in tqdm(in_fh):
                doc = json.loads(json_string)
                f_doc_id = doc['docid']
                doc_url = doc['url']
                doc_title = doc['title']
                doc_headings = doc['headings']
                doc_text = doc['body']

                segments = create_segments(doc_text, args.max_length, args.stride)

                for seg_id, segment in enumerate(segments):
                    # expanded_text = f'{doc_url}\n{doc_headings}\n{doc_title}\n{segment}'
                    # Segment ids append the window index to the original docid, e.g. <docid>#0.
                    doc_seg = f'{f_doc_id}#{seg_id}'
                    output_dict = {'docid': doc_seg, 'url': doc_url, 'title': doc_title, 'headings': doc_headings, 'segment': segment}
                    output.write(json.dumps(output_dict) + '\n')
                    output_id.write(doc_seg + '\n')

    output.close()
    output_id.close()
    print('Done!')

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Segment MS MARCO V2 original docs into passages')
    parser.add_argument('--input', required=True, help='path to the MS MARCO V2 document corpus (a directory of *.gz files).')
    parser.add_argument('--output', required=True, help='output directory for the segmented JSON files.')
    parser.add_argument('--max_length', default=10, type=int, help='maximum number of sentences per passage')
    parser.add_argument('--stride', default=5, type=int, help='number of sentences between the starts of consecutive passages')
    parser.add_argument('--num_workers', default=1, type=int)
    args = parser.parse_args()


    os.makedirs(args.output, exist_ok=True)

    max_length = args.max_length
    stride = args.stride
    # Blank English pipeline with only a rule-based sentencizer (spaCy 2.x API;
    # in spaCy 3+ this would be nlp.add_pipe("sentencizer")).
    nlp = spacy.blank("en")
    nlp.add_pipe(nlp.create_pipe("sentencizer"))

    # Shard the .gz files evenly across workers; each worker writes its own doc<i>.json.
    files = glob.glob(os.path.join(args.input, '*.gz'))
    num_files = len(files)
    pool = Pool(args.num_workers)
    num_files_per_worker = num_files // args.num_workers
    results = []
    for i in range(args.num_workers):
        f_out = os.path.join(args.output, 'doc' + str(i) + '.json')
        if i == (args.num_workers - 1):
            # The last worker also picks up any remainder files.
            file_list = files[i * num_files_per_worker:]
        else:
            file_list = files[i * num_files_per_worker:(i + 1) * num_files_per_worker]

        results.append(pool.apply_async(split_document, (file_list, f_out)))

    pool.close()
    # Propagate any exceptions raised in the workers.
    for result in results:
        result.get()
    pool.join()

    print('Done!')