# A single input file can be split into smaller chunks and processed on different GPUs.
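#
# The script expects one process per GPU. A sketch of the launch command follows;
# the script filename and GPU count are assumptions, not fixed by this file. Any
# launcher that sets RANK / WORLD_SIZE / MASTER_ADDR (e.g. torchrun) should work:
#
#   torchrun --nproc_per_node=4 tag_dataset_distributed.py data/part_*.txt --parse
#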
import torch
import torch.distributed as dist
import sys
sys.path.append("..")

import tqdm
import os
import argparse
import stanza
import json
from transformers import AutoTokenizer

def chunk_text(text, tokenizer, max_length=512):
    # Tokenize the text and split it into chunks of at most max_length tokens;
    # each chunk is decoded back to a string so the Stanza pipeline can process it.
    tokens = tokenizer(text)['input_ids']
    chunks = [tokens[i:i + max_length] for i in range(0, len(tokens), max_length)]
    return [tokenizer.decode(chunk, skip_special_tokens=True) for chunk in chunks]

def init_distributed_mode():
    # Initialize the default process group; expects the launcher (e.g. torchrun)
    # to have set RANK / WORLD_SIZE / MASTER_ADDR in the environment.
    dist.init_process_group(backend='nccl')
    rank = dist.get_rank()
    torch.cuda.set_device(rank)  # bind this process to the GPU matching its rank (single-node setup)
    return rank

def process_single_file(file, rank, tokenizer, nlp1, nlp2, split_by_rank=True):
    print(f"GPU {rank}: Processing {file.name}")
    lines = file.readlines()

    if split_by_rank:
        # Split the lines of this file across GPUs by rank
        num_lines = len(lines)
        num_gpus = dist.get_world_size()

        lines_per_gpu = (num_lines + num_gpus - 1) // num_gpus
        start_idx = rank * lines_per_gpu
        end_idx = min(start_idx + lines_per_gpu, num_lines)
        gpu_lines = lines[start_idx:end_idx]
    else:
        # This rank owns the whole file, so process every line
        gpu_lines = lines

    # BATCH_SIZE is a module-level constant set in __main__
    line_batches = [gpu_lines[i:i + BATCH_SIZE] for i in range(0, len(gpu_lines), BATCH_SIZE)]
    text_batches = [" ".join(batch) for batch in line_batches]

    line_annotations = []
    for text in tqdm.tqdm(text_batches, desc=f"GPU {rank}"):
        text_chunks = chunk_text(text, tokenizer)
        for chunk in text_chunks:
            doc = nlp1(chunk)
            sent_annotations = []
            for sent in doc.sentences:
                word_annotations = []
                for token, word in zip(sent.tokens, sent.words):
                    wa = {
                        'id': word.id,
                        'text': word.text,
                        'lemma': word.lemma,
                        'upos': word.upos,
                        'xpos': word.xpos,
                        'feats': word.feats,
                        'start_char': token.start_char,
                        'end_char': token.end_char
                    }
                    word_annotations.append(wa)

                sa = {
                    'sent_text': sent.text,
                    'word_annotations': word_annotations
                }
                if args.parse:
                    sa['constituency_parse'] = __get_constituency_parse(sent, nlp2)
                
                sent_annotations.append(sa)
            line_annotations.append({'sent_annotations': sent_annotations})

    # Write this rank's partial output to a temporary file
    temp_filename = os.path.splitext(file.name)[0] + f'_rank{rank}.json'
    with open(temp_filename, "w") as outfile:
        json.dump(line_annotations, outfile, indent=4)

    return temp_filename

def merge_files(temp_files, output_file):
    merged_data = []
    for file in temp_files:
        with open(file, "r") as infile:
            data = json.load(infile)
            merged_data.extend(data)
        os.remove(file)  # delete the temporary file after merging
    
    with open(output_file, "w") as outfile:
        json.dump(merged_data, outfile, indent=4)

def run_on_gpu(rank, args, tokenizer, nlp1, nlp2):
    print(f"Running on Rank {rank}, using GPU {torch.cuda.current_device()}")

    if len(args.path) == 1:
        # Single input file: every rank processes its own slice of lines
        process_single_file(args.path[0], rank, tokenizer, nlp1, nlp2)
        dist.barrier()  # wait for all processes to finish their slice
        if rank == 0:
            # Merge the per-rank temporary files into one output
            base = os.path.splitext(args.path[0].name)[0]
            temp_files = [base + f'_rank{r}.json' for r in range(dist.get_world_size())]
            final_output = base + '_merged.json'
            merge_files(temp_files, final_output)
    else:
        # Multiple input files: distribute whole files across ranks,
        # with the last rank picking up any remainder
        files_per_gpu = len(args.path) // dist.get_world_size()
        start_idx = rank * files_per_gpu
        end_idx = start_idx + files_per_gpu if rank != dist.get_world_size() - 1 else len(args.path)
        gpu_files = args.path[start_idx:end_idx]

        for file in gpu_files:
            process_single_file(file, rank, tokenizer, nlp1, nlp2, split_by_rank=False)

def __get_constituency_parse(sent, nlp):
    # Re-run the constituency pipeline on the sentence text; skip sentences it cannot parse
    try:
        parse_doc = nlp(sent.text)
    except Exception:
        return None
    parse_trees = [str(sent.constituency) for sent in parse_doc.sentences]
    return "(ROOT " + " ".join(parse_trees) + ")"

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog='Tag BabyLM dataset',
        description='Tag BabyLM dataset using Stanza')
    parser.add_argument('path', type=argparse.FileType('r'),
                        nargs='+', help="Path to file(s)")
    parser.add_argument('-p', '--parse', action='store_true',
                        help="Include constituency parse")
    args = parser.parse_args()

    rank = init_distributed_mode()

    BATCH_SIZE = 1000  # number of lines joined into one text batch per Stanza call
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    nlp1 = stanza.Pipeline(lang='en', processors='tokenize,pos,lemma', package="default_accurate", use_gpu=True)

    nlp2 = None
    if args.parse:
        nlp2 = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency', package="default_accurate", use_gpu=True)

    run_on_gpu(rank, args, tokenizer, nlp1, nlp2)
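
# For reference, each element written by process_single_file has roughly the shape
# below (a sketch reconstructed from the annotation code above; the values are
# illustrative, not taken from a real run):
#
#   {
#       "sent_annotations": [
#           {
#               "sent_text": "...",
#               "word_annotations": [
#                   {"id": 1, "text": "...", "lemma": "...", "upos": "...",
#                    "xpos": "...", "feats": "...", "start_char": 0, "end_char": 3}
#               ],
#               "constituency_parse": "(ROOT ...)"  # only present when --parse is given
#           }
#       ]
#   }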