# Distributed tagging: the input files are split across GPUs, and each file is
# processed end-to-end on a single GPU (one process per GPU).
import argparse
import json
import os

import stanza
import torch
import torch.distributed as dist
import tqdm
from transformers import AutoTokenizer

# Number of input lines joined into one text before chunking and tagging.
BATCH_SIZE = 1000


def chunk_text(text, tokenizer, max_length=512):
    """Split text into pieces of at most max_length subword tokens."""
    tokens = tokenizer(text, add_special_tokens=False)['input_ids']
    chunks = [tokens[i:i + max_length] for i in range(0, len(tokens), max_length)]
    return [tokenizer.decode(chunk, skip_special_tokens=True) for chunk in chunks]


def init_distributed_mode():
    dist.init_process_group(backend='nccl')
    rank = dist.get_rank()
    # Use the rank to select this process's GPU (assumes a single-node launch,
    # where the global rank equals the local rank).
    torch.cuda.set_device(rank)
    return rank


def _get_constituency_parse(sent, nlp):
    """Re-parse one sentence with the constituency pipeline; None on failure."""
    try:
        parse_doc = nlp(sent.text)
    except Exception:
        return None
    parse_trees = [str(parsed.constituency) for parsed in parse_doc.sentences]
    return "(ROOT " + " ".join(parse_trees) + ")"


def run_on_gpu(rank, args, tokenizer, nlp1, nlp2):
    print(f"Rank {rank} started on GPU {torch.cuda.current_device()}")

    # Partition the files evenly across ranks; the last rank takes the remainder.
    world_size = dist.get_world_size()
    files_per_gpu = len(args.path) // world_size
    start_idx = rank * files_per_gpu
    end_idx = start_idx + files_per_gpu if rank != world_size - 1 else len(args.path)
    gpu_files = args.path[start_idx:end_idx]

    for file in gpu_files:
        print(f"GPU {rank}: Processing {file.name}")
        lines = [l.strip() for l in file.readlines()]

        # Join BATCH_SIZE lines at a time into larger texts for the pipeline.
        line_batches = [lines[i:i + BATCH_SIZE] for i in range(0, len(lines), BATCH_SIZE)]
        text_batches = [" ".join(batch) for batch in line_batches]

        line_annotations = []
        for text in tqdm.tqdm(text_batches, desc=f"GPU {rank}"):
            # Keep each pipeline input under the tokenizer's 512-token limit.
            for chunk in chunk_text(text, tokenizer):
                doc = nlp1(chunk)
                sent_annotations = []
                for sent in doc.sentences:
                    word_annotations = []
                    for token, word in zip(sent.tokens, sent.words):
                        wa = {
                            'id': word.id,
                            'text': word.text,
                            'lemma': word.lemma,
                            'upos': word.upos,
                            'xpos': word.xpos,
                            'feats': word.feats,
                            'start_char': token.start_char,
                            'end_char': token.end_char,
                        }
                        word_annotations.append(wa)
                    sa = {
                        'sent_text': sent.text,
                        'word_annotations': word_annotations,
                    }
                    if args.parse:
                        sa['constituency_parse'] = _get_constituency_parse(sent, nlp2)
                    sent_annotations.append(sa)
                line_annotations.append({'sent_annotations': sent_annotations})

        # Parenthesized so the conditional only selects the suffix, rather than
        # replacing the whole filename when --parse is not set.
        json_filename = os.path.splitext(file.name)[0] + ('_parsed.json' if args.parse else '.json')
        with open(json_filename, "w") as outfile:
            json.dump(line_annotations, outfile, indent=4)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog='Tag BabyLM dataset',
        description='Tag BabyLM dataset using Stanza')
    parser.add_argument('path', type=argparse.FileType('r'), nargs='+',
                        help="Path to file(s)")
    parser.add_argument('-p', '--parse', action='store_true',
                        help="Include constituency parse")
    args = parser.parse_args()

    rank = init_distributed_mode()

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    nlp1 = stanza.Pipeline(lang='en', processors='tokenize,pos,lemma',
                           package="default_accurate", use_gpu=True)
    nlp2 = None
    if args.parse:
        nlp2 = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency',
                               package="default_accurate", use_gpu=True)

    run_on_gpu(rank, args, tokenizer, nlp1, nlp2)
    dist.destroy_process_group()
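
# Launch sketch. This is an assumption about how the script is meant to be run,
# not something the source states: init_process_group's default env:// method
# needs RANK/WORLD_SIZE/MASTER_ADDR, which `torchrun` sets when it spawns one
# process per GPU. The script name and data paths below are placeholders:
#
#   torchrun --nproc_per_node=4 tag_dataset.py data/*.txt --parse
#
# Each rank then tags its slice of the files and writes one JSON file per input.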