from argparse import ArgumentParser

from allennlp.data import tokenizers
from combo.predict import COMBO

parser = ArgumentParser()
parser.add_argument('--parser')
parser.add_argument('--infile')
parser.add_argument('--pretokenized', action='store_true')
args = parser.parse_args()

# If your data is pre-tokenized, add the --pretokenized flag.
# If you have a GPU available, you can pass cuda_device= to COMBO.from_pretrained.
# The parser expects input in the same format as test_file.txt, i.e. one sentence per line.

if args.pretokenized:
    from Tokenizer.src.tokenizer import split_into_sentences

    # Tokens are treated as whitespace-separated; note that this branch loads
    # 'combo-is-combined-v211' regardless of the --parser argument.
    nlp = COMBO.from_pretrained('combo-is-combined-v211',
                                tokenizer=tokenizers.SpacyTokenizer(split_on_spaces=True))
else:
    nlp = COMBO.from_pretrained(args.parser)


def read_test_file(file):
    """Yield one sentence per line; with --pretokenized, each line is run through
    split_into_sentences and the resulting tokens are joined on single spaces."""
    with open(file, 'r', encoding='utf-8') as infile:
        for line in infile:
            if args.pretokenized:
                yield ' '.join(split_into_sentences(line))
            else:
                yield line.rstrip()


test_file = read_test_file(args.infile)

for sent in test_file:
    sentence = nlp(sent)
    for token in sentence.tokens:
        print(f'{token.id}\t{token.token}\t{token.lemma}\t{token.upostag}\t'
              f'{token.xpostag}\t{token.feats}\t{token.head}\t{token.deprel}\t'
              f'{token.deps}\t{token.misc}')
    print()
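
# A minimal usage sketch, assuming the script above is saved as parse.py (the
# script name and the redirected output file name are placeholders, not part of
# the original). The model name passed to --parser below is the one hard-coded
# in the pre-tokenized branch; any other COMBO pretrained model name or local
# model path should work the same way.
#
#   python parse.py --parser combo-is-combined-v211 --infile test_file.txt > parsed.tsv
#
#   # Pre-tokenized mode: each input line is split into whitespace-separated
#   # tokens via split_into_sentences before parsing. Note that this branch
#   # ignores --parser and always loads 'combo-is-combined-v211'.
#   python parse.py --infile test_file.txt --pretokenized
#
# Each token is printed as ten tab-separated, CoNLL-U-style columns
# (ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC), with a blank
# line between sentences.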