|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" |
|
Preprocesses pretrained word embeddings, creates dev sets for tasks without a |
|
provided one, and figures out the set of output classes for each task. |
|
""" |
|
|
|
from __future__ import absolute_import |
|
from __future__ import division |
|
from __future__ import print_function |
|
|
|
import os |
|
import random |
|
|
|
from base import configure |
|
from base import embeddings |
|
from base import utils |
|
from task_specific.word_level import word_level_data |
|
|
|
|
|
def main(data_dir='./data'):
  """Run all preprocessing: embeddings, dev-set construction, label mappings.

  Args:
    data_dir: Root directory holding raw task data; outputs are written
      under it as well.
  """
  # Fixed seed so the shuffled train/dev split is reproducible across runs.
  random.seed(0)

  utils.log("BUILDING WORD VOCABULARY/EMBEDDINGS")
  for embeddings_file in ['glove.6B.300d.txt']:
    embedding_config = configure.Config(
        data_dir=data_dir,
        for_preprocessing=True,
        pretrained_embeddings=embeddings_file,
        word_embedding_size=300)
    embeddings.PretrainedEmbeddingLoader(embedding_config).build()

  utils.log("CONSTRUCTING DEV SETS")
  for task in ["chunk"]:
    dev_config = configure.Config(data_dir=data_dir,
                                  for_preprocessing=True)
    task_dir = os.path.join(dev_config.raw_data_topdir, task) + '/'
    sentences = word_level_data.TaggedDataLoader(
        dev_config, task, False).get_labeled_sentences("train")
    random.shuffle(sentences)
    # First 1500 shuffled sentences become the dev set; the remainder form
    # the reduced training subset.
    write_sentences(task_dir + 'train_subset.txt', sentences[1500:])
    write_sentences(task_dir + 'dev.txt', sentences[:1500])

  utils.log("WRITING LABEL MAPPINGS")
  for task in ["chunk"]:
    for encoding_index, encoding in enumerate(["BIOES"]):
      label_config = configure.Config(data_dir=data_dir,
                                      for_preprocessing=True,
                                      label_encoding=encoding)
      is_token_level = task in ["ccg", "pos", "depparse"]
      loader = word_level_data.TaggedDataLoader(label_config, task,
                                                is_token_level)
      if is_token_level:
        # Token-level tasks ignore the span label encoding, so only write
        # the mapping once rather than once per encoding.
        if encoding_index != 0:
          continue
        utils.log("WRITING LABEL MAPPING FOR", task.upper())
      else:
        utils.log(" Writing label mapping for", task.upper(),
                  encoding)
      utils.log(" ", len(loader.label_mapping), "classes")
      utils.write_cpickle(loader.label_mapping,
                          loader.label_mapping_path)
|
|
|
|
|
def write_sentences(fname, sentences):
  """Write tagged sentences to `fname` in CoNLL-style column format.

  Each sentence is written as one "word tag" pair per line, with a blank
  line terminating every sentence.

  Args:
    fname: Path of the output file (created or truncated).
    sentences: Iterable of (words, tags) pairs, where words and tags are
      parallel sequences of strings.
  """
  # Explicit UTF-8 so output does not depend on the platform's default
  # encoding (the original `open(fname, 'w')` did).
  with open(fname, 'w', encoding='utf-8') as f:
    for words, tags in sentences:
      for word, tag in zip(words, tags):
        f.write(word + " " + tag + "\n")
      f.write("\n")  # blank line marks the sentence boundary
|
|
|
|
|
# Run all preprocessing steps with the default data directory when this
# module is executed as a script.
if __name__ == '__main__':
  main()
|
|