# NMTKD/translation/tools/mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/tokenizer.pcl
# Wraps the Moses tokenizer.perl script as a PCL component: reads a raw
# corpus file and writes a tokenised copy into the working directory.
#
# Input:         corpus.filename            — path of the raw corpus file
# Output:        corpus.tokenised.filename  — path of the tokenised corpus
# Configuration: corpus.language            — language code passed to -l
#                working.directory.root     — directory for the output file
#                moses.installation         — root of the Moses installation
import pcl.io.file as file
import pcl.os.path as path
import pcl.system.process as process
import pcl.util.list as list
import pcl.util.string as string

component tokenizer
  input corpus.filename
  output corpus.tokenised.filename
  configuration corpus.language, working.directory.root, moses.installation
  do
    # tokenizer.perl takes a lower-case language code (e.g. "en") via -l.
    language <- string.lower(@corpus.language)

    # Build the output filename by inserting "tok" before the final
    # extension, e.g. corpus.en -> corpus.tok.en
    corpus.file.basename <- path.basename(corpus.filename)
    corpus.file.basename.bits <- string.split(corpus.file.basename, ".")
    list.insert(corpus.file.basename.bits, -1, "tok")
    result.basename <- string.join(corpus.file.basename.bits, ".")
    result.pathname <- path.join(@working.directory.root, result.basename)

    # Create the working directory if it does not already exist.  Both
    # branches return () — idiomatic PCL for a side-effecting conditional.
    working.exists <- path.exists(@working.directory.root)
    if working.exists == False then
      path.makedirs(@working.directory.root)
      return ()
    else
      return ()
    endif

    # Run "tokenizer.perl -l <language> -q" with the raw corpus on stdin
    # and the tokenised corpus on stdout (-q suppresses the script's banner).
    tokeniser.cmd <- path.join(@moses.installation, "scripts",
                               "tokenizer", "tokenizer.perl")
    tokeniser.cmd.line <- list.cons(tokeniser.cmd, "-l", language, "-q")
    corpus.file <- file.openFile(corpus.filename, "r")
    result.file <- file.openFile(result.pathname, "w")
    process.callAndCheck(tokeniser.cmd.line, corpus.file, result.file)
    file.closeFile(result.file)
    file.closeFile(corpus.file)

    return corpus.tokenised.filename <- result.pathname