File size: 1,384 Bytes
158b61b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 |
import pcl.io.file as file
import pcl.os.path as path
import pcl.system.process as process
import pcl.util.list as list
import pcl.util.string as string
# Tokenizer component.
#
# Takes the path of a corpus file and produces the path of a tokenised copy
# written under the configured working directory. The copy is produced by
# invoking scripts/tokenizer/tokenizer.perl from the configured Moses
# installation with the lower-cased corpus language.
#
#   input          corpus.filename            path of the corpus to tokenise
#   output         corpus.tokenised.filename  path of the tokenised result
#   configuration  corpus.language            language passed to the script via -l
#                  working.directory.root     directory that receives the result
#                  moses.installation         root directory of the Moses install
component tokenizer
  input corpus.filename
  output corpus.tokenised.filename
  configuration corpus.language, working.directory.root, moses.installation
  do
    # The language is lower-cased before being handed to the script.
    lang.code <- string.lower(@corpus.language)

    # Build the tokeniser command line: <moses>/scripts/tokenizer/tokenizer.perl -l <lang> -q
    perl.script <- path.join(@moses.installation, "scripts", "tokenizer", "tokenizer.perl")
    command.line <- list.cons(perl.script, "-l", lang.code, "-q")

    # Derive the result file name by adding a "tok" component to the corpus
    # basename. Presumably index -1 places it just before the final
    # dot-separated component (e.g. "a.en" -> "a.tok.en"); a basename with no
    # "." may behave differently -- TODO confirm against list.insert.
    source.basename <- path.basename(corpus.filename)
    name.parts <- string.split(source.basename, ".")
    list.insert(name.parts, -1, "tok")
    tokenised.basename <- string.join(name.parts, ".")
    tokenised.pathname <- path.join(@working.directory.root, tokenised.basename)

    # Create the working directory when it is not there yet. Both branches
    # return unit; the else branch is a no-op.
    workdir.present <- path.exists(@working.directory.root)
    if workdir.present == False then
      path.makedirs(@working.directory.root)
      return ()
    else
      return ()
    endif

    # Run the tokeniser with the corpus opened for reading and the result
    # opened for writing (presumably wired to the child's stdin and stdout --
    # verify against process.callAndCheck), then close both handles.
    source.handle <- file.openFile(corpus.filename, "r")
    target.handle <- file.openFile(tokenised.pathname, "w")
    process.callAndCheck(command.line, source.handle, target.handle)
    file.closeFile(target.handle)
    file.closeFile(source.handle)

    return corpus.tokenised.filename <- tokenised.pathname
|