# Repository page header (not PCL code), kept for reference:
#   sakharamg's picture
#   Uploading all files -- 158b61b
import pcl.io.file as file
import pcl.os.path as path
import pcl.system.process as process
import pcl.util.list as list
import pcl.util.string as string
# Tokenizer component: runs the tokenizer.perl script found under the Moses
# installation over a corpus file and writes the result into the working
# directory.
#
# Input:
#   corpus.filename            path of the corpus file to read.
# Output:
#   corpus.tokenised.filename  path of the result file that was written.
# Configuration:
#   corpus.language            language, lower-cased and passed after "-l".
#   working.directory.root     directory for the result file; created when
#                              path.exists reports it is missing.
#   moses.installation         directory holding scripts/tokenizer/tokenizer.perl.
component tokenizer
  input corpus.filename
  output corpus.tokenised.filename
  configuration corpus.language, working.directory.root, moses.installation
  do
    # Command line: <moses>/scripts/tokenizer/tokenizer.perl -l <language> -q
    lang.code <- string.lower(@corpus.language)
    tokeniser.script <- path.join(@moses.installation, "scripts",
                                  "tokenizer", "tokenizer.perl")
    command.line <- list.cons(tokeniser.script, "-l", lang.code, "-q")

    # Result name: the input basename split on ".", with "tok" inserted at
    # index -1 and the pieces re-joined with ".".
    # NOTE(review): presumably list.insert follows Python semantics, so "tok"
    # lands just before the last piece (e.g. "corpus.en" -> "corpus.tok.en");
    # confirm against pcl.util.list.
    input.basename <- path.basename(corpus.filename)
    name.parts <- string.split(input.basename, ".")
    list.insert(name.parts, -1, "tok")
    output.basename <- string.join(name.parts, ".")
    output.pathname <- path.join(@working.directory.root, output.basename)

    # Make sure the working directory is there before the result file is opened.
    # Both branches return () because only the makedirs side effect matters.
    root.present <- path.exists(@working.directory.root)
    if root.present == False then
      path.makedirs(@working.directory.root)
      return ()
    else
      return ()
    endif

    # Run the tokenizer with the corpus opened for reading and the result
    # opened for writing.
    # NOTE(review): presumably the two handles become the process's stdin and
    # stdout; confirm against pcl.system.process.callAndCheck.
    source.handle <- file.openFile(corpus.filename, "r")
    sink.handle <- file.openFile(output.pathname, "w")
    process.callAndCheck(command.line, source.handle, sink.handle)
    file.closeFile(sink.handle)
    file.closeFile(source.handle)

    return corpus.tokenised.filename <- output.pathname