File size: 1,384 Bytes
158b61b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import pcl.io.file as file
import pcl.os.path as path
import pcl.system.process as process
import pcl.util.list as list
import pcl.util.string as string

# Component: tokenizer
#
# Runs the tokenizer.perl script found under the configured Moses
# installation over a corpus file and writes the result into the working
# directory.
#
# Input:
#   corpus.filename            - path of the corpus file to tokenise.
# Output:
#   corpus.tokenised.filename  - path of the file the tokeniser output was
#                                written to.
# Configuration:
#   corpus.language            - language passed to the tokeniser via "-l"
#                                (lower-cased first).
#   working.directory.root     - directory that receives the result file;
#                                created if it does not exist.
#   moses.installation         - root directory under which
#                                scripts/tokenizer/tokenizer.perl is looked up.
component tokenizer
  input corpus.filename
  output corpus.tokenised.filename
  configuration corpus.language, working.directory.root, moses.installation
  do
    # Lower-case the configured language before handing it to the script.
    language <- string.lower(@corpus.language)

    # Build the result file name: split the corpus basename on ".", insert
    # "tok" at index -1, and re-join with ".". The result lives in the
    # working directory.
    # NOTE(review): presumably list.insert follows Python's list.insert, so
    # "tok" lands before the last element (e.g. "corpus.en" ->
    # "corpus.tok.en"); a basename with no "." would then become
    # "tok.<name>" - confirm this is intended.
    corpus.file.basename <- path.basename(corpus.filename)
    corpus.file.basename.bits <- string.split(corpus.file.basename, ".")
    list.insert(corpus.file.basename.bits, -1, "tok")
    result.basename <- string.join(corpus.file.basename.bits, ".")
    result.pathname <- path.join(@working.directory.root, result.basename)

    # Create the working directory when it is missing. Both branches return
    # the unit value (); the else branch is intentionally a no-op.
    working.exists <- path.exists(@working.directory.root)
    if working.exists == False then
      path.makedirs(@working.directory.root)
      return ()
    else
      return ()
    endif

    # Command line: <moses.installation>/scripts/tokenizer/tokenizer.perl
    # followed by the arguments "-l", <language>, "-q".
    tokeniser.cmd <- path.join(@moses.installation, "scripts",
                               "tokenizer", "tokenizer.perl")
    tokeniser.cmd.line <- list.cons(tokeniser.cmd, "-l", language, "-q")

    # Open the corpus for reading and the result file for writing, then run
    # the command with both handles.
    # NOTE(review): presumably callAndCheck wires the two handles to the
    # process's stdin and stdout and fails on a non-zero exit - confirm
    # against pcl.system.process. If it does fail, the closeFile calls
    # below are not reached and both handles stay open.
    corpus.file <- file.openFile(corpus.filename, "r")
    result.file <- file.openFile(result.pathname, "w")
    process.callAndCheck(tokeniser.cmd.line, corpus.file, result.file)
    file.closeFile(result.file)
    file.closeFile(corpus.file)

    # Publish the path of the tokenised file on the output port.
    return corpus.tokenised.filename <- result.pathname