# Spaces:

# nielklug
# /

# mhg-parsing

# Sleeping

# File size: 3,105 Bytes

# 6ed21b9

BUCKWALTER_MAP = {
    '\'': '\u0621',
    '|': '\u0622',
    '>': '\u0623',
    'O': '\u0623',
    '&': '\u0624',
    'W': '\u0624',
    '<': '\u0625',
    'I': '\u0625',
    '}': '\u0626',
    'A': '\u0627',
    'b': '\u0628',
    'p': '\u0629',
    't': '\u062A',
    'v': '\u062B',
    'j': '\u062C',
    'H': '\u062D',
    'x': '\u062E',
    'd': '\u062F',
    '*': '\u0630',
    'r': '\u0631',
    'z': '\u0632',
    's': '\u0633',
    '$': '\u0634',
    'S': '\u0635',
    'D': '\u0636',
    'T': '\u0637',
    'Z': '\u0638',
    'E': '\u0639',
    'g': '\u063A',
    '_': '\u0640',
    'f': '\u0641',
    'q': '\u0642',
    'k': '\u0643',
    'l': '\u0644',
    'm': '\u0645',
    'n': '\u0646',
    'h': '\u0647',
    'w': '\u0648',
    'Y': '\u0649',
    'y': '\u064A',
    'F': '\u064B',
    'N': '\u064C',
    'K': '\u064D',
    'a': '\u064E',
    'u': '\u064F',
    'i': '\u0650',
    '~': '\u0651',
    'o': '\u0652',
    '`': '\u0670',
    '{': '\u0671',
}

# Whole-token escapes and the literal symbol each one stands for.
# Lookups are by the complete token, not by substring.
BUCKWALTER_UNESCAPE = {
    # Round, curly and square brackets (left / right).
    "-LRB-": "(",
    "-RRB-": ")",
    "-LCB-": "{",
    "-RCB-": "}",
    "-LSB-": "[",
    "-RSB-": "]",
    # Arithmetic signs.
    "-PLUS-": "+",
    "-MINUS-": "-",
}

# Punctuation characters collected as a set; none of them is a key of
# BUCKWALTER_MAP.
BUCKWALTER_UNCHANGED = {
    ".", "?", "!", ",", '"', "%", "-", "/", ":", ";", "=",
}

# Hebrew transliteration symbol -> output character.
# Letter comments give the Unicode name of the target code point (with the
# "HEBREW LETTER" prefix dropped). Only non-final letter forms appear here.
HEBREW_MAP = {
    "A": "\u05d0",  # ALEF
    "B": "\u05d1",  # BET
    "G": "\u05d2",  # GIMEL
    "D": "\u05d3",  # DALET
    "H": "\u05d4",  # HE
    "W": "\u05d5",  # VAV
    "Z": "\u05d6",  # ZAYIN
    "X": "\u05d7",  # HET
    "J": "\u05d8",  # TET
    "I": "\u05d9",  # YOD
    "K": "\u05db",  # KAF
    "L": "\u05dc",  # LAMED
    "M": "\u05de",  # MEM
    "N": "\u05e0",  # NUN
    "S": "\u05e1",  # SAMEKH
    "E": "\u05e2",  # AYIN
    "P": "\u05e4",  # PE
    "C": "\u05e6",  # TSADI
    "Q": "\u05e7",  # QOF
    "R": "\u05e8",  # RESH
    "F": "\u05e9",  # SHIN
    "T": "\u05ea",  # TAV
    # Digits map to themselves.
    **{digit: digit for digit in "0123456789"},
    # Punctuation: "U" and "O" stand for the double quote and percent sign;
    # period and comma map to themselves.
    "U": '"',
    "O": "%",
    ".": ".",
    ",": ",",
}

# Regular letter form -> final letter form, for the five Hebrew letters
# that have a distinct final form.
HEBREW_SUFFIX_MAP = {
    "\u05db": "\u05da",  # KAF   -> FINAL KAF
    "\u05de": "\u05dd",  # MEM   -> FINAL MEM
    "\u05e0": "\u05df",  # NUN   -> FINAL NUN
    "\u05e4": "\u05e3",  # PE    -> FINAL PE
    "\u05e6": "\u05e5",  # TSADI -> FINAL TSADI
}

# Whole-token "yy"-prefixed punctuation escapes and the literal text each
# one stands for. Lookups are by the complete token, not by substring.
HEBREW_UNESCAPE = {
    "yyCLN": ":",     # colon
    "yyCM": ",",      # comma
    "yyDASH": "-",    # dash
    "yyDOT": ".",     # period
    "yyELPS": "...",  # ellipsis
    "yyEXCL": "!",    # exclamation mark
    "yyLRB": "(",     # left round bracket
    "yyQM": "?",      # question mark
    "yyRRB": ")",     # right round bracket
    "yySCLN": ";",    # semicolon
}



def arabic(inp):
    """
    Undo Buckwalter transliteration for a single token.

    If ``inp`` is one of the whole-token escapes in ``BUCKWALTER_UNESCAPE``
    (``-LRB-``, ``-LCB-``, ``-PLUS-``, ...), the literal symbol it stands for
    is returned as-is. Any other token is converted character by character
    through ``BUCKWALTER_MAP``; characters without an entry are kept
    unchanged.

    Args:
        inp: A single transliterated token.

    Returns:
        The token with Buckwalter symbols replaced by Arabic characters, or
        the literal symbol for an escape token.

    See: http://languagelog.ldc.upenn.edu/myl/ldc/morph/buckwalter.html

    This code inspired by:
    https://github.com/dlwh/epic/blob/master/src/main/scala/epic/util/ArabicNormalization.scala
    """
    literal = BUCKWALTER_UNESCAPE.get(inp)
    if literal is not None:
        # An escape token already names a literal symbol. "{" and "}" are
        # also Buckwalter letters, so passing the unescaped text through
        # BUCKWALTER_MAP would turn a literal brace into an Arabic letter.
        return literal
    return "".join(BUCKWALTER_MAP.get(char, char) for char in inp)

def hebrew(inp):
    """
    Undo Hebrew transliteration for a single token.

    The token is first looked up as a whole in ``HEBREW_UNESCAPE``; the
    result (or the token itself when there is no entry) is then converted
    character by character through ``HEBREW_MAP``, keeping characters that
    have no entry. Finally, if the last output character has a final form
    in ``HEBREW_SUFFIX_MAP``, it is replaced by that form.

    See: http://www.phil.uu.nl/ozsl/articles/simaan02.pdf

    This code inspired by:
    https://github.com/habeanf/yap/blob/b57502364b73ef78f3510eb890319ae268eeacca/nlp/parser/xliter8/types.go
    """
    unescaped = HEBREW_UNESCAPE.get(inp, inp)
    converted = "".join([HEBREW_MAP.get(symbol, symbol) for symbol in unescaped])
    if not converted:
        # Nothing to convert, so there is no last letter to adjust.
        return converted
    # Only the last character can take a final letter form.
    stem, last = converted[:-1], converted[-1]
    return stem + HEBREW_SUFFIX_MAP.get(last, last)

# Registry of the transliteration-undo functions, keyed by language name.
TRANSLITERATIONS = dict(
    arabic=arabic,
    hebrew=hebrew,
)