|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# portTok: command-line tokenizer that reads raw Portuguese sentences and
# writes them out in CoNLL-U format, splitting contractions and clitics.
import sys, os

import lexikon  # project-local lexicon package (provides the UDlexPT class)

# Shared lexicon instance; desambIt() queries it for context-sensitive
# part-of-speech/feature lookups when deciding whether to split a word.
lex = lexikon.UDlexPT()
|
|
|
|
|
|
|
|
|
|
|
def parseOptions(arguments):
    """Parse the command-line argument vector (sys.argv style).

    Recognised options: -h/-help, -m/-match, -t/-trim, -s/-sid MODEL,
    -o/-output FILE.  Any non-option argument must be an existing file
    and becomes the input file.

    Returns [output_file, input_file, match, trim, model] on success,
    or None when help was requested or an argument was invalid.
    """
    output_file, input_file, match, trim, model = "", [], False, False, "S0000"
    i = 1
    while i < len(arguments):
        arg = arguments[i]
        # A lone "-" is treated as a (non-existent) file name instead of
        # crashing on arg[1] as the original did.
        if arg.startswith("-") and len(arg) > 1:
            if arg in ("-h", "-help"):
                print("Opções:\n-h ajuda\n-o arquivo de saída",
                      "-m corrige pontuações casadas (aspas, parenteses, etc)",
                      "-t remove possíveis MANCHETES que precedem as frases",
                      "-s modelo de identificador de sentença",
                      "Exemplo de utilização:",
                      "portTok -o sents.conllu -m -t -s S0000 sents.txt",
                      "Busca as sentenças no arquivo 'sents.txt',",
                      " corrige pontuações casadas (aspas, parenteses, etc),",
                      " remove possíveis MANCHETES que precedem as frases",
                      # fix: a missing comma here used to merge this line with
                      # the next via implicit string concatenation
                      " usa S0000 como modelo de identificador de sentença e",
                      " salva as sentenças devidamente tokenizadas no arquivo 'sents.conllu'",
                      sep="\n")
                return None
            elif arg in ("-m", "-match"):
                match = True
                i += 1
            elif arg in ("-t", "-trim"):
                trim = True
                i += 1
            elif arg in ("-s", "-sid"):
                # explicit bounds check instead of a bare except
                if i + 1 < len(arguments):
                    model = arguments[i + 1]
                    i += 2
                else:
                    print("modelo de identificador de sentença não informado - assumindo S000")
                    i += 1
            elif arg in ("-o", "-output"):
                # fix: the original indexed arguments[i+1] unchecked and
                # crashed with IndexError when -o was the last argument
                if i + 1 < len(arguments):
                    output_file = arguments[i + 1]
                    i += 2
                else:
                    print("arquivo de saída não informado - assumindo padrão")
                    i += 1
            else:
                print("Opção {} inválida, demais opções ignoradas, por favor execute novamente".format(arg))
                return None
        else:
            if os.path.isfile(arg):
                input_file = arg
                i += 1
            else:
                print("O arquivo {} não foi encontrado, por favor execute novamente".format(arg))
                return None
    return [output_file, input_file, match, trim, model]
|
|
|
|
|
|
|
|
|
def nextName(name):
    """Return the next sentence identifier by incrementing the trailing digits.

    Works right-to-left with decimal carry: "S0009" -> "S0010".  When a
    non-digit character absorbs the carry it is replaced by "1" (original
    behavior, so "S9999" -> "10000"); if the whole name is consumed by
    carries, the result is prefixed with "overflow".
    """
    carried = ""
    while name != "":
        digit, name = name[-1], name[:-1]
        if digit == "9":
            # 9 rolls over to 0 and the carry propagates left
            carried = "0" + carried
        elif digit in "012345678":
            # no carry needed beyond this position
            return name + str(int(digit) + 1) + carried
        else:
            # non-digit: original behavior replaces it with "1" and stops
            return name + "1" + carried
    return "overflow" + carried
|
|
|
|
|
|
|
|
|
def trimIt(s):
    """Heuristically strip headline/credit material preceding the sentence.

    In order: drop a leading bullet-like mark; skip past a leading
    unclosed "(...)" credit; skip a run of ALL-CAPS headline words (the
    last ALL-CAPS word is also dropped when the following word starts
    with a capital letter).  Returns "" when nothing remains.
    """
    # split() collapses any run of whitespace — the original double-space
    # replaces missed runs of 3+ spaces and could yield empty words
    bits = s.split()
    if not bits:
        # fix: the original raised IndexError on empty/whitespace-only input
        return ""
    start = 0

    if bits[0] in ["*", "★", "-", "—", "–", ">", "."]:
        if len(bits) == 1:
            return ""
        start = 1

    # leading "(...)" that is not the sentence-final pair: skip past the ")"
    if (bits[start][0] == "(") and (bits[-1][-1] != ")"):
        for i in range(len(bits)):
            if bits[i][-1] == ")":
                start = i + 1
                break
        if start >= len(bits):
            # fix: ")" closed the last word; original indexed past the list
            return ""

    # walk over consecutive ALL-CAPS words; start ends on the last one
    i = start
    while i < len(bits):
        if bits[i].isupper():
            start = i
            i += 1
        else:
            break

    # drop that last ALL-CAPS word too if the next word looks like the
    # actual sentence start (capitalised)
    if (len(bits[start]) > 1) and bits[start].isupper() and (start + 1 < len(bits)):
        if bits[start + 1][0].isupper():
            start += 1

    return " ".join(bits[start:])
|
|
|
|
|
|
|
|
|
def punctIt(s):
    """Normalize paired and sentence-final punctuation of sentence *s*.

    Removes a pair of quotes/brackets that wraps the whole sentence,
    deletes unbalanced quote/bracket characters, guarantees the sentence
    ends in a punctuation mark, and collapses runs of spaces.  Returns ""
    when the sentence contains no alphabetic character.
    """
    doubleQuotes = s.count('"')
    singleQuotes = s.count("'")
    openParentes, closParentes = s.count("("), s.count(")")
    openBrackets, closBrackets = s.count("["), s.count("]")
    openCurBrace, closCurBrace = s.count("{"), s.count("}")
    openAligator, closAligator = s.count("<"), s.count(">")

    # whole-sentence wrapping pair -> strip it (count checks run first, so
    # s is guaranteed non-empty before s[0]/s[-1] are touched)
    if ((doubleQuotes == 2) and (s[0] == '"') and (s[-1] == '"')) or \
       ((singleQuotes == 2) and (s[0] == "'") and (s[-1] == "'")) or \
       ((openParentes == 1) and (closParentes == 1) and (s[0] == "(") and (s[-1] == ")")) or \
       ((openBrackets == 1) and (closBrackets == 1) and (s[0] == "[") and (s[-1] == "]")) or \
       ((openCurBrace == 1) and (closCurBrace == 1) and (s[0] == "{") and (s[-1] == "}")) or \
       ((openAligator == 1) and (closAligator == 1) and (s[0] == "<") and (s[-1] == ">")):
        S = s[1:-1]
    else:
        S = s

    # unbalanced pairs: delete every occurrence of both characters
    if doubleQuotes % 2 != 0:
        S = S.replace('"', '')
    if singleQuotes % 2 != 0:
        S = S.replace("'", "")
    if openParentes != closParentes:
        S = S.replace("(", "").replace(")", "")
    if openBrackets != closBrackets:
        S = S.replace("[", "").replace("]", "")
    if openCurBrace != closCurBrace:
        S = S.replace("{", "").replace("}", "")
    if openAligator != closAligator:
        S = S.replace("<", "").replace(">", "")

    if (S == "") or not any(c.isalpha() for c in S):
        return ""
    if S.endswith("..") and not S.endswith("..."):
        S = S[:-2] + "."          # ".." -> "." (real ellipses "..." are kept)
    elif S.endswith((":.", ";.")):
        S = S[:-2] + "."
    elif S[-1] not in [".", "!", "?", ":", ";"]:
        if (S[-1] in ["'", '"', ")", "]", "}", ">"]) and (S[-2] in [".", "!", "?", ":", ";"]):
            # swap so the sentence ends with the punctuation mark rather
            # than the closing quote/bracket
            S = S[:-2] + S[-1] + S[-2]
        else:
            S = S + "."

    # fix: the original chained replace(" ", " ") calls were no-ops; this
    # actually collapses any run of spaces to a single space
    while "  " in S:
        S = S.replace("  ", " ")
    return S
|
|
|
|
|
|
|
|
|
def desambIt(token, bits, i, lastField, s, SID, tokens):
    """Disambiguate words that may or may not be contractions.

    Forms such as "nos" ("em"+"os" vs. clitic pronoun), "consigo"
    ("com"+"si" vs. the verb form of "conseguir"), "pra", "pela(s)" and
    "pelo(s)" ("por"+article vs. noun/verb readings) are split only when
    the surrounding words — looked up in the module-level `lex` lexicon —
    favour the contraction reading.  Results are appended to `tokens` as
    either a single [form, lastField] entry, or a multiword entry tagged
    "c"+lastField followed by its two component words (case-matched to
    the original token).

    token     -- surface form to disambiguate (one of the forms above)
    bits      -- all whitespace-separated words of the sentence
    i         -- index of `token` within `bits`
    lastField -- MISC value for the token ("_" or "SpaceAfter=No")
    s, SID    -- full sentence text and its id (currently unused here)
    tokens    -- output list of [form, misc] pairs, extended in place
    """
    def stripWord(w):
        # Strip leading/trailing non-alphabetic characters and lowercase,
        # so punctuation glued to a neighbour does not break the lookup.
        start, end = 0, len(w)
        for j in range(len(w)):
            if (not w[j].isalpha()):
                start = j+1
            else:
                break
        for j in range(start,len(w)):
            if (not w[j].isalpha()):
                end = j
                break
        return w[start:end].lower()

    # --- "nos": contraction "em"+"os" vs. enclitic pronoun ---------------
    if (token.lower() == "nos"):
        # NOTE(review): preVERB is computed but never used in the decision
        # below — possibly a leftover from an earlier rule.
        if (i > 0):
            preVERB = lex.pexists(stripWord(bits[i-1]), "VERB") or lex.pexists(stripWord(bits[i-1]), "AUX")
        else:
            preVERB = False
        if (i < len(bits)-1):
            posVERB = lex.pexists(stripWord(bits[i+1]), "VERB") or lex.pexists(stripWord(bits[i+1]), "AUX")
            posNOUNDET = lex.pexists(stripWord(bits[i+1]), "NOUN") or lex.pexists(stripWord(bits[i+1]), "ADJ") or lex.pexists(stripWord(bits[i+1]), "DET")
            if (posNOUNDET):
                # feats[2] presumably holds the FEATS string of a lexicon
                # analysis — TODO confirm against lexikon.UDlexPT.
                possible = lex.pget(stripWord(bits[i+1]), "NOUN")+lex.pget(stripWord(bits[i+1]), "ADJ")+lex.pget(stripWord(bits[i+1]), "DET")
                agree = False
                for feats in possible:
                    # accept only if some analysis is neither singular nor
                    # feminine, i.e. compatible with masculine plural "os"
                    if ("Number=Sing" not in feats[2]) and ("Gender=Fem" not in feats[2]):
                        agree = True
                        break
                if (not agree):
                    posNOUNDET = False
        else:
            posVERB = False
            posNOUNDET = False
        if (posVERB and not posNOUNDET):
            # pronoun reading: keep "nos" as a single token
            tokens.append([token, lastField])
        else:
            # contraction reading: multiword "nos" = "em" + "os"
            tokens.append([token, "c"+lastField])
            if (token.isupper()):
                tokens.append(["EM","_"])
                tokens.append(["OS","_"])
            elif (token[0].isupper()):
                tokens.append(["Em","_"])
                tokens.append(["os","_"])
            else:
                tokens.append(["em","_"])
                tokens.append(["os","_"])

    # --- "consigo": "com"+"si" vs. 1sg of "conseguir" --------------------
    elif (token.lower() == "consigo"):
        if (i > 0):
            prePRONADV = lex.pexists(stripWord(bits[i-1]), "PRON") or lex.pexists(stripWord(bits[i-1]), "ADV")
        else:
            prePRONADV = False
        if (i < len(bits)-1):
            posVERB = lex.pexists(stripWord(bits[i+1]), "VERB") or lex.pexists(stripWord(bits[i+1]), "AUX")
        else:
            posVERB = False
        if (i < len(bits)-2):
            # "consigo do que ..." / "consigo sua ..." favour the pronoun
            # reading; NOTE(review): the "sua" test ignores bits[i+2].
            doQue = ((bits[i+1] == "do") and (bits[i+2] == "que")) or ((bits[i+1] == "sua"))
        else:
            doQue = False
        if ((prePRONADV) or (posVERB)) and (not doQue):
            tokens.append([token, lastField])
        else:
            tokens.append([token, "c"+lastField])
            if (token.isupper()):
                tokens.append(["COM","_"])
                tokens.append(["SI","_"])
            elif (token[0].isupper()):
                tokens.append(["Com","_"])
                tokens.append(["si","_"])
            else:
                tokens.append(["com","_"])
                tokens.append(["si","_"])

    # --- "pra": split as "para"+"a" only before a fem.sg. nominal --------
    elif (token.lower() == "pra"):
        if (i < len(bits)-1):
            posNOUNDET = lex.pexists(stripWord(bits[i+1]), "NOUN") or lex.pexists(stripWord(bits[i+1]), "ADJ") or lex.pexists(stripWord(bits[i+1]), "DET")
            if (posNOUNDET):
                possible = lex.pget(stripWord(bits[i+1]), "NOUN")+lex.pget(stripWord(bits[i+1]), "ADJ")+lex.pget(stripWord(bits[i+1]), "DET")
                agree = False
                for feats in possible:
                    # needs an analysis compatible with feminine singular "a"
                    if ("Number=Plur" not in feats[2]) and ("Gender=Masc" not in feats[2]):
                        agree = True
                        break
                if (not agree):
                    posNOUNDET = False
        else:
            posNOUNDET = False
        if (posNOUNDET):
            tokens.append([token, "c"+lastField])
            if (token.isupper()):
                tokens.append(["PARA","_"])
                tokens.append(["A","_"])
            elif (token[0].isupper()):
                tokens.append(["Para","_"])
                tokens.append(["a","_"])
            else:
                tokens.append(["para","_"])
                tokens.append(["a","_"])
        else:
            # bare preposition "pra" = "para"
            tokens.append([token, lastField])

    # --- "pela": "por"+"a" before a nominal/number/proper noun -----------
    elif (token.lower() == "pela"):
        if (i < len(bits)-1):
            posNOUNDET = lex.pexists(stripWord(bits[i+1]), "NOUN") or lex.pexists(stripWord(bits[i+1]), "ADJ") or lex.pexists(stripWord(bits[i+1]), "NUM") or lex.pexists(stripWord(bits[i+1]), "DET")
            properNOUNDIGIT = bits[i+1][0].isupper() or bits[i+1][0].isnumeric()
        else:
            posNOUNDET = False
            properNOUNDIGIT = False
        if (posNOUNDET) or (properNOUNDIGIT):
            tokens.append([token, "c"+lastField])
            if (token.isupper()):
                tokens.append(["POR","_"])
                tokens.append(["A","_"])
            elif (token[0].isupper()):
                tokens.append(["Por","_"])
                tokens.append(["a","_"])
            else:
                tokens.append(["por","_"])
                tokens.append(["a","_"])
        else:
            # otherwise e.g. noun "pela" or verb form — keep whole
            tokens.append([token, lastField])

    # --- "pelas": same rule as "pela", plural components -----------------
    elif (token.lower() == "pelas"):
        if (i < len(bits)-1):
            posNOUNDET = lex.pexists(stripWord(bits[i+1]), "NOUN") or lex.pexists(stripWord(bits[i+1]), "ADJ") or lex.pexists(stripWord(bits[i+1]), "NUM") or lex.pexists(stripWord(bits[i+1]), "DET")
            properNOUNDIGIT = bits[i+1][0].isupper() or bits[i+1][0].isnumeric()
        else:
            posNOUNDET = False
            properNOUNDIGIT = False
        if (posNOUNDET) or (properNOUNDIGIT):
            tokens.append([token, "c"+lastField])
            if (token.isupper()):
                tokens.append(["POR","_"])
                tokens.append(["AS","_"])
            elif (token[0].isupper()):
                tokens.append(["Por","_"])
                tokens.append(["as","_"])
            else:
                tokens.append(["por","_"])
                tokens.append(["as","_"])
        else:
            tokens.append([token, lastField])

    # --- "pelo": noun "pelo" (hair) after an article vs. "por"+"o" -------
    elif (token.lower() == "pelo"):
        if (i > 0):
            preART = lex.pexists(stripWord(bits[i-1]), "DET")
            if (preART):
                possible = lex.pget(stripWord(bits[i-1]), "DET")
                agree = False
                for feats in possible:
                    # preceding determiner must allow masculine singular
                    if ("Number=Plur" not in feats[2]) and ("Gender=Fem" not in feats[2]):
                        agree = True
                        break
                if (not agree):
                    preART = False
            else:
                # words that look determiner-like to this rule even though
                # the lexicon says otherwise are excluded explicitly
                preART = (stripWord(bits[i-1]) != "que") and (stripWord(bits[i-1]) != "dado") and (stripWord(bits[i-1]) != "tanto") and (stripWord(bits[i-1]) != "quanto") and (stripWord(bits[i-1]) != "mais")
        else:
            preART = False
        if (i < len(bits)-1):
            posNOUNDET = lex.pexists(stripWord(bits[i+1]), "NOUN") or lex.pexists(stripWord(bits[i+1]), "ADJ") or lex.pexists(stripWord(bits[i+1]), "DET")
            posLower = not bits[i+1][0].isupper()
            if (posNOUNDET):
                possible = lex.pget(stripWord(bits[i+1]), "NOUN")+lex.pget(stripWord(bits[i+1]), "ADJ")+lex.pget(stripWord(bits[i+1]), "DET")
                agree = False
                for feats in possible:
                    if ("Number=Plur" not in feats[2]) and ("Gender=Fem" not in feats[2]):
                        agree = True
                        break
                if (not agree):
                    posNOUNDET = False
        else:
            posNOUNDET = False
            posLower = True
        if (preART) and (not posNOUNDET) and (posLower):
            # noun reading ("o pelo ...")
            tokens.append([token, lastField])
        else:
            tokens.append([token, "c"+lastField])
            if (token.isupper()):
                tokens.append(["POR","_"])
                tokens.append(["O","_"])
            elif (token[0].isupper()):
                tokens.append(["Por","_"])
                tokens.append(["o","_"])
            else:
                tokens.append(["por","_"])
                tokens.append(["o","_"])

    # --- "pelos": plural analogue of "pelo" ------------------------------
    elif (token.lower() == "pelos"):
        if (i > 0):
            preART = lex.pexists(stripWord(bits[i-1]), "DET")
            if (preART):
                possible = lex.pget(stripWord(bits[i-1]), "DET")
                agree = False
                for feats in possible:
                    # NOTE(review): this tests Number=Sing and requires
                    # PronType=Art, unlike the "pelo" branch (Number=Plur,
                    # no PronType test) — asymmetry may be deliberate tuning;
                    # confirm before unifying.
                    if ("Number=Sing" not in feats[2]) and ("Gender=Fem" not in feats[2]) and ("PronType=Art" in feats[2]):
                        agree = True
                        break
                if (not agree):
                    preART = False
            else:
                preART = (stripWord(bits[i-1]) != "que") and (stripWord(bits[i-1]) != "dado") and (stripWord(bits[i-1]) != "tanto") and (stripWord(bits[i-1]) != "quanto") and (stripWord(bits[i-1]) != "mais")
        else:
            preART = False
        if (i < len(bits)-1):
            posNOUNDET = lex.pexists(stripWord(bits[i+1]), "NOUN") or lex.pexists(stripWord(bits[i+1]), "ADJ") or lex.pexists(stripWord(bits[i+1]), "DET")
            posLower = not bits[i+1][0].isupper()
            if (posNOUNDET):
                possible = lex.pget(stripWord(bits[i+1]), "NOUN")+lex.pget(stripWord(bits[i+1]), "ADJ")+lex.pget(stripWord(bits[i+1]), "DET")
                agree = False
                for feats in possible:
                    if ("Number=Sing" not in feats[2]) and ("Gender=Fem" not in feats[2]) and ("PronType=Art" in feats[2]):
                        agree = True
                        break
                if (not agree):
                    posNOUNDET = False
        else:
            posNOUNDET = False
            posLower = True
        if (preART) and (not posNOUNDET) and (posLower):
            tokens.append([token, lastField])
        else:
            tokens.append([token, "c"+lastField])
            if (token.isupper()):
                tokens.append(["POR","_"])
                tokens.append(["OS","_"])
            elif (token[0].isupper()):
                tokens.append(["Por","_"])
                tokens.append(["os","_"])
            else:
                tokens.append(["por","_"])
                tokens.append(["os","_"])
|
|
|
|
|
|
|
|
|
def tokenizeIt(s, SID, outfile):
    """Tokenize sentence *s* and write it to *outfile* in CoNLL-U format.

    Emits the "# sent_id" / "# text" comment lines, then one numbered line
    per token; contractions and clitic verb forms additionally get a
    multiword-token range line ("N-M") covering their two components.
    Returns the number of word tokens written (range lines not counted).
    """
    # fix: "." was missing from this list, so sentence-final periods stayed
    # glued to words and the ellipsis-merging code below was unreachable
    removable = ["'", '"', "(", ")", "[", "]", "{", "}", "<", ">",
                 "!", "?", ",", ";", ":", "=", "+", "*", "★", "|", "/", "\\",
                 "&", "^", "_", "`", "~", "%", "."]
    digits = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
    # contraction -> its two component words (lowercase forms)
    contracts = {
        # a + article / demonstrative
        "à": ["a", "a"], "às": ["a", "as"], "ao": ["a", "o"], "aos": ["a", "os"],
        "àquela": ["a", "aquela"], "àquelas": ["a", "aquelas"],
        "àquele": ["a", "aquele"], "àqueles": ["a", "aqueles"],
        # com + pronoun
        "comigo": ["com", "mim"], "contigo": ["com", "ti"], "consigo": ["com", "si"],
        "conosco": ["com", "nós"], "convosco": ["com", "vós"],
        # de + article / adverb / demonstrative / pronoun
        "da": ["de", "a"], "das": ["de", "as"], "do": ["de", "o"], "dos": ["de", "os"],
        "dali": ["de", "ali"], "daqui": ["de", "aqui"], "daí": ["de", "aí"],
        "desta": ["de", "esta"], "destas": ["de", "estas"],
        "deste": ["de", "este"], "destes": ["de", "estes"],
        "dessa": ["de", "essa"], "dessas": ["de", "essas"],
        "desse": ["de", "esse"], "desses": ["de", "esses"],
        "daquela": ["de", "aquela"], "daquelas": ["de", "aquelas"],
        "daquele": ["de", "aquele"], "daqueles": ["de", "aqueles"],
        "disto": ["de", "isto"], "disso": ["de", "isso"], "daquilo": ["de", "aquilo"],
        "dela": ["de", "ela"], "delas": ["de", "elas"],
        "dele": ["de", "ele"], "deles": ["de", "eles"],
        "doutra": ["de", "outra"], "doutras": ["de", "outras"],
        "doutro": ["de", "outro"], "doutros": ["de", "outros"],
        "dum": ["de", "um"], "duns": ["de", "uns"],
        "duma": ["de", "uma"], "dumas": ["de", "umas"],
        # em + article / demonstrative / pronoun
        "na": ["em", "a"], "nas": ["em", "as"], "no": ["em", "o"], "nos": ["em", "os"],
        "nesta": ["em", "esta"], "nestas": ["em", "estas"],
        "neste": ["em", "este"], "nestes": ["em", "estes"],
        "nessa": ["em", "essa"], "nessas": ["em", "essas"],
        "nesse": ["em", "esse"], "nesses": ["em", "esses"],
        "naquela": ["em", "aquela"], "naquelas": ["em", "aquelas"],
        "naquele": ["em", "aquele"], "naqueles": ["em", "aqueles"],
        "nisto": ["em", "isto"], "nisso": ["em", "isso"], "naquilo": ["em", "aquilo"],
        "nela": ["em", "ela"], "nelas": ["em", "elas"],
        "nele": ["em", "ele"], "neles": ["em", "eles"],
        "noutra": ["em", "outra"], "noutras": ["em", "outras"],
        "noutro": ["em", "outro"], "noutros": ["em", "outros"],
        "num": ["em", "um"], "nuns": ["em", "uns"],
        "numa": ["em", "uma"], "numas": ["em", "umas"],
        # por + article
        "pela": ["por", "a"], "pelas": ["por", "as"],
        "pelo": ["por", "o"], "pelos": ["por", "os"],
        # para + article
        "pra": ["para", "a"], "pras": ["para", "as"],
        "pro": ["para", "o"], "pros": ["para", "os"],
        "prum": ["para", "um"], "pruns": ["para", "uns"],
        "pruma": ["para", "uma"], "prumas": ["para", "umas"]}
    # forms whose split depends on context — handled by desambIt()
    ambigous = ["nos", "consigo", "pra", "pela", "pelas", "pelo", "pelos"]
    # clitic pronouns that attach with a hyphen
    enclisis = ['me', 'te', 'se', 'lhe', 'o', 'a', 'nos', 'vos', 'lhes', 'os', 'as', 'lo', 'la', 'los', 'las']
    # future/conditional endings used in mesoclisis ("dar-se-ia")
    terminations = ["ia", "ias", "as", "iamos", "ieis", "iam", "ei", "as", "a", "emos", "eis", "ão"]

    tokens = []
    bits = s.split(" ")
    k = 0
    for b in bits:
        if b == "":
            # fix: consecutive spaces used to produce empty tokens
            k += 1
            continue

        # peel punctuation off the front ("$" only before a digit, "-" only
        # before a non-digit, so prices and negative numbers stay intact)
        pre = []
        changed = True
        while changed and len(b) > 1:
            changed = False
            if (b[0] in removable) or ((b[0] == "$") and (b[1] in digits)) or ((b[0] == "-") and (b[1] not in digits)):
                pre.append(b[0])
                b = b[1:]
                changed = True

        # peel punctuation off the end (collected right-to-left)
        tmp = []
        changed = True
        while changed and len(b) > 1:
            changed = False
            if b[-1] in removable + ["-"]:
                tmp.append(b[-1])
                b = b[:-1]
                changed = True

        # restore sentence order, merging runs of three dots into "..."
        pos = []
        reticent = ""
        for i in range(len(tmp) - 1, -1, -1):
            if tmp[i] == ".":
                if reticent == "":
                    reticent = "."
                elif reticent == ".":
                    reticent = ".."
                elif reticent == "..":
                    pos.append("...")
                    reticent = ""
            else:
                if reticent != "":
                    pos.append(reticent)
                    reticent = ""
                pos.append(tmp[i])
        if reticent != "":
            pos.append(reticent)

        # hyphen handling: enclisis/mesoclisis become a "*^*"-marked
        # multiword followed by the reconstructed verb and the clitic
        buf = b.split("-")
        if len(buf) == 1:
            parts = pre + [b] + pos
        elif (len(buf) == 2) and (buf[1] in enclisis):
            # a stressed final vowel marks a truncated infinitive
            # ("vendê-lo" -> "vender" + "lo")
            if buf[0][-1] == "á":
                parts = pre + ["*^*" + b, buf[0][:-1] + "ar", buf[1]] + pos
            elif buf[0][-1] == "ê":
                parts = pre + ["*^*" + b, buf[0][:-1] + "er", buf[1]] + pos
            elif buf[0][-1] == "í":
                parts = pre + ["*^*" + b, buf[0][:-1] + "ir", buf[1]] + pos
            elif buf[0][-1] == "ô":
                parts = pre + ["*^*" + b, buf[0][:-1] + "or", buf[1]] + pos
            else:
                parts = pre + ["*^*" + b, buf[0], buf[1]] + pos
        elif (len(buf) == 3) and (buf[1] in enclisis) \
                and (buf[0][-1] == "r") and (buf[2] in terminations):
            # mesoclisis: "dar-se-ia" -> "daria" + "se"
            # fix: original indexed buf[3] on a 3-element list (IndexError)
            # and tested membership in [terminations] (always False)
            parts = pre + ["*^*" + b, buf[0] + buf[2], buf[1]] + pos
        elif (len(buf) == 3) and (buf[1] in enclisis) \
                and (buf[0][-1] in ["á", "ê", "í", "ô"]) and (buf[2] in terminations):
            # mesoclisis with truncated stressed stem: "fá-lo-ei" -> "farei" + "lo"
            if buf[0][-1] == "á":
                parts = pre + ["*^*" + b, buf[0][:-1] + "ar" + buf[2], buf[1]] + pos
            elif buf[0][-1] == "ê":
                parts = pre + ["*^*" + b, buf[0][:-1] + "er" + buf[2], buf[1]] + pos
            elif buf[0][-1] == "í":
                parts = pre + ["*^*" + b, buf[0][:-1] + "ir" + buf[2], buf[1]] + pos
            else:
                parts = pre + ["*^*" + b, buf[0][:-1] + "or" + buf[2], buf[1]] + pos
        else:
            # ordinary hyphenated word ("guarda-chuva"): keep whole
            parts = pre + [b] + pos

        i = 0
        while i < len(parts):
            # SpaceAfter=No on every piece except the last of this word
            lastField = "_" if i == len(parts) - 1 else "SpaceAfter=No"
            if parts[i][:3] == "*^*":
                # clitic multiword: range entry plus its two components
                if i + 3 == len(parts):
                    tokens.append([parts[i][3:], "c_"])
                else:
                    tokens.append([parts[i][3:], "cSpaceAfter=No"])
                i += 1
                tokens.append([parts[i], "_"])
                i += 1
                tokens.append([parts[i], "_"])
            elif parts[i] not in ambigous:
                ans = contracts.get(parts[i].lower())
                if ans is None:
                    tokens.append([parts[i], lastField])
                else:
                    # unambiguous contraction: multiword + components,
                    # matching the original token's capitalisation
                    tokens.append([parts[i], "c" + lastField])
                    if parts[i].isupper():
                        tokens.append([ans[0].upper(), "_"])
                        tokens.append([ans[1].upper(), "_"])
                    elif parts[i][0].isupper():
                        tokens.append([ans[0][0].upper() + ans[0][1:], "_"])
                        tokens.append([ans[1], "_"])
                    else:
                        tokens.append([ans[0], "_"])
                        tokens.append([ans[1], "_"])
            else:
                desambIt(parts[i], bits, k, lastField, s, SID, tokens)
            i += 1
        k += 1

    print("# sent_id =", SID, file=outfile)
    print("# text =", s, file=outfile)

    toks = 0
    for form, misc in tokens:
        if misc[0] != "c":
            toks += 1
            print(toks, form, "_", "_", "_", "_", "_", "_", "_", misc, sep="\t", file=outfile)
        else:
            # "c"-tagged entries are multiword tokens: emit a range line
            # covering the next two word lines, without consuming an id
            print("{}-{}".format(toks + 1, toks + 2), form, "_", "_", "_", "_", "_", "_", "_", misc[1:], sep="\t", file=outfile)
    print(file=outfile)
    return toks
|
|
|
|
|
|
|
|
|
def dealWith(outfile, sent, SID, match, trim):
    """Clean one raw sentence and, if anything is left, tokenize it.

    Applies trimIt() when `trim` is set and punctIt() when `match` is set,
    then writes the tokenized sentence (id `SID`) to `outfile`.

    Returns (1, token_count) when a sentence was written, (0, 0) when the
    cleaning steps discarded it entirely.
    """
    if trim:
        sent = trimIt(sent)
    if match:
        sent = punctIt(sent)
    if sent == "":
        return 0, 0
    return 1, tokenizeIt(sent, SID, outfile)
|
|
|
|
|
|
|
|
|
def portTok():
    """Command-line driver: parse options, tokenize the input file line by
    line, and report totals.

    With no arguments, assumes 'sents.txt' as input, 'sents.conllu' as
    output, matched-punctuation fixing, headline trimming and sid model
    S0000 — exactly what the printed message announces.
    """
    if len(sys.argv) == 1:
        # fix: defaults used to be ["lix.conllu", "sents2.txt", ...],
        # contradicting the message printed just below
        arguments = ["sents.conllu", "sents.txt", True, True, "S0000"]
        print("Assumindo default: 'sents.conllu' como arquivo de saída, 'sents.txt' como arquivo de entrada, correções, remoções e S0000 como sid.")
    else:
        arguments = parseOptions(sys.argv)
    if arguments is not None:
        if arguments[0] == "":
            print("Assumindo 'sents.conllu' como arquivo de saída")
            arguments[0] = 'sents.conllu'
        if arguments[1] == []:
            # parseOptions never found an input file
            print("Arquivo de entrada inválido - por favor corrija e tente novamente")
        else:
            SID = arguments[4]
            sTOTAL, tTOTAL = 0, 0
            # context managers guarantee the files are closed on error
            with open(arguments[0], "w") as outfile, open(arguments[1], "r") as infile:
                print("# newdoc id = {}\n# newpar".format(arguments[0]), file=outfile)
                for line in infile:
                    SID = nextName(SID)
                    # rstrip("\n") instead of line[:-1]: a final line without
                    # a newline no longer loses its last character
                    s, t = dealWith(outfile, line.rstrip("\n"), SID, arguments[2], arguments[3])
                    if s == 1:
                        sTOTAL += 1
                        tTOTAL += t
            print("Tokenização terminada com {} sentenças extraídas ({} tokens) e salvas em {}".format(sTOTAL, tTOTAL, arguments[0]))
    else:
        print("Problemas com parâmetros - por favor corrija e tente novamente")
|
|
|
# fix: run the CLI only when executed as a script — importing this module
# previously triggered a full tokenization run as a side effect.
if __name__ == "__main__":
    portTok()
|
|