import string
import random
from math import sqrt, log

import numpy as np
from nltk import word_tokenize, pos_tag, sent_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

from .anc_all_count import wordlist
# from .snli_freq import wordlist

lemmatizer = WordNetLemmatizer()

# Map the first letter of a Penn Treebank POS tag to the WordNet POS constant
# expected by WordNetLemmatizer (the lemmatization code below defaults to noun)
tag_dict = {
    "J": wordnet.ADJ,
    "N": wordnet.NOUN,
    "V": wordnet.VERB,
    "R": wordnet.ADV,
}
# Returns the keys of dictionary d sorted by their values (ascending),
# breaking ties by key
def sort_by_value(d):
    return [key for value, key in sorted((v, k) for k, v in d.items())]
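# For example (hypothetical frequencies), sort_by_value({"the": 300, "cat": 10, "sat": 5})
# returns ["sat", "cat", "the"]: the most frequent lemma ends up last, which is
# why the "top 2000" slice below is wordranks[-2000:].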
# NDW for the first z words in a sample
def getndwfirstz(z, lemmalist):
    return len(set(lemmalist[:z]))
# NDW expected for a random sample of z words, averaged over 10 trials
def getndwerz(z, lemmalist):
    ndwerz = 0
    for i in range(10):
        ndwerz += len(set(random.sample(lemmalist, z)))
    return ndwerz / 10.0
# NDW expected for a random contiguous sequence of z words, averaged over 10 trials
def getndwesz(z, lemmalist):
    ndwesz = 0
    for i in range(10):
        startword = random.randint(0, len(lemmalist) - z)
        ndwesz += len(set(lemmalist[startword:startword + z]))
    return ndwesz / 10.0
# MSTTR: mean segmental type-token ratio over successive z-word segments;
# any trailing partial segment is discarded
def getmsttr(z, lemmalist):
    samples = 0
    msttr = 0.0
    while len(lemmalist) >= z:
        samples += 1
        msttr += len(set(lemmalist[:z])) / float(z)
        lemmalist = lemmalist[z:]
    return msttr / samples
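# Worked example with z=2 on the (hypothetical) list ["a", "b", "a", "a", "c"]:
# segments ["a", "b"] and ["a", "a"] have TTRs 1.0 and 0.5, the trailing "c" is
# dropped as an incomplete segment, so getmsttr(2, ...) returns 0.75.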
def isLetterNumber(character):
    # printable characters that are neither punctuation nor whitespace
    return (character in string.printable
            and character not in string.punctuation
            and character not in string.whitespace)

def isSentence(line):
    # a line counts as a sentence if it contains at least one letter or digit
    return any(isLetterNumber(character) for character in line)
# Read lemma frequency information from the ANC word list imported above
adjdict = {}
verbdict = {}
noundict = {}
worddict = {}
wordlist = wordlist.split('\n')
# wordlistfile = open("anc_all_count.txt", "r")
# wordlist = wordlistfile.readlines()
# wordlistfile.close()
for word in wordlist:
    wordinfo = word.strip()
    if not wordinfo or "Total words" in wordinfo:
        continue
    # each line has four whitespace-separated columns: word form, lemma, POS, frequency
    infolist = wordinfo.split()
    lemma = infolist[1]
    pos = infolist[2]
    frequency = int(infolist[3])
    worddict[lemma] = worddict.get(lemma, 0) + frequency
    if pos[0] == "J":
        adjdict[lemma] = adjdict.get(lemma, 0) + frequency
    elif pos[0] == "V":
        verbdict[lemma] = verbdict.get(lemma, 0) + frequency
    elif pos[0] == "N":
        noundict[lemma] = noundict.get(lemma, 0) + frequency
wordranks = sort_by_value(worddict)
verbranks = sort_by_value(verbdict)
# The 2000 most frequent lemmas (the tail of the ascending ranking) count as
# "easy"; anything outside this set is treated as sophisticated below
top2000 = set(wordranks[-2000:])
# standard can be computed as the 20th percentile of word counts:
# lengths = [len(word_tokenize(x)) for x in data['train']['text']]
# standard = int(np.percentile(lengths, 20))
def lca(input_text, standard=5):
    # Lemmatize and POS-tag the input, encoding each token as "lemma_TAG"
    lemlines = sent_tokenize(input_text)
    for i in range(len(lemlines)):
        morph = pos_tag(word_tokenize(lemlines[i]))
        lemlines[i] = " ".join(
            "{}_{}".format(lemmatizer.lemmatize(word, tag_dict.get(tag[0], wordnet.NOUN)), tag)
            for word, tag in morph)
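    # For instance, "The cats sat." becomes roughly "The_DT cat_NNS sit_VBD ._."
    # (lines are lowercased later in the loop; exact tags depend on the NLTK tagger version)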
    # process input text: type inventories and token counts for all words (word*),
    # sophisticated words (sword*), lexical words (lex*), sophisticated lexical
    # words (slex*), and the individual lexical classes
    wordtypes = {}
    wordtokens = 0
    swordtypes = {}
    swordtokens = 0
    lextypes = {}
    lextokens = 0
    slextypes = {}
    slextokens = 0
    verbtypes = {}
    verbtokens = 0
    sverbtypes = {}
    adjtypes = {}
    adjtokens = 0
    advtypes = {}
    advtokens = 0
    nountypes = {}
    nountokens = 0
    lemmaposlist = []
    lemmalist = []
    for lemline in lemlines:
        lemline = lemline.strip().lower()
        if not isSentence(lemline):
            continue
        for lemma in lemline.split():
            # rsplit guards against word forms that themselves contain "_"
            word = lemma.rsplit("_", 1)[0]
            pos = lemma.rsplit("_", 1)[-1]
            # skip punctuation tags (including multi-character ones such as the
            # lowercased `` and '') and symbol tags
            if all(ch in string.punctuation for ch in pos) or pos in ("sent", "sym"):
                continue
            lemmaposlist.append(lemma)
            lemmalist.append(word)
            wordtokens += 1
            wordtypes[word] = 1
            # sophisticated words: outside the 2000 most frequent lemmas, excluding numerals
            if word not in top2000 and pos != "cd":
                swordtypes[word] = 1
                swordtokens += 1
if pos[0]=="n": | |
lextypes[word]=1 | |
nountypes[word]=1 | |
lextokens+=1 | |
nountokens+=1 | |
if not word in wordranks[-2000:]: | |
slextypes[word]=1 | |
slextokens+=1 | |
elif pos[0]=="j": | |
lextypes[word]=1 | |
adjtypes[word]=1 | |
lextokens+=1 | |
adjtokens+=1 | |
if not word in wordranks[-2000:]: | |
slextypes[word]=1 | |
slextokens+=1 | |
elif pos[0]=="r" and (word in adjdict or (word[-2:]=="ly" and word[:-2] in adjdict)): | |
lextypes[word]=1 | |
advtypes[word]=1 | |
lextokens+=1 | |
advtokens+=1 | |
if not word in wordranks[-2000:]: | |
slextypes[word]=1 | |
slextokens+=1 | |
elif pos[0]=="v" and not word in ["be","have"]: | |
verbtypes[word]=1 | |
verbtokens+=1 | |
lextypes[word]=1 | |
lextokens+=1 | |
if not word in wordranks[-2000:]: | |
sverbtypes[word]=1 | |
slextypes[word]=1 | |
slextokens+=1 | |
    # 1. lexical density: proportion of lexical tokens among all word tokens
    ld = float(lextokens) / max(wordtokens, 1)
    # 2. lexical sophistication
    # 2.1 LS1 (token-based) and LS2 (type-based)
    ls1 = slextokens / max(1, float(lextokens))
    ls2 = len(swordtypes) / max(float(len(wordtypes)), 1)
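    # With S = number of sophisticated verb types and V = number of verb tokens,
    # the next block computes VS1 = S / V, VS2 = S^2 / V, and CVS1 = S / sqrt(2 * V)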
    # 2.2 verb sophistication, guarded against texts with no lexical verbs
    if verbtokens == 0:
        vs1 = 0
        vs2 = 0
        cvs1 = 0
    else:
        vs1 = len(sverbtypes) / float(verbtokens)
        vs2 = (len(sverbtypes) * len(sverbtypes)) / float(verbtokens)
        cvs1 = len(sverbtypes) / sqrt(2 * verbtokens)
    # 3. lexical diversity (variation)
    # 3.1 NDW; "standard" controls the sample size (see the percentile recipe above)
    ndw = ndwz = ndwerz = ndwesz = len(wordtypes)
    if len(lemmalist) >= standard:
        ndwz = getndwfirstz(standard, lemmalist)
        ndwerz = getndwerz(standard, lemmalist)
        ndwesz = getndwesz(standard, lemmalist)
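    # With T = number of word types and N = number of word tokens, the block below
    # computes TTR = T / N, CTTR = T / sqrt(2 * N), RTTR = T / sqrt(N),
    # LogTTR = log(T) / log(N), and Uber = (log10 N)^2 / log10(N / T)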
    # 3.2 TTR and its length-corrected variants
    msttr = ttr = len(wordtypes) / max(float(wordtokens), 1)
    if len(lemmalist) >= standard:
        msttr = getmsttr(standard, lemmalist)
    cttr = len(wordtypes) / max(sqrt(2 * wordtokens), 1)
    rttr = len(wordtypes) / max(sqrt(wordtokens), 1)
    logttr = log(len(wordtypes)) / log(wordtokens) if wordtokens > 1 else 0
    if wordtokens == len(wordtypes):
        # every token is a new type, so the Uber denominator log10(N / T) would be zero
        uber = 0
    else:
        uber = (log(wordtokens, 10) * log(wordtokens, 10)) / log(wordtokens / float(len(wordtypes)), 10)
    # 3.3 verb diversity
    if verbtokens == 0:
        vv1, svv1, cvv1 = 0, 0, 0
    else:
        vv1 = len(verbtypes) / float(verbtokens)
        svv1 = len(verbtypes) * len(verbtypes) / float(verbtokens)
        cvv1 = len(verbtypes) / sqrt(2 * verbtokens)
    # 3.4 lexical word diversity
    lv = len(lextypes) / max(1, float(lextokens))
    vv2 = len(verbtypes) / max(1, float(lextokens))
    nv = len(nountypes) / max(1, float(nountokens))
    adjv = len(adjtypes) / max(1, float(lextokens))
    advv = len(advtypes) / max(1, float(lextokens))
    modv = (len(advtypes) + len(adjtypes)) / max(1, float(lextokens))
    # 33 measures: the four type counts, the four token counts, then
    # ld, ls1, ls2, vs1, vs2, cvs1, ndw, ndwz, ndwerz, ndwesz, ttr, msttr,
    # cttr, rttr, logttr, uber, lv, vv1, svv1, cvv1, vv2, nv, adjv, advv, modv
    output = [len(wordtypes), len(swordtypes), len(lextypes), len(slextypes),
              wordtokens, swordtokens, lextokens, slextokens,
              ld, ls1, ls2, vs1, vs2, cvs1, ndw, ndwz, ndwerz, ndwesz, ttr, msttr, cttr,
              rttr, logttr, uber, lv, vv1, svv1, cvv1, vv2, nv, adjv, advv, modv]
    return output
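# A minimal usage sketch, assuming the usual NLTK data packages are installed
# (nltk.download("punkt"), nltk.download("averaged_perceptron_tagger"),
# nltk.download("wordnet")) and that this file is imported as part of its
# package so the relative wordlist import above resolves:
#
#     from .lca import lca  # hypothetical module path
#     scores = lca("The quick brown fox jumps over the lazy dog.", standard=5)
#     ld = scores[8]  # lexical density, per the output order documented above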