LingConv / compute_lng.py
mohdelgaar's picture
Initial commit
20b7679
raw
history blame
1.89 kB
from lng.lca.lc_anc import lca
from lng.L2SCA.analyzeText import sca
import lftk
import spacy
# Shared spaCy English pipeline, loaded once when this module is imported;
# extract_lftk() below runs input text through it.
nlp = spacy.load("en_core_web_sm")
def extract_lingfeat(text):
    """Return the LingFeat feature values computed for ``text`` as a flat list.

    The text is handed to ``lingfeat.extractor.pass_text`` and preprocessed,
    then each enabled feature group is queried in a fixed order and merged
    into one dict.  Only the dict's values are returned (in insertion order);
    the feature names are discarded.
    """
    # Imported on call rather than at module import time (presumably because
    # lingfeat is a heavy/optional dependency -- confirm before hoisting).
    from lingfeat import extractor

    analyzer = extractor.pass_text(text)
    analyzer.preprocess()

    # Feature-group methods, in the order their results are merged.
    # PhrF_ (phrasal) and TrSF_ (parse-tree structural) are deliberately not
    # listed: the original author disabled them with the note "logging stanza".
    group_methods = (
        "WoKF_",  # Wikipedia Knowledge Features
        "WBKF_",  # WeeBit Corpus Knowledge Features
        "OSKF_",  # OneStopEng Corpus Knowledge Features
        # Discourse (Disco) Features
        "EnDF_",  # Entity Density Features
        "EnGF_",  # Entity Grid Features
        # Syntactic (Synta) Features
        "POSF_",  # Noun/Verb/Adj/Adv/... Part-of-Speech Features
        # Lexico Semantic (LxSem) Features
        "TTRF_",  # Type Token Ratio Features
        "VarF_",  # Noun/Verb/Adj/Adv Variation Features
        "PsyF_",  # Psycholinguistic Difficulty of Words (AoA Kuperman)
        "WorF_",  # Word Familiarity from Frequency Count (SubtlexUS)
        # Shallow Traditional (ShTra) Features
        "ShaF_",  # Shallow Features (e.g. avg number of tokens)
        "TraF_",  # Traditional Formulas
    )

    merged = {}
    for method_name in group_methods:
        merged.update(getattr(analyzer, method_name)())
    return list(merged.values())
def extract_lftk(text):
    """Return the LFTK feature values for ``text`` as a list.

    An empty string short-circuits to a list of 220 zeros without running
    spaCy or LFTK (220 is presumably the number of features ``extract()``
    yields by default -- TODO confirm against the installed lftk version).
    Any other input is parsed with the module-level ``nlp`` pipeline and the
    values of the dict returned by ``lftk.Extractor(...).extract()`` are
    returned in the dict's order.
    """
    # Only the exact empty string takes the shortcut; whitespace-only text
    # still goes through the full pipeline.
    if text != '':
        parsed = nlp(text)
        features = lftk.Extractor(parsed).extract()
        return list(features.values())
    return [0.] * 220
def compute_lng(text, shortcut=False):
    """Build the combined linguistic feature vector for ``text``.

    The result is the concatenation, in this order, of:

    1. the output of ``lca(text)``,
    2. the output of ``sca(text)`` -- or 23 zeros when ``shortcut`` is true,
       in which case ``sca`` is not called at all,
    3. the output of ``extract_lftk(text)``.

    The parts are joined with ``+``, so ``lca`` and ``sca`` are expected to
    return lists.
    """
    lca_part = lca(text)
    # The placeholder keeps the vector length fixed when sca is skipped
    # (23 is presumably the number of values sca returns -- TODO confirm).
    sca_part = [0] * 23 if shortcut else sca(text)
    # Named lftk_part rather than `lftk` so the imported lftk module is not
    # shadowed inside this function.
    lftk_part = extract_lftk(text)
    return lca_part + sca_part + lftk_part