import lftk
import spacy

from lng.L2SCA.analyzeText import sca
from lng.lca.lc_anc import lca
# Shared spaCy pipeline, loaded once when this module is imported and reused
# by extract_lftk for every call.
nlp = spacy.load("en_core_web_sm")
def extract_lingfeat(text):
    """Return the LingFeat feature values computed for ``text`` as a list.

    The text is handed to ``lingfeat.extractor.pass_text``, preprocessed, and
    then each enabled feature group is computed in a fixed order. The group
    results are merged into one dict with ``dict.update`` and only the values
    are returned, in the dict's insertion order.

    Args:
        text: The raw text to analyse.

    Returns:
        A list with the values of the merged feature dict.
    """
    # lingfeat is imported here rather than at module level, so it is only
    # needed when this function is actually called.
    from lingfeat import extractor

    ling = extractor.pass_text(text)
    ling.preprocess()

    # Order matters: it determines the position of each value in the result.
    feature_groups = (
        ling.WoKF_,  # Wikipedia Knowledge Features
        ling.WBKF_,  # WeeBit Corpus Knowledge Features
        ling.OSKF_,  # OneStopEng Corpus Knowledge Features
        # Discourse (Disco) Features
        ling.EnDF_,  # Entity Density Features
        ling.EnGF_,  # Entity Grid Features
        # Syntactic (Synta) Features
        # NOTE: PhrF_ (phrasal features) and TrSF_ (parse tree structural
        # features) are intentionally not called; they were disabled with the
        # remark "logging stanza".
        ling.POSF_,  # Noun/Verb/Adj/Adv/... Part-of-Speech Features
        # Lexico Semantic (LxSem) Features
        ling.TTRF_,  # Type Token Ratio Features
        ling.VarF_,  # Noun/Verb/Adj/Adv Variation Features
        ling.PsyF_,  # Psycholinguistic Difficulty of Words (AoA Kuperman)
        ling.WorF_,  # Word Familiarity from Frequency Count (SubtlexUS)
        # Shallow Traditional (ShTra) Features
        ling.ShaF_,  # Shallow Features (e.g. avg number of tokens)
        ling.TraF_,  # Traditional Formulas
    )

    features = {}
    for compute_group in feature_groups:
        features.update(compute_group())
    return list(features.values())
def extract_lftk(text):
    """Return the LFTK feature values for ``text`` as a list.

    Args:
        text: The raw text to analyse.

    Returns:
        For the empty string, a list of 220 float zeros (neither spaCy nor
        LFTK is invoked). Otherwise, the values of the dict returned by
        ``lftk.Extractor(doc).extract()``, where ``doc`` is ``text`` parsed
        with the module-level ``nlp`` pipeline.
    """
    if text == '':
        # Placeholder vector for empty input.
        # NOTE(review): 220 presumably matches the number of values
        # extract() yields for non-empty text -- confirm against LFTK.
        return [0.] * 220

    doc = nlp(text)
    extracted = lftk.Extractor(doc).extract()
    return list(extracted.values())
def compute_lng(text, shortcut=False):
    """Build the combined feature vector for ``text``.

    The result is the ``+`` concatenation of three parts, in this order:
    the output of ``lca(text)``, the output of ``sca(text)`` (or a
    placeholder), and the output of ``extract_lftk(text)``.

    Args:
        text: The raw text to analyse.
        shortcut: When true, ``sca`` is not called and a list of 23 integer
            zeros is used in its place.

    Returns:
        The concatenated features.
        NOTE(review): assumes ``lca`` and ``sca`` return lists so that ``+``
        concatenates them with the placeholder and LFTK lists -- confirm.
    """
    lca_feats = lca(text)
    sca_feats = [0] * 23 if shortcut else sca(text)
    # Named lftk_feats (not lftk) so the imported lftk module is not shadowed.
    lftk_feats = extract_lftk(text)
    return lca_feats + sca_feats + lftk_feats