from lng.lca.lc_anc import lca
from lng.L2SCA.analyzeText import sca
import lftk
import spacy

nlp = spacy.load("en_core_web_sm")


def extract_lingfeat(text):
    from lingfeat import extractor

    LingFeat = extractor.pass_text(text)
    LingFeat.preprocess()
    d = {}
    # Advanced Semantic (AdSem) Features
    d.update(LingFeat.WoKF_())  # Wikipedia Knowledge Features
    d.update(LingFeat.WBKF_())  # WeeBit Corpus Knowledge Features
    d.update(LingFeat.OSKF_())  # OneStopEng Corpus Knowledge Features
    # Discourse (Disco) Features
    d.update(LingFeat.EnDF_())  # Entity Density Features
    d.update(LingFeat.EnGF_())  # Entity Grid Features
    # Syntactic (Synta) Features
    # d.update(LingFeat.PhrF_())  # Noun/Verb/Adj/Adv/... Phrasal Features (disabled: noisy stanza logging)
    # d.update(LingFeat.TrSF_())  # (Parse) Tree Structural Features (disabled: noisy stanza logging)
    d.update(LingFeat.POSF_())  # Noun/Verb/Adj/Adv/... Part-of-Speech Features
    # Lexico Semantic (LxSem) Features
    d.update(LingFeat.TTRF_())  # Type Token Ratio Features
    d.update(LingFeat.VarF_())  # Noun/Verb/Adj/Adv Variation Features
    d.update(LingFeat.PsyF_())  # Psycholinguistic Difficulty of Words (AoA Kuperman)
    d.update(LingFeat.WorF_())  # Word Familiarity from Frequency Count (SubtlexUS)
    # Shallow Traditional (ShTra) Features
    d.update(LingFeat.ShaF_())  # Shallow Features (e.g. avg number of tokens)
    d.update(LingFeat.TraF_())  # Traditional Formulas
    return list(d.values())


def extract_lftk(text):
    # LFTK extracts 220 handcrafted features; for empty input, which spaCy
    # cannot usefully process, return a zero vector of the same length.
    if text == '':
        return [0.] * 220
    doc = nlp(text)
    LFTK = lftk.Extractor(doc)
    feats = LFTK.extract()
    return list(feats.values())


def compute_lng(text, shortcut=False):
    lca_feats = lca(text)
    if shortcut:
        # Skip the (slow) L2SCA parse and pad with zeros for its 23 measures
        # (9 structure counts + 14 syntactic complexity ratios).
        sca_feats = [0] * 23
    else:
        sca_feats = sca(text)
    # Named lftk_feats to avoid shadowing the imported lftk module.
    lftk_feats = extract_lftk(text)
    all_feats = lca_feats + sca_feats + lftk_feats
    return all_feats
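

# --- Usage sketch (illustrative addition, not part of the original module) ---
# A minimal example of calling compute_lng on a sample sentence. It assumes
# the repo-local lng.lca / lng.L2SCA helpers are importable and the spaCy
# model "en_core_web_sm" is installed; the sample text is hypothetical.
if __name__ == "__main__":
    sample = "The quick brown fox jumps over the lazy dog."
    # shortcut=True zero-pads the L2SCA measures instead of running the parser.
    feats = compute_lng(sample, shortcut=True)
    print(f"Extracted {len(feats)} linguistic features")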