File size: 1,893 Bytes
20b7679
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
from lng.lca.lc_anc import lca
from lng.L2SCA.analyzeText import sca
import lftk
import spacy
# Shared spaCy pipeline, loaded once when this module is imported and reused by
# extract_lftk() for every call. Requires the "en_core_web_sm" model to be
# installed; spacy.load is executed at import time.
nlp = spacy.load("en_core_web_sm")

def extract_lingfeat(text):
    """Return the LingFeat feature values computed for *text* as a flat list.

    The text is handed to ``lingfeat.extractor.pass_text``, preprocessed, and
    then each enabled feature-group method is called in a fixed order. Every
    group returns a mapping that is merged into one dict; only the values of
    that merged dict are returned, in insertion order.

    ``lingfeat`` is imported inside the function, so the package is only
    needed when this function is actually called.
    """
    from lingfeat import extractor

    analyzer = extractor.pass_text(text)
    analyzer.preprocess()

    # Order matters: it fixes the position of each value in the result.
    # PhrF_ (phrasal features) and TrSF_ (parse tree structural features) are
    # deliberately left out; the original note on them reads "logging stanza".
    group_methods = (
        "WoKF_",  # Wikipedia Knowledge Features
        "WBKF_",  # WeeBit Corpus Knowledge Features
        "OSKF_",  # OneStopEng Corpus Knowledge Features
        # Discourse (Disco) Features
        "EnDF_",  # Entity Density Features
        "EnGF_",  # Entity Grid Features
        # Syntactic (Synta) Features
        "POSF_",  # Noun/Verb/Adj/Adv/... Part-of-Speech Features
        # Lexico Semantic (LxSem) Features
        "TTRF_",  # Type Token Ratio Features
        "VarF_",  # Noun/Verb/Adj/Adv Variation Features
        "PsyF_",  # Psycholinguistic Difficulty of Words (AoA Kuperman)
        "WorF_",  # Word Familiarity from Frequency Count (SubtlexUS)
        # Shallow Traditional (ShTra) Features
        "ShaF_",  # Shallow Features (e.g. avg number of tokens)
        "TraF_",  # Traditional Formulas
    )

    merged = {}
    for method_name in group_methods:
        merged.update(getattr(analyzer, method_name)())

    return list(merged.values())


def extract_lftk(text):
    """Return the LFTK feature values for *text* as a list.

    An empty string bypasses spaCy and LFTK entirely and yields a list of 220
    float zeros (presumably the number of features ``Extractor.extract()``
    produces with its defaults -- TODO confirm against the installed lftk).
    Any other input is parsed with the module-level ``nlp`` pipeline and fed
    to ``lftk.Extractor``; the values of the extracted mapping are returned.
    """
    if text == '':
        # Placeholder vector for empty input; a fresh list on every call.
        return [0.0 for _ in range(220)]

    parsed = nlp(text)
    feature_extractor = lftk.Extractor(parsed)
    extracted = feature_extractor.extract()
    return list(extracted.values())

def compute_lng(text, shortcut=False):
    """Build the combined feature list for *text*.

    The result is the concatenation, in this order, of the output of
    ``lca(text)``, the output of ``sca(text)``, and ``extract_lftk(text)``.

    Args:
        text: Input text passed unchanged to each extractor.
        shortcut: When truthy, ``sca`` is not called and 23 integer zeros are
            used in its place (presumably the length of ``sca``'s output --
            TODO confirm).

    Returns:
        The concatenated feature list.
    """
    lca_part = lca(text)
    # Skip the sca call on request, keeping the slot filled with zeros.
    sca_part = [0] * 23 if shortcut else sca(text)
    lftk_part = extract_lftk(text)
    return lca_part + sca_part + lftk_part