Spaces:
Sleeping
Sleeping
File size: 1,893 Bytes
20b7679 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
from lng.lca.lc_anc import lca
from lng.L2SCA.analyzeText import sca
import lftk
import spacy
# Shared spaCy pipeline, loaded once when this module is imported;
# extract_lftk() uses it to turn raw text into a Doc.
nlp = spacy.load("en_core_web_sm")
def extract_lingfeat(text):
    """Return the LingFeat feature values for *text* as a flat list.

    The text is handed to ``lingfeat.extractor.pass_text`` and preprocessed,
    then each feature group listed below is computed in turn and merged into
    a single dict. Only the values are returned, in insertion order; if two
    groups produce the same key, the later group's value wins.

    Args:
        text: The text to analyse.

    Returns:
        list: The values of the merged feature dict.
    """
    # Function-local import, as in the original -- presumably so that
    # importing this module does not require lingfeat; confirm before moving.
    from lingfeat import extractor

    ling = extractor.pass_text(text)
    ling.preprocess()

    # Order matters: it fixes the position of every value in the result.
    feature_groups = (
        "WoKF_",  # Wikipedia Knowledge Features
        "WBKF_",  # WeeBit Corpus Knowledge Features
        "OSKF_",  # OneStopEng Corpus Knowledge Features
        # Discourse (Disco) Features
        "EnDF_",  # Entity Density Features
        "EnGF_",  # Entity Grid Features
        # Syntactic (Synta) Features
        # NOTE: PhrF_ (phrasal) and TrSF_ (parse-tree structural) were
        # disabled in the original with the remark "logging stanza"; they
        # stay excluded here.
        "POSF_",  # Noun/Verb/Adj/Adv/... Part-of-Speech Features
        # Lexico Semantic (LxSem) Features
        "TTRF_",  # Type Token Ratio Features
        "VarF_",  # Noun/Verb/Adj/Adv Variation Features
        "PsyF_",  # Psycholinguistic Difficulty of Words (AoA Kuperman)
        "WorF_",  # Word Familiarity from Frequency Count (SubtlexUS)
        # Shallow Traditional (ShTra) Features
        "ShaF_",  # Shallow Features (e.g. avg number of tokens)
        "TraF_",  # Traditional Formulas
    )

    feats = {}
    for group in feature_groups:
        feats.update(getattr(ling, group)())
    return list(feats.values())
def extract_lftk(text):
    """Return the LFTK feature values for *text* as a list.

    Args:
        text: The text to analyse.

    Returns:
        list: The values of the mapping returned by
        ``lftk.Extractor(doc).extract()``, where ``doc`` is the spaCy parse
        of *text*. For the empty string, a list of 220 float zeros is
        returned without running spaCy or LFTK.
    """
    if text == "":
        # Nothing to parse. 220 is presumably the number of features
        # extract() yields for real text -- TODO confirm against lftk.
        return [0.0] * 220

    doc = nlp(text)
    extractor = lftk.Extractor(doc)
    return list(extractor.extract().values())
def compute_lng(text, shortcut=False):
    """Build the combined feature vector for *text*.

    The result is the concatenation, in this order, of the ``lca`` output,
    the ``sca`` output (or its placeholder) and the ``extract_lftk`` output.

    Args:
        text: The text to analyse; passed unchanged to each extractor.
        shortcut: When true, ``sca`` is not called and 23 zeros are used in
            its place.

    Returns:
        The three parts joined with ``+`` (assumes ``lca`` and ``sca``
        return lists, like ``extract_lftk`` does -- TODO confirm).
    """
    lca_feats = lca(text)
    if shortcut:
        # Placeholder for the skipped sca() call; 23 presumably matches the
        # length of sca's output -- TODO confirm.
        sca_feats = [0] * 23
    else:
        sca_feats = sca(text)
    # Named lftk_feats so the local does not reuse the name of the imported
    # lftk module.
    lftk_feats = extract_lftk(text)
    return lca_feats + sca_feats + lftk_feats
|