Spaces:

mohdelgaar
/

LingConv

Sleeping

App Files Files Community

LingConv / compute_lng.py

mohdelgaar

Initial commit

20b7679 11 months ago

raw

history blame

1.89 kB

	from lng.lca.lc_anc import lca
	from lng.L2SCA.analyzeText import sca
	import lftk
	import spacy
	# Load spaCy's "en_core_web_sm" pipeline once, when this module is imported;
	# extract_lftk() below calls this shared `nlp` object to parse its input text.
	nlp = spacy.load("en_core_web_sm")

	def extract_lingfeat(text):
	    """Return the LingFeat feature values computed for *text* as a flat list.

	    The text is handed to ``lingfeat.extractor.pass_text``, preprocessed, and
	    then each enabled feature-group method is called in a fixed order. Every
	    group returns a mapping that is merged into one dict; the values of that
	    dict (in insertion order) are returned.
	    """
	    # Imported inside the function, so `lingfeat` is only required when this
	    # extractor is actually called.
	    from lingfeat import extractor

	    analyzer = extractor.pass_text(text)
	    analyzer.preprocess()

	    # Feature-group methods, in the order their results are merged.
	    feature_groups = (
	        # Advanced semantic / knowledge features
	        "WoKF_",  # Wikipedia Knowledge Features
	        "WBKF_",  # WeeBit Corpus Knowledge Features
	        "OSKF_",  # OneStopEng Corpus Knowledge Features
	        # Discourse (Disco) features
	        "EnDF_",  # Entity Density Features
	        "EnGF_",  # Entity Grid Features
	        # Syntactic (Synta) features
	        # "PhrF_" (phrasal) and "TrSF_" (parse-tree structural) are left
	        # disabled; the original note on both reads "logging stanza".
	        "POSF_",  # Noun/Verb/Adj/Adv/... Part-of-Speech Features
	        # Lexico-semantic (LxSem) features
	        "TTRF_",  # Type Token Ratio Features
	        "VarF_",  # Noun/Verb/Adj/Adv Variation Features
	        "PsyF_",  # Psycholinguistic Difficulty of Words (AoA Kuperman)
	        "WorF_",  # Word Familiarity from Frequency Count (SubtlexUS)
	        # Shallow traditional (ShTra) features
	        "ShaF_",  # Shallow Features (e.g. avg number of tokens)
	        "TraF_",  # Traditional Formulas
	    )

	    merged = {}
	    for group_name in feature_groups:
	        merged.update(getattr(analyzer, group_name)())

	    return list(merged.values())


	# Length of the all-zero vector returned for empty input.
	# NOTE(review): presumably the number of features LFTK's default extract()
	# yields; confirm against the installed lftk version.
	_LFTK_NUM_FEATURES = 220


	def extract_lftk(text):
	    """Return the LFTK feature values for *text* as a list.

	    Empty input (``''`` or ``None``) is not parsed at all: a fresh list of
	    ``_LFTK_NUM_FEATURES`` zeros (as floats) is returned instead. Any other
	    text is parsed with the module-level spaCy pipeline ``nlp`` and passed to
	    ``lftk.Extractor``; the values of the dict returned by ``extract()`` are
	    returned in that dict's order.
	    """
	    # `not text` behaves exactly like `text == ''` for strings, and also keeps
	    # a None input from reaching nlp().
	    if not text:
	        return [0.0] * _LFTK_NUM_FEATURES

	    doc = nlp(text)
	    features = lftk.Extractor(doc).extract()
	    return list(features.values())

	def compute_lng(text, shortcut=False):
	    """Return one combined feature list for *text*.

	    The result is the concatenation, in this order, of ``lca(text)``, the
	    ``sca`` part, and ``extract_lftk(text)``.

	    When *shortcut* is true, ``sca`` is not called and a placeholder of 23
	    zeros takes its place (presumably the length of ``sca``'s output; verify
	    against ``lng.L2SCA.analyzeText``).
	    """
	    lca_part = lca(text)
	    sca_part = [0] * 23 if shortcut else sca(text)
	    # Named `lftk_part` so the local does not hide the imported `lftk` module.
	    lftk_part = extract_lftk(text)
	    return lca_part + sca_part + lftk_part