import spacy
import pytextrank  # registers the "textrank" pipeline factory
from spacy.tokens import Span


# Register a scrubber function that reduces key phrases to their lemma,
# converting plural words to their singular form
@spacy.registry.misc("plural_scrubber")
def plural_scrubber():
    def scrubber_func(span: Span) -> str:
        return span.lemma_
    return scrubber_func
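
# Note: pytextrank passes each candidate phrase span through the scrubber before
# building its set of ranked phrases, so returning span.lemma_ here merges
# singular/plural surface forms (e.g. "galaxy" / "galaxies") into one key term.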

def model_selector(target_language: str):
    # Load subset of non-English models, keyed by language code
    language_model = {
        "spa": "es_core_news_sm",
        "fra": "fr_core_news_sm",
        "pol": "pl_core_news_sm",
        "deu": "de_core_news_sm",
        "ita": "it_core_news_sm",
        "por": "pt_core_news_sm",
        "nld": "nl_core_news_sm",
        "fin": "fi_core_news_sm",
        "ron": "ro_core_news_sm",
        "rus": "ru_core_news_sm"
    }

    try:
        nlp = spacy.load(language_model[target_language])
    except KeyError:
        # Fall back to a spaCy English model
        nlp = spacy.load("en_core_web_lg")

    # Add TextRank component to pipeline with stopwords and the custom scrubber
    nlp.add_pipe("textrank", config={
        "stopwords": {token: ["NOUN"] for token in nlp.Defaults.stop_words},
        "scrubber": {"@misc": "plural_scrubber"}})

    return nlp

def extract_terms(text, target_language, length):
    nlp = model_selector(target_language)

    # Perform key term extraction on overall summary and segment summaries
    doc = nlp(text)

    if length < 100:
        # Get single highest-ranked key term
        phrases = {phrase.text for phrase in doc._.phrases[:1]}
    elif length < 300:
        # Create unique set from top 2 ranked phrases
        phrases = {phrase.text for phrase in doc._.phrases[:2]}
    else:
        # Create unique set from top 3 ranked phrases
        phrases = {phrase.text for phrase in doc._.phrases[:3]}

    return list(phrases)
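
# Minimal usage sketch (illustrative only): the sample text, language code, and
# length value below are assumptions, and the exact phrases returned will depend
# on the loaded spaCy model.
if __name__ == "__main__":
    sample = (
        "The James Webb Space Telescope has captured detailed images of distant "
        "galaxies, giving astronomers new insight into how early galaxies formed."
    )
    # "eng" is not in language_model, so model_selector falls back to the English model
    print(extract_terms(sample, target_language="eng", length=len(sample)))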