import spacy
import pytextrank
from spacy.tokens import Span

# Register a scrubber that converts key phrases to their lemma form,
# collapsing singular/plural variants of the same term
@spacy.registry.misc("plural_scrubber")
def plural_scrubber():
    def scrubber_func(span: Span) -> str:
        return span.lemma_
    return scrubber_func
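
# A minimal sketch (not part of the original code) of what the scrubber buys
# us, assuming en_core_web_lg is installed: PyTextRank looks the scrubber up
# by name in spaCy's misc registry, and without it near-duplicates such as
# "key term" and "key terms" would rank as separate phrases.
def _scrubber_demo():
    nlp = spacy.load("en_core_web_lg")
    nlp.add_pipe("textrank", config={"scrubber": {"@misc": "plural_scrubber"}})
    doc = nlp("We compared key terms, and each key term was ranked.")
    # With the lemma scrubber, both variants collapse into one phrase
    print([phrase.text for phrase in doc._.phrases])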
def model_selector(target_language: str):
    # Subset of non-English models, keyed by ISO 639-3 language code
    language_model = {
        "spa": "es_core_news_sm",
        "fra": "fr_core_news_sm",
        "pol": "pl_core_news_sm",
        "deu": "de_core_news_sm",
        "ita": "it_core_news_sm",
        "por": "pt_core_news_sm",
        "nld": "nl_core_news_sm",
        "fin": "fi_core_news_sm",
        "ron": "ro_core_news_sm",
        "rus": "ru_core_news_sm"
    }
    try:
        print("TARGET LANG:", language_model[target_language])
        nlp = spacy.load(language_model[target_language])
    except KeyError:
        # Fall back to a spaCy English model
        nlp = spacy.load("en_core_web_lg")

    # Optionally exclude additional common spoken stopwords:
    # nlp.Defaults.stop_words |= {"okay", "like", "uhm"}

    # Add TextRank component to the pipeline with stopwords and the scrubber
    nlp.add_pipe("textrank", config={
        "stopwords": {token: ["NOUN"] for token in nlp.Defaults.stop_words},
        "scrubber": {"@misc": "plural_scrubber"}})
    return nlp
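
# Hypothetical usage (the calls below are illustrative, not from the source):
# a known language code loads its model, an unknown one takes the English
# fallback, and both pipelines come back with "textrank" appended.
def _model_selector_demo():
    nlp_fr = model_selector("fra")  # loads fr_core_news_sm
    nlp_xx = model_selector("jpn")  # KeyError -> en_core_web_lg fallback
    print(nlp_fr.pipe_names[-1], nlp_xx.pipe_names[-1])  # textrank textrank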
def extract_terms(text, target_language, length):
    nlp = model_selector(target_language)

    # Perform key term extraction on overall summary and segment summaries
    doc = nlp(text)

    if length < 100:
        # Get the single most highly ranked key term
        phrases = {phrase.text for phrase in doc._.phrases[:1]}
    elif length < 300:
        # Create unique set from top 2 ranked phrases
        phrases = {phrase.text for phrase in doc._.phrases[:2]}
    else:
        # Create unique set from top 3 ranked phrases
        phrases = {phrase.text for phrase in doc._.phrases[:3]}

    return list(phrases)
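
# Example call (a sketch; the sample text is made up). The length argument
# gates how many top-ranked phrases come back: under 100 -> 1, 100-299 -> 2,
# 300 and over -> 3. "eng" is not in the model table, so this exercises the
# English fallback path.
if __name__ == "__main__":
    summary = (
        "The team reviewed quarterly sales figures and agreed to expand "
        "the partner programme across three new regions next year."
    )
    print(extract_terms(summary, "eng", len(summary)))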