# Key-phrase extraction pipeline using spaCy with the PyTextRank component.
import spacy
import pytextrank
from spacy.tokens import Span
# Register a phrase "scrubber" so plural and singular forms collapse together.
@spacy.registry.misc("plural_scrubber")
def plural_scrubber():
    """Return a scrubber callable that normalizes a phrase to its lemma.

    PyTextRank calls the returned function on each candidate phrase span;
    using the lemma form maps plurals onto their singular base form so the
    same concept is not counted twice.
    """
    def _lemmatize(span: Span) -> str:
        # span.lemma_ is the whitespace-joined lemma of every token in the span.
        return span.lemma_

    return _lemmatize
# Load the large English pipeline (vectors + tagger/lemmatizer).
nlp = spacy.load("en_core_web_lg")

# Treat common conversational filler as stop words too.
nlp.Defaults.stop_words |= {"okay", "like"}

# TextRank settings: suppress every stop word when it occurs as a noun, and
# run extracted phrases through the registered lemma scrubber.
textrank_config = {
    "stopwords": {word: ["NOUN"] for word in nlp.Defaults.stop_words},
    "scrubber": {"@misc": "plural_scrubber"},
}
nlp.add_pipe("textrank", config=textrank_config)
def extract_terms(text, length, model=None):
    """Extract the top-ranked key phrases from *text*.

    The number of phrases scales with the text length: fewer than 200
    characters yields 1 phrase, 200-399 yields 2, and 400 or more yields 3.

    Bug fix: the original branch chain used strict inequalities
    (``length > 200 and length < 400`` / ``length > 400``), so lengths of
    exactly 200 or 400 matched no branch and raised ``NameError`` on the
    unbound ``phrases``. The boundaries below cover every value.

    Args:
        text: Raw text to analyze.
        length: Length measure of the text used to pick how many phrases
            to return.
        model: Optional spaCy pipeline to use instead of the module-level
            ``nlp`` (useful for testing); must expose ``doc._.phrases``.

    Returns:
        A list of unique phrase strings (order not guaranteed, since it is
        built from a set).
    """
    pipeline = nlp if model is None else model
    doc = pipeline(text)

    # Pick the number of top-ranked phrases by length band.
    if length < 200:
        top_n = 1
    elif length < 400:
        top_n = 2
    else:
        top_n = 3

    # Set comprehension deduplicates phrases that rank more than once.
    phrases = {phrase.text for phrase in doc._.phrases[:top_n]}
    return list(phrases)