File size: 1,236 Bytes
275976f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import spacy
import pytextrank
from spacy.tokens import Span

# Register a phrase "scrubber" that collapses plural forms: pytextrank
# will pass each candidate phrase through it before ranking/deduplication.
@spacy.registry.misc("plural_scrubber")
def plural_scrubber():
    """Factory returning a scrubber that maps a Span to its lemmatized text."""

    def lemmatize_span(span: Span) -> str:
        """Return the lemma (base/singular form) of the span's tokens."""
        return span.lemma_

    return lemmatize_span


# Load a spaCy model (large English pipeline).
# NOTE(review): `en_core_web_lg` must be downloaded separately
# (python -m spacy download en_core_web_lg) — confirm deployment does this.
nlp = spacy.load("en_core_web_lg")


# Exclude potential stopwords: extend spaCy's default stop-word set with
# filler words (presumably from transcribed/informal speech — verify against caller).
nlp.Defaults.stop_words |= {"okay", "like"}

# Add TextRank component to pipeline with stopwords.
# pytextrank's `stopwords` config maps each lemma to the POS tags under which
# it is blocked; here every stop word is blocked when tagged NOUN.
# `scrubber` references the "plural_scrubber" function registered in
# spacy's misc registry, which lemmatizes phrases before ranking.
nlp.add_pipe("textrank", config={
                        "stopwords": {token:["NOUN"] for token in nlp.Defaults.stop_words},
                        "scrubber": {"@misc": "plural_scrubber"}})


def extract_terms(text, length):
    """Extract the top-ranked key phrases from *text* using TextRank.

    Args:
        text: The text to run through the spaCy/pytextrank pipeline.
        length: Size measure of the text (presumably a word or character
            count — confirm with caller) used to scale how many phrases
            to return.

    Returns:
        A list of unique phrase strings: 1 phrase when length < 200,
        2 when 200 <= length < 400, and 3 when length >= 400.
    """
    # Perform fact extraction on overall summary and segment summaries
    doc = nlp(text)

    # Scale the number of key terms with text length. An exhaustive
    # if/elif/else covers the boundary values: previously length == 200
    # and length == 400 fell through every branch, leaving `phrases`
    # unbound and raising NameError.
    if length < 200:
        top_k = 1
    elif length < 400:
        top_k = 2
    else:
        top_k = 3

    # Set comprehension deduplicates phrases that rank more than once.
    phrases = {phrase.text for phrase in doc._.phrases[:top_k]}

    return list(phrases)