File size: 1,102 Bytes
bcdda34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
from __future__ import annotations

import logging
from typing import Optional
import string
import nltk
import arxiv

logger = logging.getLogger(__name__)

def extract_title_abst(arxiv_id: str) -> Optional[str]:
    """Fetch an arXiv paper and return its title and abstract as one string.

    This is a best-effort lookup: any failure is logged and reported to the
    caller as ``None`` instead of being raised.

    Args:
        arxiv_id: Identifier handed to ``arxiv.Search`` through ``id_list``.

    Returns:
        The paper title and summary joined by a single space, or ``None``
        when the search yields no result or the lookup fails.
    """
    try:
        paper = next(arxiv.Search(id_list=[arxiv_id]).results())
        return paper.title + ' ' + paper.summary
    except StopIteration:
        # The search completed but produced no entry for this id.
        logger.warning("No arXiv entry found for id %r", arxiv_id)
    except Exception:
        # Deliberately broad so the function stays best-effort, but unlike a
        # bare ``except:`` this lets KeyboardInterrupt/SystemExit propagate.
        logger.warning(
            "Failed to fetch arXiv entry for id %r", arxiv_id, exc_info=True
        )
    return None

def doc_to_ids(
        doc: Optional[str],
        word_to_id_: dict[str, int],
        stemming: bool,
        lower: bool = True,
    ) -> list[int]:
    """Map a document to the vocabulary ids of the words it contains.

    The text is optionally lower-cased, stripped of every character listed in
    ``string.punctuation``, tokenized with ``nltk.word_tokenize`` and
    optionally Porter-stemmed before each token is looked up.

    Args:
        doc: Text to convert; ``None`` or an empty string yields ``[]``.
        word_to_id_: Vocabulary mapping from word to id.
        stemming: When true, stem every token before the lookup.
        lower: When true (the default), lower-case the text first.

    Returns:
        The ids of the tokens found in ``word_to_id_``, in token order.
        Tokens missing from the vocabulary are skipped, so the result may be
        empty, and a token that occurs several times contributes its id
        several times.
    """
    from nltk.stem.porter import PorterStemmer

    # Nothing to tokenize for a missing or empty document.
    if not doc:
        return []

    text = doc.lower() if lower else doc
    # Delete all punctuation characters in one pass.
    text = text.translate(str.maketrans("", "", string.punctuation))
    tokens = nltk.word_tokenize(text)
    if stemming:
        stem = PorterStemmer().stem
        tokens = [stem(token) for token in tokens]

    # Out-of-vocabulary tokens are dropped; duplicates are kept on purpose.
    return [word_to_id_[token] for token in tokens if token in word_to_id_]