File size: 1,102 Bytes
bcdda34 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 |
from __future__ import annotations
import logging
from typing import Optional
import string
import nltk
import arxiv
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
def extract_title_abst(arxiv_id: str) -> Optional[str]:
    """Return the title and summary of an arXiv entry as one string.

    The entry is looked up with ``arxiv.Search(id_list=[arxiv_id])`` and the
    first result is used.

    Args:
        arxiv_id: Identifier passed to the arXiv search as the only element
            of ``id_list``.

    Returns:
        ``"<title> <summary>"`` of the first search result, or ``None`` when
        the lookup fails for any reason (including an empty result set).
    """
    try:
        paper = next(arxiv.Search(id_list=[arxiv_id]).results())
        doc = paper.title + ' ' + paper.summary
    except Exception:
        # Best effort: callers receive None on failure. Catching Exception
        # instead of using a bare ``except`` lets KeyboardInterrupt and
        # SystemExit propagate, and the failure is logged rather than
        # silently discarded.
        logging.getLogger(__name__).warning(
            "Could not fetch title/summary for arXiv id %r",
            arxiv_id,
            exc_info=True,
        )
        doc = None
    return doc
def doc_to_ids(
    doc: Optional[str],
    word_to_id_: dict[str, int],
    stemming: bool,
    lower: bool = True,
):
    """Convert a document into the list of ids of its in-vocabulary words.

    The text is optionally lower-cased, stripped of every character found in
    ``string.punctuation``, tokenized with ``nltk.word_tokenize`` and
    optionally stemmed with NLTK's ``PorterStemmer``.

    Args:
        doc: Text to convert. ``None`` or an empty string yields ``[]``.
        word_to_id_: Mapping from word to id; tokens that are not keys of
            this mapping are skipped.
        stemming: When true, each token is Porter-stemmed before the lookup.
        lower: When true (the default), the text is lower-cased first.

    Returns:
        The ids of the matched tokens in document order. Repeated tokens
        produce repeated ids. The list is empty when nothing matches.
    """
    from nltk.stem.porter import PorterStemmer

    # Nothing to tokenize for a missing or empty document.
    if not doc:
        return []

    text = doc.lower() if lower else doc
    # Drop punctuation characters in a single pass.
    text = text.translate(str.maketrans("", "", string.punctuation))

    tokens = nltk.word_tokenize(text)
    if stemming:
        stem = PorterStemmer().stem
        tokens = [stem(token) for token in tokens]

    # Out-of-vocabulary tokens are ignored; duplicates are kept on purpose.
    ids = []
    for token in tokens:
        if token in word_to_id_:
            ids.append(word_to_id_[token])
    return ids