from __future__ import annotations

import logging
import string
from typing import Optional

import arxiv
import nltk

logger = logging.getLogger(__name__)


def extract_title_abst(arxiv_id: str) -> Optional[str]:
    """Fetch a paper's title and abstract from arXiv; return None on failure."""
    try:
        # Note: newer releases of the arxiv package prefer arxiv.Client().results(search).
        paper = next(arxiv.Search(id_list=[arxiv_id]).results())
        doc = paper.title + ' ' + paper.summary
    except Exception:
        logger.warning("Failed to fetch metadata for arXiv ID %s", arxiv_id)
        doc = None
    return doc


def doc_to_ids(
    doc: Optional[str],
    word_to_id_: dict[str, int],
    stemming: bool,
    lower: bool = True,
) -> list[int]:
    """Tokenize a document and map each in-vocabulary word to its ID."""
    from nltk.stem.porter import PorterStemmer

    if not doc:
        y = []
    else:
        if lower:
            doc = doc.lower()
        # Strip punctuation before tokenizing.
        doc = "".join(char for char in doc if char not in string.punctuation)
        words = nltk.word_tokenize(doc)
        if stemming:
            porter = PorterStemmer()
            words = [porter.stem(word) for word in words]
        # Out-of-vocabulary words are skipped; y stays [] when nothing matches.
        y = [word_to_id_[word] for word in words if word in word_to_id_]
        # Uncomment to count each keyword only once:
        # y = list(set(y))
    return y
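
# Minimal usage sketch (not part of the original module): the arXiv ID,
# the stemmed vocabulary, and the NLTK downloads below are illustrative
# assumptions showing how the two helpers might be chained together.
if __name__ == "__main__":
    nltk.download("punkt", quiet=True)      # tokenizer models for word_tokenize
    nltk.download("punkt_tab", quiet=True)  # needed by newer NLTK releases

    # Hypothetical vocabulary of Porter-stemmed keywords mapped to IDs.
    vocab = {"transform": 0, "attent": 1, "languag": 2}

    doc = extract_title_abst("1706.03762")  # "Attention Is All You Need"
    ids = doc_to_ids(doc, vocab, stemming=True)
    print(ids)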