ACL2Vec / _arxiv.py
4kasha
initial commit
bcdda34
from __future__ import annotations
import logging
from typing import Optional
import string
import nltk
import arxiv
logger = logging.getLogger(__name__)
def extract_title_abst(arxiv_id: str) -> Optional[str]:
    """Return the title and abstract of an arXiv paper as a single string.

    The paper is looked up with ``arxiv.Search(id_list=[arxiv_id])`` and the
    first result's ``title`` and ``summary`` are joined with a single space.

    Args:
        arxiv_id: Identifier passed to ``arxiv.Search`` as the only entry of
            ``id_list``.

    Returns:
        ``"<title> <summary>"`` for the first search result, or ``None`` when
        the lookup fails for any reason (no result, request error, ...).
    """
    try:
        paper = next(arxiv.Search(id_list=[arxiv_id]).results())
        doc = paper.title + ' ' + paper.summary
    except Exception:
        # Best-effort lookup: callers treat None as "no document". Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate, and the log entry keeps the failure diagnosable.
        logger.warning(
            "Could not fetch title/abstract for arXiv id %r",
            arxiv_id,
            exc_info=True,
        )
        doc = None
    return doc
def doc_to_ids(
    doc: Optional[str],
    word_to_id_: dict[str, int],
    stemming: bool,
    lower: bool = True,
) -> list[int]:
    """Convert a document into the list of vocabulary ids of its words.

    The document is optionally lower-cased, stripped of every character in
    ``string.punctuation``, tokenized with ``nltk.word_tokenize`` and, if
    requested, stemmed with NLTK's ``PorterStemmer``. Each resulting word
    that is a key of ``word_to_id_`` is mapped to its id.

    Args:
        doc: Text to convert. ``None`` or an empty string yields ``[]``.
        word_to_id_: Vocabulary mapping from word to id.
        stemming: When true, apply Porter stemming to every token before the
            vocabulary lookup.
        lower: When true (default), lower-case the text before tokenizing.

    Returns:
        One id per in-vocabulary token, in document order. Out-of-vocabulary
        tokens are skipped, so an empty list means nothing matched. Repeated
        words are kept (ids are not de-duplicated).
    """
    # Nothing to tokenize: return before touching nltk at all.
    if not doc:
        return []

    if lower:
        doc = doc.lower()

    # Delete all ASCII punctuation characters in a single pass.
    doc = doc.translate(str.maketrans("", "", string.punctuation))
    words = nltk.word_tokenize(doc)

    if stemming:
        # Imported lazily so the stemmer is only required when it is used.
        from nltk.stem.porter import PorterStemmer

        stem = PorterStemmer().stem
        words = [stem(word) for word in words]

    # Skip out-of-vocabulary words instead of raising KeyError.
    return [word_to_id_[word] for word in words if word in word_to_id_]