Spaces:

colonelwatch
/

abstracts-index

Running on Zero

App Files Files

colonelwatch committed on Nov 17, 2024

Commit

9cee2a5

1 Parent(s): f5e5f81

Roll parsing into Work dataclass classmethod

Browse files

Files changed (1) hide show

app.py +111 -58

app.py CHANGED Viewed

@@ -6,9 +6,10 @@ from sentence_transformers import SentenceTransformer
 import faiss
 import gradio as gr
 from datasets import Dataset
-from typing import TypedDict
 import json
 from pathlib import Path
 from markdown_it import MarkdownIt # used for overriding default markdown renderer
@@ -23,6 +24,84 @@ class IndexParameters(TypedDict):
     param_string: str  # pass directly to faiss index
 def get_model(model_name: str, device: str) -> SentenceTransformer:
     return SentenceTransformer(model_name, device=device)
@@ -45,17 +124,6 @@ def get_index(dir: Path, search_time_s: float) -> Dataset:
     return index
-def recover_abstract(inverted_index: dict[str, list[int]]) -> str:
-    abstract_size = max([max(appearances) for appearances in inverted_index.values()])+1
-    abstract_words: list[str | None] = [None] * abstract_size
-    for word, appearances in inverted_index.items():
-        for appearance in appearances:
-            abstract_words[appearance] = word
-    return ' '.join(word for word in abstract_words if word is not None)
 def execute_request(request_str):
     response = requests.get(request_str).json()
     return response
@@ -66,69 +134,54 @@ def format_response(neighbors, response):
     result_string = ''
     for distance, openalex_id in neighbors:
-        doc = response[openalex_id]
-        # collect attributes from openalex doc for the given openalex_id
-        title = doc['title']
-        # abstract = _recover_abstract(doc['abstract_inverted_index'])
-        abstract_inverted_index = doc['abstract_inverted_index']
-        author_names = [authorship['author']['display_name'] for authorship in doc['authorships']]
-        # journal_name = doc['primary_location']['source']['display_name']
-        publication_year = doc['publication_year']
-        citation_count = doc['cited_by_count']
-        doi = doc['doi']
-        if title is None: # edge case: no title
-            title = 'No title'
-        if abstract_inverted_index is None: # edge case: no abstract
-            abstract = 'No abstract'
-        else:
-            abstract = recover_abstract(abstract_inverted_index)
-            abstract = abstract.replace('\n', '\\n').replace('\r', '\\r')
-        # try to get journal name or else set it to None
-        try:
-            journal_name = doc['primary_location']['source']['display_name']
-        except (TypeError, KeyError):
-            journal_name = None
         # title: knock out escape sequences
-        title = title.replace('\n', '\\n').replace('\r', '\\r')
-        # abstract: knock out escape sequences, then truncate to 1500 characters if necessary
-        abstract = abstract.replace('\n', '\\n').replace('\r', '\\r')
-        if len(abstract) > 2000:
-            abstract = abstract[:2000] + '...'
-        # authors: cover no name edge case, truncate to 3 authors if necessary
-        author_names = [author_name if author_name else 'No name' for author_name in author_names]
-        if len(author_names) >= 3:
-            authors_str = ', '.join(author_names[:3]) + ', ...'
-        else:
-            authors_str = ', '.join(author_names)
         entry_string = ''
-        if doi: # edge case: for now, no doi -> no link
-            entry_string += f'## [{title}]({doi})\n\n'
         else:
             entry_string += f'## {title}\n\n'
-        if journal_name:
-            entry_string += f'**{authors_str} - {journal_name}, {publication_year}**\n\n'
         else:
-            entry_string += f'**{authors_str}, {publication_year}**\n\n'
         entry_string += f'{abstract}\n\n'
-        if citation_count: # edge case: we shouldn't tack "Cited-by count: 0" onto someone's paper
-            entry_string += f'*Cited-by count: {citation_count}*'
             entry_string += '&nbsp;&nbsp;&nbsp;&nbsp;'
-        if doi: # list the doi if it exists
-            entry_string += f'*DOI: {doi.replace("https://doi.org/", "")}*'
             entry_string += '&nbsp;&nbsp;&nbsp;&nbsp;'
         entry_string += f'*Similarity: {distance:.2f}*'

 import faiss
 import gradio as gr
 from datasets import Dataset
+from typing import TypedDict, Self, Any
 import json
 from pathlib import Path
+from dataclasses import dataclass
 from markdown_it import MarkdownIt # used for overriding default markdown renderer
     param_string: str  # pass directly to faiss index
+@dataclass
+class Work:
+    title: str | None
+    abstract: str | None  # recovered from abstract_inverted_index
+    authors: list[str]  # takes raw_author_name field from Authorship objects
+    journal_name: str | None  # takes the display_name field of the first location
+    year: int
+    citations: int
+    doi: str | None
+    def __post_init__(self):
+        self._check_type(self.title, str, nullable=True)
+        self._check_type(self.abstract, str, nullable=True)
+        self._check_type(self.authors, list)
+        for author in self.authors:
+            self._check_type(author, str)
+        self._check_type(self.journal_name, str, nullable=True)
+        self._check_type(self.year, int)
+        self._check_type(self.citations, int)
+        self._check_type(self.doi, str, nullable=True)
+    @classmethod
+    def from_dict(cls, d: dict) -> Self:
+        try:
+            inverted_index: dict[str, list[int]] = d["abstract_inverted_index"]
+        except KeyError:
+            abstract = None
+        else:
+            abstract = cls._recover_abstract(inverted_index)
+        try:
+            journal_name = d['primary_location']['source']['display_name']
+        except (TypeError, KeyError):  # key didn't exist or a value was null
+            journal_name = None
+        return cls(
+            title=d["title"],
+            abstract=abstract,
+            authors=[authorship["raw_author_name"] for authorship in d["authorships"]],
+            journal_name=journal_name,
+            year=d["publication_year"],
+            citations=d["cited_by_count"],
+            doi=d["doi"],
+        )
+    @staticmethod
+    def raw_fields() -> list[str]:
+        return [
+            "title",
+            "abstract_inverted_index",
+            "authorships",
+            "primary_location",
+            "publication_year",
+            "cited_by_count",
+            "doi"
+        ]
+    @staticmethod
+    def _check_type(v: Any, t: type, nullable: bool = False):
+        if not ((nullable and v is None) or isinstance(v, t)):
+            v_type_name = f"{type(v)}" if v is not None else "None"
+            t_name = f"{t}"
+            if nullable:
+                t_name += " | None"
+            raise ValueError(f"expected {t_name}, got {v_type_name}")
+    @staticmethod
+    def _recover_abstract(inverted_index: dict[str, list[int]]) -> str:
+        abstract_size = max(max(locs) for locs in inverted_index.values())+1
+        abstract_words: list[str | None] = [None] * abstract_size
+        for word, locs in inverted_index.items():
+            for loc in locs:
+                abstract_words[loc] = word
+        return ' '.join(word for word in abstract_words if word is not None)
 def get_model(model_name: str, device: str) -> SentenceTransformer:
     return SentenceTransformer(model_name, device=device)
     return index
 def execute_request(request_str):
     response = requests.get(request_str).json()
     return response
     result_string = ''
     for distance, openalex_id in neighbors:
+        work = Work.from_dict(response[openalex_id])
+        # edge cases: no title
+        abstract = work.abstract if work.abstract is not None else "No abstract"
         # title: knock out escape sequences
+        if work.title is not None:
+            title = work.title.replace('\n', '\\n').replace('\r', '\\r')
+        else:  # edge case: no title
+            title = "No title"
+        # abstract: knock out escape sequences, then truncate to 2000 chars
+        if work.abstract:
+            abstract = work.abstract.replace('\n', '\\n').replace('\r', '\\r')
+            if len(abstract) > 2000:
+                abstract = abstract[:2000] + '...'
+        else:  # edge case: no abstract
+            abstract = "No abstract"
+        # authors: truncate to 3 authors if necessary
+        if len(work.authors) >= 3:
+            authors_str = ', '.join(work.authors[:3]) + ', ...'
+        elif work.authors:
+            authors_str = ', '.join(work.authors)
+        else:  # edge case: no authors
+            authors_str = "No author"
         entry_string = ''
+        if work.doi:  # edge case: for now, no doi -> no link
+            entry_string += f'## [{title}]({work.doi})\n\n'
         else:
             entry_string += f'## {title}\n\n'
+        if work.journal_name:
+            entry_string += f'**{authors_str} - {work.journal_name}, {work.year}**\n\n'
         else:
+            entry_string += f'**{authors_str}, {work.year}**\n\n'
         entry_string += f'{abstract}\n\n'
+        # edge case: we shouldn't tack "Cited-by count: 0" onto someone's paper
+        if work.citations:
+            entry_string += f'*Cited-by count: {work.citations}*'
             entry_string += '&nbsp;&nbsp;&nbsp;&nbsp;'
+        if work.doi:  # list the doi if it exists
+            entry_string += f'*DOI: {work.doi.replace("https://doi.org/", "")}*'
             entry_string += '&nbsp;&nbsp;&nbsp;&nbsp;'
         entry_string += f'*Similarity: {distance:.2f}*'