Spaces:
Running
on
Zero
Running
on
Zero
Commit
·
9cee2a5
1
Parent(s):
f5e5f81
Roll parsing into Work dataclass classmethod
Browse files
app.py
CHANGED
@@ -6,9 +6,10 @@ from sentence_transformers import SentenceTransformer
|
|
6 |
import faiss
|
7 |
import gradio as gr
|
8 |
from datasets import Dataset
|
9 |
-
from typing import TypedDict
|
10 |
import json
|
11 |
from pathlib import Path
|
|
|
12 |
|
13 |
from markdown_it import MarkdownIt # used for overriding default markdown renderer
|
14 |
|
@@ -23,6 +24,84 @@ class IndexParameters(TypedDict):
|
|
23 |
param_string: str # pass directly to faiss index
|
24 |
|
25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
def get_model(model_name: str, device: str) -> SentenceTransformer:
    """Instantiate the sentence-transformers model `model_name` on `device`."""
    model = SentenceTransformer(model_name, device=device)
    return model
|
28 |
|
@@ -45,17 +124,6 @@ def get_index(dir: Path, search_time_s: float) -> Dataset:
|
|
45 |
return index
|
46 |
|
47 |
|
48 |
-
def recover_abstract(inverted_index: dict[str, list[int]]) -> str:
    """Rebuild plain abstract text from a word -> positions inverted index.

    Args:
        inverted_index: maps each word to the positions at which it occurs.

    Returns:
        The words joined by single spaces in ascending position order.
        Positions that no word claims are skipped. An empty index (or one
        whose position lists are all empty) yields an empty string instead
        of raising.
    """
    # Position -> word. A dict avoids calling max() on an empty sequence and
    # avoids allocating a list sized by the largest position.
    word_at: dict[int, str] = {}
    for word, appearances in inverted_index.items():
        for appearance in appearances:
            # If two words claim one position, the later word wins.
            word_at[appearance] = word

    return ' '.join(word_at[position] for position in sorted(word_at))
|
57 |
-
|
58 |
-
|
59 |
def execute_request(request_str, timeout=30.0):
    """GET `request_str` and return the response body decoded as JSON.

    Args:
        request_str: full request URL.
        timeout: seconds to wait for the server before giving up. requests
            applies no timeout unless one is passed, so without this a
            stalled server would block the caller indefinitely.

    Returns:
        The decoded JSON payload.
    """
    response = requests.get(request_str, timeout=timeout).json()
    return response
|
@@ -66,69 +134,54 @@ def format_response(neighbors, response):
|
|
66 |
|
67 |
result_string = ''
|
68 |
for distance, openalex_id in neighbors:
|
69 |
-
|
70 |
-
|
71 |
-
# collect attributes from openalex doc for the given openalex_id
|
72 |
-
title = doc['title']
|
73 |
-
# abstract = _recover_abstract(doc['abstract_inverted_index'])
|
74 |
-
abstract_inverted_index = doc['abstract_inverted_index']
|
75 |
-
author_names = [authorship['author']['display_name'] for authorship in doc['authorships']]
|
76 |
-
# journal_name = doc['primary_location']['source']['display_name']
|
77 |
-
publication_year = doc['publication_year']
|
78 |
-
citation_count = doc['cited_by_count']
|
79 |
-
doi = doc['doi']
|
80 |
-
|
81 |
-
if title is None: # edge case: no title
|
82 |
-
title = 'No title'
|
83 |
-
|
84 |
-
if abstract_inverted_index is None: # edge case: no abstract
|
85 |
-
abstract = 'No abstract'
|
86 |
-
else:
|
87 |
-
abstract = recover_abstract(abstract_inverted_index)
|
88 |
-
abstract = abstract.replace('\n', '\\n').replace('\r', '\\r')
|
89 |
|
90 |
-
#
|
91 |
-
|
92 |
-
journal_name = doc['primary_location']['source']['display_name']
|
93 |
-
except (TypeError, KeyError):
|
94 |
-
journal_name = None
|
95 |
|
96 |
# title: knock out escape sequences
|
97 |
-
title
|
|
|
|
|
|
|
98 |
|
99 |
-
# abstract: knock out escape sequences, then truncate to
|
100 |
-
|
101 |
-
|
102 |
-
abstract
|
|
|
|
|
|
|
103 |
|
104 |
-
# authors:
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
|
|
110 |
|
111 |
-
|
112 |
entry_string = ''
|
113 |
|
114 |
-
if doi:
|
115 |
-
entry_string += f'## [{title}]({doi})\n\n'
|
116 |
else:
|
117 |
entry_string += f'## {title}\n\n'
|
118 |
|
119 |
-
if journal_name:
|
120 |
-
entry_string += f'**{authors_str} - {journal_name}, {
|
121 |
else:
|
122 |
-
entry_string += f'**{authors_str}, {
|
123 |
|
124 |
entry_string += f'{abstract}\n\n'
|
125 |
|
126 |
-
|
127 |
-
|
|
|
128 |
entry_string += ' '
|
129 |
|
130 |
-
if doi:
|
131 |
-
entry_string += f'*DOI: {doi.replace("https://doi.org/", "")}*'
|
132 |
entry_string += ' '
|
133 |
|
134 |
entry_string += f'*Similarity: {distance:.2f}*'
|
|
|
6 |
import faiss
|
7 |
import gradio as gr
|
8 |
from datasets import Dataset
|
9 |
+
from typing import TypedDict, Self, Any
|
10 |
import json
|
11 |
from pathlib import Path
|
12 |
+
from dataclasses import dataclass
|
13 |
|
14 |
from markdown_it import MarkdownIt # used for overriding default markdown renderer
|
15 |
|
|
|
24 |
param_string: str # pass directly to faiss index
|
25 |
|
26 |
|
27 |
+
@dataclass
|
28 |
+
class Work:
|
29 |
+
title: str | None
|
30 |
+
abstract: str | None # recovered from abstract_inverted_index
|
31 |
+
authors: list[str] # takes raw_author_name field from Authorship objects
|
32 |
+
journal_name: str | None # takes the display_name field of the first location
|
33 |
+
year: int
|
34 |
+
citations: int
|
35 |
+
doi: str | None
|
36 |
+
|
37 |
+
def __post_init__(self):
|
38 |
+
self._check_type(self.title, str, nullable=True)
|
39 |
+
self._check_type(self.abstract, str, nullable=True)
|
40 |
+
self._check_type(self.authors, list)
|
41 |
+
for author in self.authors:
|
42 |
+
self._check_type(author, str)
|
43 |
+
self._check_type(self.journal_name, str, nullable=True)
|
44 |
+
self._check_type(self.year, int)
|
45 |
+
self._check_type(self.citations, int)
|
46 |
+
self._check_type(self.doi, str, nullable=True)
|
47 |
+
|
48 |
+
@classmethod
|
49 |
+
def from_dict(cls, d: dict) -> Self:
|
50 |
+
try:
|
51 |
+
inverted_index: dict[str, list[int]] = d["abstract_inverted_index"]
|
52 |
+
except KeyError:
|
53 |
+
abstract = None
|
54 |
+
else:
|
55 |
+
abstract = cls._recover_abstract(inverted_index)
|
56 |
+
|
57 |
+
try:
|
58 |
+
journal_name = d['primary_location']['source']['display_name']
|
59 |
+
except (TypeError, KeyError): # key didn't exist or a value was null
|
60 |
+
journal_name = None
|
61 |
+
|
62 |
+
return cls(
|
63 |
+
title=d["title"],
|
64 |
+
abstract=abstract,
|
65 |
+
authors=[authorship["raw_author_name"] for authorship in d["authorships"]],
|
66 |
+
journal_name=journal_name,
|
67 |
+
year=d["publication_year"],
|
68 |
+
citations=d["cited_by_count"],
|
69 |
+
doi=d["doi"],
|
70 |
+
)
|
71 |
+
|
72 |
+
@staticmethod
|
73 |
+
def raw_fields() -> list[str]:
|
74 |
+
return [
|
75 |
+
"title",
|
76 |
+
"abstract_inverted_index",
|
77 |
+
"authorships",
|
78 |
+
"primary_location",
|
79 |
+
"publication_year",
|
80 |
+
"cited_by_count",
|
81 |
+
"doi"
|
82 |
+
]
|
83 |
+
|
84 |
+
@staticmethod
|
85 |
+
def _check_type(v: Any, t: type, nullable: bool = False):
|
86 |
+
if not ((nullable and v is None) or isinstance(v, t)):
|
87 |
+
v_type_name = f"{type(v)}" if v is not None else "None"
|
88 |
+
t_name = f"{t}"
|
89 |
+
if nullable:
|
90 |
+
t_name += " | None"
|
91 |
+
raise ValueError(f"expected {t_name}, got {v_type_name}")
|
92 |
+
|
93 |
+
@staticmethod
|
94 |
+
def _recover_abstract(inverted_index: dict[str, list[int]]) -> str:
|
95 |
+
abstract_size = max(max(locs) for locs in inverted_index.values())+1
|
96 |
+
|
97 |
+
abstract_words: list[str | None] = [None] * abstract_size
|
98 |
+
for word, locs in inverted_index.items():
|
99 |
+
for loc in locs:
|
100 |
+
abstract_words[loc] = word
|
101 |
+
|
102 |
+
return ' '.join(word for word in abstract_words if word is not None)
|
103 |
+
|
104 |
+
|
105 |
def get_model(model_name: str, device: str) -> SentenceTransformer:
    """Instantiate the sentence-transformers model `model_name` on `device`."""
    model = SentenceTransformer(model_name, device=device)
    return model
|
107 |
|
|
|
124 |
return index
|
125 |
|
126 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
127 |
def execute_request(request_str, timeout=30.0):
    """GET `request_str` and return the response body decoded as JSON.

    Args:
        request_str: full request URL.
        timeout: seconds to wait for the server before giving up. requests
            applies no timeout unless one is passed, so without this a
            stalled server would block the caller indefinitely.

    Returns:
        The decoded JSON payload.
    """
    response = requests.get(request_str, timeout=timeout).json()
    return response
|
|
|
134 |
|
135 |
result_string = ''
|
136 |
for distance, openalex_id in neighbors:
|
137 |
+
work = Work.from_dict(response[openalex_id])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
138 |
|
139 |
+
# edge cases: no title
|
140 |
+
abstract = work.abstract if work.abstract is not None else "No abstract"
|
|
|
|
|
|
|
141 |
|
142 |
# title: knock out escape sequences
|
143 |
+
if work.title is not None:
|
144 |
+
title = work.title.replace('\n', '\\n').replace('\r', '\\r')
|
145 |
+
else: # edge case: no title
|
146 |
+
title = "No title"
|
147 |
|
148 |
+
# abstract: knock out escape sequences, then truncate to 2000 chars
|
149 |
+
if work.abstract:
|
150 |
+
abstract = work.abstract.replace('\n', '\\n').replace('\r', '\\r')
|
151 |
+
if len(abstract) > 2000:
|
152 |
+
abstract = abstract[:2000] + '...'
|
153 |
+
else: # edge case: no abstract
|
154 |
+
abstract = "No abstract"
|
155 |
|
156 |
+
# authors: truncate to 3 authors if necessary
|
157 |
+
if len(work.authors) >= 3:
|
158 |
+
authors_str = ', '.join(work.authors[:3]) + ', ...'
|
159 |
+
elif work.authors:
|
160 |
+
authors_str = ', '.join(work.authors)
|
161 |
+
else: # edge case: no authors
|
162 |
+
authors_str = "No author"
|
163 |
|
|
|
164 |
entry_string = ''
|
165 |
|
166 |
+
if work.doi: # edge case: for now, no doi -> no link
|
167 |
+
entry_string += f'## [{title}]({work.doi})\n\n'
|
168 |
else:
|
169 |
entry_string += f'## {title}\n\n'
|
170 |
|
171 |
+
if work.journal_name:
|
172 |
+
entry_string += f'**{authors_str} - {work.journal_name}, {work.year}**\n\n'
|
173 |
else:
|
174 |
+
entry_string += f'**{authors_str}, {work.year}**\n\n'
|
175 |
|
176 |
entry_string += f'{abstract}\n\n'
|
177 |
|
178 |
+
# edge case: we shouldn't tack "Cited-by count: 0" onto someone's paper
|
179 |
+
if work.citations:
|
180 |
+
entry_string += f'*Cited-by count: {work.citations}*'
|
181 |
entry_string += ' '
|
182 |
|
183 |
+
if work.doi: # list the doi if it exists
|
184 |
+
entry_string += f'*DOI: {work.doi.replace("https://doi.org/", "")}*'
|
185 |
entry_string += ' '
|
186 |
|
187 |
entry_string += f'*Similarity: {distance:.2f}*'
|