colonelwatch committed on
Commit
9cee2a5
·
1 Parent(s): f5e5f81

Roll parsing into Work dataclass classmethod

Browse files
Files changed (1) hide show
  1. app.py +111 -58
app.py CHANGED
@@ -6,9 +6,10 @@ from sentence_transformers import SentenceTransformer
6
  import faiss
7
  import gradio as gr
8
  from datasets import Dataset
9
- from typing import TypedDict
10
  import json
11
  from pathlib import Path
 
12
 
13
  from markdown_it import MarkdownIt # used for overriding default markdown renderer
14
 
@@ -23,6 +24,84 @@ class IndexParameters(TypedDict):
23
  param_string: str # pass directly to faiss index
24
 
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  def get_model(model_name: str, device: str) -> SentenceTransformer:
27
  return SentenceTransformer(model_name, device=device)
28
 
@@ -45,17 +124,6 @@ def get_index(dir: Path, search_time_s: float) -> Dataset:
45
  return index
46
 
47
 
48
- def recover_abstract(inverted_index: dict[str, list[int]]) -> str:
49
- abstract_size = max([max(appearances) for appearances in inverted_index.values()])+1
50
-
51
- abstract_words: list[str | None] = [None] * abstract_size
52
- for word, appearances in inverted_index.items():
53
- for appearance in appearances:
54
- abstract_words[appearance] = word
55
-
56
- return ' '.join(word for word in abstract_words if word is not None)
57
-
58
-
59
  def execute_request(request_str):
60
  response = requests.get(request_str).json()
61
  return response
@@ -66,69 +134,54 @@ def format_response(neighbors, response):
66
 
67
  result_string = ''
68
  for distance, openalex_id in neighbors:
69
- doc = response[openalex_id]
70
-
71
- # collect attributes from openalex doc for the given openalex_id
72
- title = doc['title']
73
- # abstract = _recover_abstract(doc['abstract_inverted_index'])
74
- abstract_inverted_index = doc['abstract_inverted_index']
75
- author_names = [authorship['author']['display_name'] for authorship in doc['authorships']]
76
- # journal_name = doc['primary_location']['source']['display_name']
77
- publication_year = doc['publication_year']
78
- citation_count = doc['cited_by_count']
79
- doi = doc['doi']
80
-
81
- if title is None: # edge case: no title
82
- title = 'No title'
83
-
84
- if abstract_inverted_index is None: # edge case: no abstract
85
- abstract = 'No abstract'
86
- else:
87
- abstract = recover_abstract(abstract_inverted_index)
88
- abstract = abstract.replace('\n', '\\n').replace('\r', '\\r')
89
 
90
- # try to get journal name or else set it to None
91
- try:
92
- journal_name = doc['primary_location']['source']['display_name']
93
- except (TypeError, KeyError):
94
- journal_name = None
95
 
96
  # title: knock out escape sequences
97
- title = title.replace('\n', '\\n').replace('\r', '\\r')
 
 
 
98
 
99
- # abstract: knock out escape sequences, then truncate to 1500 characters if necessary
100
- abstract = abstract.replace('\n', '\\n').replace('\r', '\\r')
101
- if len(abstract) > 2000:
102
- abstract = abstract[:2000] + '...'
 
 
 
103
 
104
- # authors: cover no name edge case, truncate to 3 authors if necessary
105
- author_names = [author_name if author_name else 'No name' for author_name in author_names]
106
- if len(author_names) >= 3:
107
- authors_str = ', '.join(author_names[:3]) + ', ...'
108
- else:
109
- authors_str = ', '.join(author_names)
 
110
 
111
-
112
  entry_string = ''
113
 
114
- if doi: # edge case: for now, no doi -> no link
115
- entry_string += f'## [{title}]({doi})\n\n'
116
  else:
117
  entry_string += f'## {title}\n\n'
118
 
119
- if journal_name:
120
- entry_string += f'**{authors_str} - {journal_name}, {publication_year}**\n\n'
121
  else:
122
- entry_string += f'**{authors_str}, {publication_year}**\n\n'
123
 
124
  entry_string += f'{abstract}\n\n'
125
 
126
- if citation_count: # edge case: we shouldn't tack "Cited-by count: 0" onto someone's paper
127
- entry_string += f'*Cited-by count: {citation_count}*'
 
128
  entry_string += '    '
129
 
130
- if doi: # list the doi if it exists
131
- entry_string += f'*DOI: {doi.replace("https://doi.org/", "")}*'
132
  entry_string += '    '
133
 
134
  entry_string += f'*Similarity: {distance:.2f}*'
 
6
  import faiss
7
  import gradio as gr
8
  from datasets import Dataset
9
+ from typing import TypedDict, Self, Any
10
  import json
11
  from pathlib import Path
12
+ from dataclasses import dataclass
13
 
14
  from markdown_it import MarkdownIt # used for overriding default markdown renderer
15
 
 
24
  param_string: str # pass directly to faiss index
25
 
26
 
27
+ @dataclass
28
+ class Work:
29
+ title: str | None
30
+ abstract: str | None # recovered from abstract_inverted_index
31
+ authors: list[str] # takes raw_author_name field from Authorship objects
32
+ journal_name: str | None # takes the display_name of the primary location's source
33
+ year: int
34
+ citations: int
35
+ doi: str | None
36
+
37
+ def __post_init__(self):
38
+ self._check_type(self.title, str, nullable=True)
39
+ self._check_type(self.abstract, str, nullable=True)
40
+ self._check_type(self.authors, list)
41
+ for author in self.authors:
42
+ self._check_type(author, str)
43
+ self._check_type(self.journal_name, str, nullable=True)
44
+ self._check_type(self.year, int)
45
+ self._check_type(self.citations, int)
46
+ self._check_type(self.doi, str, nullable=True)
47
+
48
+ @classmethod
49
+ def from_dict(cls, d: dict) -> Self:
50
+ try:
51
+ inverted_index: dict[str, list[int]] = d["abstract_inverted_index"]
52
+ except KeyError:
53
+ abstract = None
54
+ else:
55
+ abstract = cls._recover_abstract(inverted_index)
56
+
57
+ try:
58
+ journal_name = d['primary_location']['source']['display_name']
59
+ except (TypeError, KeyError): # key didn't exist or a value was null
60
+ journal_name = None
61
+
62
+ return cls(
63
+ title=d["title"],
64
+ abstract=abstract,
65
+ authors=[authorship["raw_author_name"] for authorship in d["authorships"]],
66
+ journal_name=journal_name,
67
+ year=d["publication_year"],
68
+ citations=d["cited_by_count"],
69
+ doi=d["doi"],
70
+ )
71
+
72
+ @staticmethod
73
+ def raw_fields() -> list[str]:
74
+ return [
75
+ "title",
76
+ "abstract_inverted_index",
77
+ "authorships",
78
+ "primary_location",
79
+ "publication_year",
80
+ "cited_by_count",
81
+ "doi"
82
+ ]
83
+
84
+ @staticmethod
85
+ def _check_type(v: Any, t: type, nullable: bool = False):
86
+ if not ((nullable and v is None) or isinstance(v, t)):
87
+ v_type_name = f"{type(v)}" if v is not None else "None"
88
+ t_name = f"{t}"
89
+ if nullable:
90
+ t_name += " | None"
91
+ raise ValueError(f"expected {t_name}, got {v_type_name}")
92
+
93
+ @staticmethod
94
+ def _recover_abstract(inverted_index: dict[str, list[int]]) -> str:
95
+ abstract_size = max(max(locs) for locs in inverted_index.values())+1
96
+
97
+ abstract_words: list[str | None] = [None] * abstract_size
98
+ for word, locs in inverted_index.items():
99
+ for loc in locs:
100
+ abstract_words[loc] = word
101
+
102
+ return ' '.join(word for word in abstract_words if word is not None)
103
+
104
+
105
  def get_model(model_name: str, device: str) -> SentenceTransformer:
106
  return SentenceTransformer(model_name, device=device)
107
 
 
124
  return index
125
 
126
 
 
 
 
 
 
 
 
 
 
 
 
127
  def execute_request(request_str):
128
  response = requests.get(request_str).json()
129
  return response
 
134
 
135
  result_string = ''
136
  for distance, openalex_id in neighbors:
137
+ work = Work.from_dict(response[openalex_id])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
 
139
+ # edge case: no abstract
140
+ abstract = work.abstract if work.abstract is not None else "No abstract"
 
 
 
141
 
142
  # title: knock out escape sequences
143
+ if work.title is not None:
144
+ title = work.title.replace('\n', '\\n').replace('\r', '\\r')
145
+ else: # edge case: no title
146
+ title = "No title"
147
 
148
+ # abstract: knock out escape sequences, then truncate to 2000 chars
149
+ if work.abstract:
150
+ abstract = work.abstract.replace('\n', '\\n').replace('\r', '\\r')
151
+ if len(abstract) > 2000:
152
+ abstract = abstract[:2000] + '...'
153
+ else: # edge case: no abstract
154
+ abstract = "No abstract"
155
 
156
+ # authors: truncate to 3 authors if necessary
157
+ if len(work.authors) >= 3:
158
+ authors_str = ', '.join(work.authors[:3]) + ', ...'
159
+ elif work.authors:
160
+ authors_str = ', '.join(work.authors)
161
+ else: # edge case: no authors
162
+ authors_str = "No author"
163
 
 
164
  entry_string = ''
165
 
166
+ if work.doi: # edge case: for now, no doi -> no link
167
+ entry_string += f'## [{title}]({work.doi})\n\n'
168
  else:
169
  entry_string += f'## {title}\n\n'
170
 
171
+ if work.journal_name:
172
+ entry_string += f'**{authors_str} - {work.journal_name}, {work.year}**\n\n'
173
  else:
174
+ entry_string += f'**{authors_str}, {work.year}**\n\n'
175
 
176
  entry_string += f'{abstract}\n\n'
177
 
178
+ # edge case: we shouldn't tack "Cited-by count: 0" onto someone's paper
179
+ if work.citations:
180
+ entry_string += f'*Cited-by count: {work.citations}*'
181
  entry_string += '    '
182
 
183
+ if work.doi: # list the doi if it exists
184
+ entry_string += f'*DOI: {work.doi.replace("https://doi.org/", "")}*'
185
  entry_string += '    '
186
 
187
  entry_string += f'*Similarity: {distance:.2f}*'