atrytone commited on
Commit
9f6b5cc
·
0 Parent(s):

Duplicate from biodatlab/NBDT-Recommendation-Engine

Browse files
.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ index.faiss filter=lfs diff=lfs merge=lfs -text
Build_VecStore.ipynb ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {
6
+ "id": "QS0v2bceN4Or"
7
+ },
8
+ "source": [
9
+ "Builds a database of vector embeddings from list of abstracts"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "markdown",
14
+ "metadata": {
15
+ "id": "l5RwcIG8OAjX"
16
+ },
17
+ "source": [
18
+ "## Some Setup"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": null,
24
+ "metadata": {
25
+ "id": "sfwT5YW2JCnu"
26
+ },
27
+ "outputs": [],
28
+ "source": [
29
+ "!pip install transformers==4.28.0\n",
30
+ "!pip install -U sentence-transformers\n",
31
+ "!pip install datasets\n",
32
+ "!pip install langchain\n",
33
+ "!pip install torch\n",
34
+ "!pip install faiss-cpu"
35
+ ]
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "execution_count": null,
40
+ "metadata": {
41
+ "id": "psoTvOp4VkBE"
42
+ },
43
+ "outputs": [],
44
+ "source": [
45
+ "import os\n",
46
+ "import shutil\n",
47
+ "\n",
48
+ "import numpy as np\n",
49
+ "import pandas as pd\n",
50
+ "from tqdm.auto import tqdm\n",
51
+ "import torch"
52
+ ]
53
+ },
54
+ {
55
+ "cell_type": "code",
56
+ "execution_count": null,
57
+ "metadata": {
58
+ "id": "arZiN8QRHS_a"
59
+ },
60
+ "outputs": [],
61
+ "source": [
62
+ "import locale\n",
63
+ "locale.getpreferredencoding = lambda: \"UTF-8\""
64
+ ]
65
+ },
66
+ {
67
+ "cell_type": "code",
68
+ "execution_count": null,
69
+ "metadata": {
70
+ "id": "JwWs0-Uu6ohg"
71
+ },
72
+ "outputs": [],
73
+ "source": [
74
+ "from transformers import AutoTokenizer, BertForSequenceClassification\n",
75
+ "\n",
76
+ "m_tokenizer = AutoTokenizer.from_pretrained(\"biodatlab/MIReAD-Neuro-Large\")\n",
77
+ "m_model = BertForSequenceClassification.from_pretrained(\"biodatlab/MIReAD-Neuro-Large\")\n",
78
+ "miread_bundle = (m_tokenizer,m_model)"
79
+ ]
80
+ },
81
+ {
82
+ "cell_type": "code",
83
+ "execution_count": null,
84
+ "metadata": {
85
+ "id": "BR-adEUUz9su"
86
+ },
87
+ "outputs": [],
88
+ "source": [
89
+ "def create_lbert_embed(sents,bundle):\n",
90
+ " tokenizer = bundle[0]\n",
91
+ " model = bundle[1]\n",
92
+ " model.cuda()\n",
93
+ " tokens = tokenizer(sents,padding=True,truncation=True,return_tensors='pt')\n",
94
+ " device = torch.device('cuda')\n",
95
+ " tokens = tokens.to(device)\n",
96
+ " with torch.no_grad():\n",
97
+ " embeds = model(**tokens, output_hidden_states=True,return_dict=True).pooler_output\n",
98
+ " return embeds.cpu()\n",
99
+ "\n",
100
+ "def create_miread_embed(sents,bundle):\n",
101
+ " tokenizer = bundle[0]\n",
102
+ " model = bundle[1]\n",
103
+ " model.cuda()\n",
104
+ " tokens = tokenizer(sents,\n",
105
+ " max_length=512,\n",
106
+ " padding=True,\n",
107
+ " truncation=True,\n",
108
+ " return_tensors=\"pt\"\n",
109
+ " )\n",
110
+ " device = torch.device('cuda')\n",
111
+ " tokens = tokens.to(device)\n",
112
+ " with torch.no_grad():\n",
113
+ " out = model.bert(**tokens)\n",
114
+ " feature = out.last_hidden_state[:, 0, :]\n",
115
+ " return feature.cpu()"
116
+ ]
117
+ },
118
+ {
119
+ "cell_type": "code",
120
+ "execution_count": null,
121
+ "metadata": {
122
+ "id": "-wHpHmD3zNSR"
123
+ },
124
+ "outputs": [],
125
+ "source": [
126
+ "from langchain.vectorstores import FAISS\n",
127
+ "from langchain.embeddings import HuggingFaceEmbeddings\n",
128
+ "\n",
129
+ "model_name = \"biodatlab/MIReAD-Neuro-Large\"\n",
130
+ "model_kwargs = {'device': 'cuda'}\n",
131
+ "encode_kwargs = {'normalize_embeddings': False}\n",
132
+ "faiss_embedder = HuggingFaceEmbeddings(\n",
133
+ " model_name=model_name,\n",
134
+ " model_kwargs=model_kwargs,\n",
135
+ " encode_kwargs=encode_kwargs\n",
136
+ ")\n",
137
+ "\n",
138
+ "def add_to_db(data,create_embed,bundle,name=''):\n",
139
+ " batch_size = 128\n",
140
+ " \"\"\"\n",
141
+ " data : list of rows with an 'abstract' and an 'identifier' field\n",
142
+ " index : pinecone Index object\n",
143
+ " create_embed : function that creates the embedding given an abstract\n",
144
+ " \"\"\"\n",
145
+ " res = []\n",
146
+ " vecdb = None\n",
147
+ " for i in tqdm(range(0, len(data), batch_size)):\n",
148
+ " # find end of batch\n",
149
+ " i_end = min(i+batch_size, len(data))\n",
150
+ " # create IDs batch\n",
151
+ " ids = [name + '-' + str(x) for x in range(i, i_end)]\n",
152
+ " # create metadata batch\n",
153
+ " metadatas = [{\n",
154
+ " 'journal':row.get('journal','None'),\n",
155
+ " 'title':row['title'],\n",
156
+ " 'abstract': row['abstract'],\n",
157
+ " 'authors':row.get('authors','None'),\n",
158
+ " 'link':row.get('link','None'),\n",
159
+ " 'date':row.get('date','None'),\n",
160
+ " 'submitter':row.get('submitter','None'),\n",
161
+ " } for row in data[i:i_end]]\n",
162
+ " # create embeddings\n",
163
+ " em = [create_embed(row['abstract'],bundle).tolist()[0] for row in data[i:i_end]]\n",
164
+ " texts = [row['abstract'] for row in data[i:i_end]]\n",
165
+ " records = list(zip(texts, em))\n",
166
+ " if vecdb:\n",
167
+ " vecdb_batch = FAISS.from_embeddings(records,faiss_embedder,metadatas=metadatas,ids=ids)\n",
168
+ " vecdb.merge_from(vecdb_batch)\n",
169
+ " else:\n",
170
+ " vecdb = FAISS.from_embeddings(records,faiss_embedder,metadatas=metadatas,ids=ids)\n",
171
+ " return vecdb"
172
+ ]
173
+ },
174
+ {
175
+ "cell_type": "code",
176
+ "execution_count": null,
177
+ "metadata": {
178
+ "id": "PfsK3DE4MMou"
179
+ },
180
+ "outputs": [],
181
+ "source": [
182
+ "nbdt_data = pd.read_json('data_final.json')\n",
183
+ "aliases = pd.read_csv('id_list.csv')"
184
+ ]
185
+ },
186
+ {
187
+ "cell_type": "code",
188
+ "execution_count": null,
189
+ "metadata": {
190
+ "id": "JrGJh5XgNPvU"
191
+ },
192
+ "outputs": [],
193
+ "source": [
194
+ "aliases = aliases.drop_duplicates('Full Name')\n",
195
+ "aliases.head()"
196
+ ]
197
+ },
198
+ {
199
+ "cell_type": "code",
200
+ "execution_count": null,
201
+ "metadata": {
202
+ "id": "CShYwGwWMZh5"
203
+ },
204
+ "outputs": [],
205
+ "source": [
206
+ "nbdt_data.head()"
207
+ ]
208
+ },
209
+ {
210
+ "cell_type": "code",
211
+ "execution_count": null,
212
+ "metadata": {
213
+ "id": "SziJtbggMuyn"
214
+ },
215
+ "outputs": [],
216
+ "source": [
217
+ "def load_nbdt(data,aliases):\n",
218
+ " nbdt_records = []\n",
219
+ " urls = []\n",
220
+ " no_abst_count = 0\n",
221
+ " no_journal_count = 0\n",
222
+ " for row in aliases.itertuples():\n",
223
+ " name = row[1]\n",
224
+ " auth_ids = eval(row[2])\n",
225
+ " auth_ids = [int(x) for x in auth_ids]\n",
226
+ " papers = nbdt_data.loc[nbdt_data['authorId'].isin(auth_ids)]['papers']\n",
227
+ " all_papers = []\n",
228
+ " for paper_set in papers:\n",
229
+ " all_papers.extend(paper_set)\n",
230
+ " for paper in all_papers:\n",
231
+ " url = paper['url']\n",
232
+ " title = paper['title']\n",
233
+ " abst = paper['abstract']\n",
234
+ " year = paper['year']\n",
235
+ " journal = paper.get('journal')\n",
236
+ " if journal:\n",
237
+ " journal = journal.get('name')\n",
238
+ " else:\n",
239
+ " journal = 'None'\n",
240
+ " no_journal_count += 1\n",
241
+ " authors = [name]\n",
242
+ " if not(abst):\n",
243
+ " abst = ''\n",
244
+ " no_abst_count += 1\n",
245
+ " record = {'journal':journal,'title':title,'abstract':abst,'link':url,'date':year,'authors':authors,'submitter':'None'}\n",
246
+ " if url not in urls:\n",
247
+ " nbdt_records.append(record)\n",
248
+ " urls.append(url)\n",
249
+ " return nbdt_records, (no_abst_count,no_journal_count)\n",
250
+ "nbdt_recs, no_counts = load_nbdt(nbdt_data,aliases)"
251
+ ]
252
+ },
253
+ {
254
+ "cell_type": "code",
255
+ "execution_count": null,
256
+ "metadata": {
257
+ "id": "IovTlDINc2Ds"
258
+ },
259
+ "outputs": [],
260
+ "source": [
261
+ "nbdt_db = add_to_db(nbdt_recs,create_miread_embed,miread_bundle,'nbdt')\n",
262
+ "nbdt_db.save_local(\"nbdt_index\")"
263
+ ]
264
+ }
265
+ ],
266
+ "metadata": {
267
+ "accelerator": "GPU",
268
+ "colab": {
269
+ "gpuType": "T4",
270
+ "provenance": []
271
+ },
272
+ "kernelspec": {
273
+ "display_name": "Python 3",
274
+ "name": "python3"
275
+ },
276
+ "language_info": {
277
+ "name": "python"
278
+ }
279
+ },
280
+ "nbformat": 4,
281
+ "nbformat_minor": 0
282
+ }
NBDT_Data_Recs.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
README.md ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: NBDT Reviewer Recommendation System
3
+ emoji: 📊
4
+ colorFrom: indigo
5
+ colorTo: blue
6
+ sdk: gradio
7
+ sdk_version: 3.35.2
8
+ app_file: app.py
9
+ pinned: false
10
+ models:
11
+ - biodatlab/MIReAD-Neuro
12
+ duplicated_from: biodatlab/NBDT-Recommendation-Engine
13
+ ---
14
+
15
+ This space is a demo for a Reviewer Recommendation System for the Neurons, Behavior, Data Analysis and Theory Journal.
16
+ The index being used here includes papers from a variety of authors who have published in the NBDT Journal across various years.
17
+ The embedding model in use here is [biodatlab/MIReAD-Neuro-Large](https://huggingface.co/biodatlab/MIReAD-Neuro-Large).
app.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from langchain.vectorstores import FAISS
3
+ from langchain.embeddings import HuggingFaceEmbeddings
4
+ import torch
5
+
6
+
7
def create_miread_embed(sents, bundle):
    """Embed text on the CPU with a BERT-based classifier.

    Args:
        sents: Text (or batch of texts) passed straight to the tokenizer.
        bundle: Sequence whose first item is the tokenizer and whose second
            item is the model; the model must expose a ``bert`` sub-module.

    Returns:
        A CPU tensor holding, for every input sequence, the last hidden
        state at token position 0.
    """
    tokenizer, model = bundle[0], bundle[1]
    # Inference is pinned to the CPU: both the weights and the inputs go there.
    model.cpu()
    cpu = torch.device('cpu')
    encoded = tokenizer(
        sents,
        max_length=512,
        padding=True,
        truncation=True,
        return_tensors="pt",
    ).to(cpu)
    with torch.no_grad():
        hidden = model.bert(**encoded).last_hidden_state
        first_token = hidden[:, 0, :]
    return first_token.cpu()
23
+
24
+
25
def get_matches(query):
    """Return the top 60 ``(document, score)`` results for *query*.

    The lookup is delegated to ``similarity_search_with_score`` on the
    module-level ``vecdb`` store.
    """
    return vecdb.similarity_search_with_score(query, k=60)
28
+
29
+
30
def inference(query):
    """Build the abstracts, journals and authors tables for *query*.

    Args:
        query: Text entered in the "Abstract" box.

    Returns:
        ``[a_output, j_output, n_output]``: Gradio dataframe updates for the
        abstracts, journals and authors tables, in that order.
    """
    matches = get_matches(query)
    if not matches:
        # Nothing retrieved: show empty tables instead of crashing in
        # min()/max() on an empty score list.
        return [gr.Dataframe.update(value=[], visible=True) for _ in range(3)]

    raw_scores = [round(match[1].item(), 3) for match in matches]
    min_score = min(raw_scores)
    max_score = max(raw_scores)

    def normaliser(x):
        # NOTE(review): this divides by max_score rather than
        # (max_score - min_score), so the worst match maps to
        # min_score / max_score instead of 0 -- confirm this is intended.
        if max_score == 0:
            # Every raw score is 0; avoid ZeroDivisionError.
            return 1.0
        return round(1 - (x - min_score) / max_score, 3)

    auth_counts = {}
    j_bucket = {}
    n_table = []
    a_table = []
    for rank, (match, raw) in enumerate(zip(matches, raw_scores), start=1):
        doc = match[0]
        score = round(normaliser(raw), 3)
        title = doc.metadata['title']
        author = doc.metadata['authors'][0].title()
        date = doc.metadata.get('date', 'None')
        link = doc.metadata.get('link', 'None')
        submitter = doc.metadata.get('submitter', 'None')
        journal = doc.metadata.get('journal')
        if journal is None or journal.strip() == '':
            journal = 'None'
        else:
            journal = journal.strip()

        # Journals: accumulate the normalised score per journal name.
        j_bucket[journal] = j_bucket.get(journal, 0) + score

        # Authors: list at most two papers per author.
        seen = auth_counts.get(author, 0)
        if seen < 2:
            n_table.append([rank, score, author, title, link, date])
            auth_counts[author] = seen + 1

        # Abstracts: every match is listed.
        a_table.append(
            [rank, title, author, submitter, journal, date, link, score]
        )

    # Papers without a journal must not show up as a journal called "None".
    # pop() with a default avoids the KeyError that ``del`` raised whenever
    # every match had a journal.
    j_bucket.pop('None', None)
    ranked = sorted(
        ([name, round(total, 3)] for name, total in j_bucket.items()),
        key=lambda item: item[1],
        reverse=True,
    )
    j_table = [[pos, name, total] for pos, (name, total) in enumerate(ranked, start=1)]

    j_output = gr.Dataframe.update(value=j_table, visible=True)
    n_output = gr.Dataframe.update(value=n_table, visible=True)
    a_output = gr.Dataframe.update(value=a_table, visible=True)

    return [a_output, j_output, n_output]
96
+
97
+
98
# Embedding configuration handed to HuggingFaceEmbeddings: the model runs on
# the CPU and embeddings are not normalised.
model_name = "biodatlab/MIReAD-Neuro-Large"
model_kwargs = dict(device='cpu')
encode_kwargs = dict(normalize_embeddings=False)
faiss_embedder = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

# Load the FAISS index stored in the "nbdt_index" directory.
vecdb = FAISS.load_local("nbdt_index", faiss_embedder)
108
+
109
+
110
# UI layout: an abstract text box, a button, and one result table per tab.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# NBDT Recommendation Engine for Editors")
    # Built with explicit concatenation (not backslash continuations) so that
    # sentence spacing does not depend on source indentation.
    gr.Markdown(
        "NBDT Recommendation Engine for Editors is a tool for neuroscience "
        "authors/abstracts/journals recommendation built for NBDT journal editors. "
        "It aims to help an editor to find similar reviewers, abstracts, and "
        "journals to a given submitted abstract. "
        "To find a recommendation, paste a `title[SEP]abstract` or `abstract` "
        "in the text box below and click \"Find Matches\". "
        "Then, you can switch to the authors/abstracts/journals tabs to find a "
        "suggested list. "
        "The data in our current demo includes authors associated with the NBDT "
        "Journal. We will update the data monthly to keep the publications "
        "up to date."
    )

    abst = gr.Textbox(label="Abstract", lines=10)

    action_btn = gr.Button(value="Find Matches")

    # The tables start hidden; inference() makes them visible when it
    # returns results.
    with gr.Tab("Authors"):
        n_output = gr.Dataframe(
            headers=['No.', 'Score', 'Name', 'Title', 'Link', 'Date'],
            datatype=['number', 'number', 'str', 'str', 'str', 'str'],
            col_count=(6, "fixed"),
            wrap=True,
            visible=False,
        )
    with gr.Tab("Abstracts"):
        a_output = gr.Dataframe(
            headers=['No.', 'Title', 'Author', 'Corresponding Author',
                     'Journal', 'Date', 'Link', 'Score'],
            datatype=['number', 'str', 'str', 'str',
                      'str', 'str', 'str', 'number'],
            col_count=(8, "fixed"),
            wrap=True,
            visible=False,
        )
    with gr.Tab("Journals"):
        j_output = gr.Dataframe(
            headers=['No.', 'Name', 'Score'],
            datatype=['number', 'str', 'number'],
            col_count=(3, "fixed"),
            wrap=True,
            visible=False,
        )

    # The outputs order must match the list returned by inference().
    action_btn.click(
        fn=inference,
        inputs=[abst],
        outputs=[a_output, j_output, n_output],
        api_name="neurojane",
    )

demo.launch(debug=True)
nbdt_index/index.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e618b6304914de46395f6dc334e33e6c4023f5210c76d088fa0128a7fc04b4c
3
+ size 108625965
nbdt_index/index.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:013b06aa858e6e44ecf550bc2e7a0c0b0d77404ff995dc2e96051df6e29355fb
3
+ size 35224532
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ sentence-transformers
2
+ torch
3
+ datasets
4
+ sentencepiece
5
+ langchain
6
+ faiss-cpu
7
+ accelerate