Spaces:

atrytone
/

NBDT-Recommendation-Engine

Runtime error

App Files Files Community

atrytone committed on Jul 9, 2023

Commit

9f6b5cc

0 Parent(s):

Duplicate from biodatlab/NBDT-Recommendation-Engine

Browse files

Files changed (8) hide show

.gitattributes +36 -0
Build_VecStore.ipynb +282 -0
NBDT_Data_Recs.ipynb +0 -0
README.md +17 -0
app.py +156 -0
nbdt_index/index.faiss +3 -0
nbdt_index/index.pkl +3 -0
requirements.txt +7 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,36 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+index.faiss filter=lfs diff=lfs merge=lfs -text

Build_VecStore.ipynb ADDED Viewed

	@@ -0,0 +1,282 @@

+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "QS0v2bceN4Or"
+      },
+      "source": [
+        "Builds a database of vector embeddings from a list of abstracts"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "l5RwcIG8OAjX"
+      },
+      "source": [
+        "## Some Setup"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "sfwT5YW2JCnu"
+      },
+      "outputs": [],
+      "source": [
+        "!pip install transformers==4.28.0\n",
+        "!pip install -U sentence-transformers\n",
+        "!pip install datasets\n",
+        "!pip install langchain\n",
+        "!pip install torch\n",
+        "!pip install faiss-cpu"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "psoTvOp4VkBE"
+      },
+      "outputs": [],
+      "source": [
+        "import os\n",
+        "import shutil\n",
+        "\n",
+        "import numpy as np\n",
+        "import pandas as pd\n",
+        "from tqdm.auto import tqdm\n",
+        "import torch"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "arZiN8QRHS_a"
+      },
+      "outputs": [],
+      "source": [
+        "import locale\n",
+        "locale.getpreferredencoding = lambda: \"UTF-8\""
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "JwWs0-Uu6ohg"
+      },
+      "outputs": [],
+      "source": [
+        "from transformers import AutoTokenizer, BertForSequenceClassification\n",
+        "\n",
+        "m_tokenizer = AutoTokenizer.from_pretrained(\"biodatlab/MIReAD-Neuro-Large\")\n",
+        "m_model = BertForSequenceClassification.from_pretrained(\"biodatlab/MIReAD-Neuro-Large\")\n",
+        "miread_bundle = (m_tokenizer,m_model)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "BR-adEUUz9su"
+      },
+      "outputs": [],
+      "source": [
+        "def create_lbert_embed(sents,bundle):\n",
+        "  \"\"\"Embed `sents` on the GPU with the (tokenizer, model) pair in `bundle`.\n",
+        "\n",
+        "  Returns the model output's `pooler_output`, moved back to the CPU.\n",
+        "  \"\"\"\n",
+        "  tokenizer, model = bundle[0], bundle[1]\n",
+        "  model.cuda()\n",
+        "  gpu = torch.device('cuda')\n",
+        "  encoded = tokenizer(sents,padding=True,truncation=True,return_tensors='pt').to(gpu)\n",
+        "  # Inference only: no gradients are tracked for the forward pass\n",
+        "  with torch.no_grad():\n",
+        "    output = model(**encoded, output_hidden_states=True,return_dict=True)\n",
+        "  return output.pooler_output.cpu()\n",
+        "\n",
+        "def create_miread_embed(sents,bundle):\n",
+        "  \"\"\"Embed `sents` on the GPU with the (tokenizer, model) pair in `bundle`.\n",
+        "\n",
+        "  Runs `model.bert` and returns the hidden state at position 0 of the\n",
+        "  second axis of `last_hidden_state` (presumably the [CLS] token), on the CPU.\n",
+        "  \"\"\"\n",
+        "  tokenizer, model = bundle[0], bundle[1]\n",
+        "  model.cuda()\n",
+        "  gpu = torch.device('cuda')\n",
+        "  encoded = tokenizer(sents,\n",
+        "                      max_length=512,\n",
+        "                      padding=True,\n",
+        "                      truncation=True,\n",
+        "                      return_tensors=\"pt\"\n",
+        "                     ).to(gpu)\n",
+        "  # Inference only: no gradients are tracked for the forward pass\n",
+        "  with torch.no_grad():\n",
+        "    hidden = model.bert(**encoded).last_hidden_state\n",
+        "    feature = hidden[:, 0, :]\n",
+        "  return feature.cpu()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "-wHpHmD3zNSR"
+      },
+      "outputs": [],
+      "source": [
+        "from langchain.vectorstores import FAISS\n",
+        "from langchain.embeddings import HuggingFaceEmbeddings\n",
+        "\n",
+        "# Embedder handed to every FAISS store built below (runs on the GPU)\n",
+        "model_name = \"biodatlab/MIReAD-Neuro-Large\"\n",
+        "model_kwargs = dict(device='cuda')\n",
+        "encode_kwargs = dict(normalize_embeddings=False)\n",
+        "faiss_embedder = HuggingFaceEmbeddings(model_name=model_name,\n",
+        "                                       model_kwargs=model_kwargs,\n",
+        "                                       encode_kwargs=encode_kwargs)\n",
+        "\n",
+        "def add_to_db(data,create_embed,bundle,name=''):\n",
+        "  \"\"\"\n",
+        "  Build a FAISS vector store from a list of paper records, in batches.\n",
+        "\n",
+        "  data : list of rows (dicts) with a 'title' and an 'abstract' field; the\n",
+        "         'journal', 'authors', 'link', 'date' and 'submitter' fields are\n",
+        "         optional and default to the string 'None'\n",
+        "  create_embed : function called as create_embed(abstract, bundle) that\n",
+        "                 creates the embedding given an abstract\n",
+        "  bundle : passed through unchanged to create_embed\n",
+        "  name : prefix of the generated document ids ('<name>-<row index>')\n",
+        "\n",
+        "  Returns the merged FAISS store, or None when data is empty.\n",
+        "  \"\"\"\n",
+        "  batch_size = 128\n",
+        "  vecdb = None\n",
+        "  for i in tqdm(range(0, len(data), batch_size)):\n",
+        "    batch = data[i:i+batch_size]\n",
+        "    # ids follow the row position in `data`, so they stay unique across batches\n",
+        "    ids = [name + '-' + str(x) for x in range(i, i+len(batch))]\n",
+        "    metadatas = [{\n",
+        "                  'journal':row.get('journal','None'),\n",
+        "                  'title':row['title'],\n",
+        "                  'abstract': row['abstract'],\n",
+        "                  'authors':row.get('authors','None'),\n",
+        "                  'link':row.get('link','None'),\n",
+        "                  'date':row.get('date','None'),\n",
+        "                  'submitter':row.get('submitter','None'),\n",
+        "                  } for row in batch]\n",
+        "    texts = [row['abstract'] for row in batch]\n",
+        "    # one embedding call per abstract; [0] takes the single row of the result\n",
+        "    em = [create_embed(text,bundle).tolist()[0] for text in texts]\n",
+        "    records = list(zip(texts, em))\n",
+        "    vecdb_batch = FAISS.from_embeddings(records,faiss_embedder,metadatas=metadatas,ids=ids)\n",
+        "    if vecdb is None:\n",
+        "      vecdb = vecdb_batch\n",
+        "    else:\n",
+        "      vecdb.merge_from(vecdb_batch)\n",
+        "  return vecdb"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "PfsK3DE4MMou"
+      },
+      "outputs": [],
+      "source": [
+        "nbdt_data = pd.read_json('data_final.json')\n",
+        "aliases = pd.read_csv('id_list.csv')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "JrGJh5XgNPvU"
+      },
+      "outputs": [],
+      "source": [
+        "aliases = aliases.drop_duplicates('Full Name')\n",
+        "aliases.head()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "CShYwGwWMZh5"
+      },
+      "outputs": [],
+      "source": [
+        "nbdt_data.head()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "SziJtbggMuyn"
+      },
+      "outputs": [],
+      "source": [
+        "def load_nbdt(data,aliases):\n",
+        "  \"\"\"\n",
+        "  Collect one record per unique paper URL for every author in `aliases`.\n",
+        "\n",
+        "  data : DataFrame with an 'authorId' column and a 'papers' column holding\n",
+        "         a list of paper dicts per row\n",
+        "  aliases : DataFrame whose first column is the author's name and whose\n",
+        "            second column is a string holding a list of author ids\n",
+        "\n",
+        "  Returns (records, (no_abst_count, no_journal_count)). The two counters are\n",
+        "  incremented for every paper visited, including duplicates that are skipped.\n",
+        "  \"\"\"\n",
+        "  nbdt_records = []\n",
+        "  seen_urls = set()  # set membership keeps de-duplication O(1) per paper\n",
+        "  no_abst_count = 0\n",
+        "  no_journal_count = 0\n",
+        "  for row in aliases.itertuples():\n",
+        "    name = row[1]\n",
+        "    # NOTE(review): eval() executes whatever is stored in the CSV cell; this is\n",
+        "    # only safe for a trusted id list. Consider ast.literal_eval instead.\n",
+        "    auth_ids = [int(x) for x in eval(row[2])]\n",
+        "    # Query the `data` argument rather than the notebook-level global\n",
+        "    papers = data.loc[data['authorId'].isin(auth_ids)]['papers']\n",
+        "    for paper_set in papers:\n",
+        "      for paper in paper_set:\n",
+        "        url = paper['url']\n",
+        "        title = paper['title']\n",
+        "        abst = paper['abstract']\n",
+        "        year = paper['year']\n",
+        "        journal = paper.get('journal')\n",
+        "        if journal:\n",
+        "          journal = journal.get('name')\n",
+        "        else:\n",
+        "          journal = 'None'\n",
+        "          no_journal_count += 1\n",
+        "        if not abst:\n",
+        "          abst = ''\n",
+        "          no_abst_count += 1\n",
+        "        if url in seen_urls:\n",
+        "          continue\n",
+        "        seen_urls.add(url)\n",
+        "        nbdt_records.append({'journal':journal,'title':title,'abstract':abst,'link':url,'date':year,'authors':[name],'submitter':'None'})\n",
+        "  return nbdt_records, (no_abst_count,no_journal_count)\n",
+        "nbdt_recs, no_counts = load_nbdt(nbdt_data,aliases)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "IovTlDINc2Ds"
+      },
+      "outputs": [],
+      "source": [
+        "nbdt_db = add_to_db(nbdt_recs,create_miread_embed,miread_bundle,'nbdt')\n",
+        "nbdt_db.save_local(\"nbdt_index\")"
+      ]
+    }
+  ],
+  "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "gpuType": "T4",
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}

NBDT_Data_Recs.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

README.md ADDED Viewed

	@@ -0,0 +1,17 @@

+---
+title: NBDT Reviewer Recommendation System
+emoji: 📊
+colorFrom: indigo
+colorTo: blue
+sdk: gradio
+sdk_version: 3.35.2
+app_file: app.py
+pinned: false
+models:
+- biodatlab/MIReAD-Neuro
+duplicated_from: biodatlab/NBDT-Recommendation-Engine
+---
+This space is a demo for a Reviewer Recommendation System for the Neurons, Behavior, Data Analysis and Theory Journal.
+The index being used here includes papers from a variety of authors who have published in the NBDT Journal across various years.
+The embedding model in use here is [biodatlab/MIReAD-Neuro-Large](https://huggingface.co/biodatlab/MIReAD-Neuro-Large).

app.py ADDED Viewed

	@@ -0,0 +1,156 @@

+import gradio as gr
+from langchain.vectorstores import FAISS
+from langchain.embeddings import HuggingFaceEmbeddings
+import torch
+def create_miread_embed(sents, bundle):
+    tokenizer = bundle[0]
+    model = bundle[1]
+    model.cpu()
+    tokens = tokenizer(sents,
+                       max_length=512,
+                       padding=True,
+                       truncation=True,
+                       return_tensors="pt"
+                       )
+    device = torch.device('cpu')
+    tokens = tokens.to(device)
+    with torch.no_grad():
+        out = model.bert(**tokens)
+        feature = out.last_hidden_state[:, 0, :]
+    return feature.cpu()
+def get_matches(query):
+    matches = vecdb.similarity_search_with_score(query, k=60)
+    return matches
+def inference(query):
+    matches = get_matches(query)
+    auth_counts = {}
+    j_bucket = {}
+    n_table = []
+    a_table = []
+    scores = [round(match[1].item(), 3) for match in matches]
+    min_score = min(scores)
+    max_score = max(scores)
+    def normaliser(x): return round(1 - (x-min_score)/max_score, 3)
+    for i, match in enumerate(matches):
+        doc = match[0]
+        score = round(normaliser(round(match[1].item(), 3)), 3)
+        title = doc.metadata['title']
+        author = doc.metadata['authors'][0].title()
+        date = doc.metadata.get('date', 'None')
+        link = doc.metadata.get('link', 'None')
+        submitter = doc.metadata.get('submitter', 'None')
+        # journal = doc.metadata.get('journal', 'None').strip()
+        journal = doc.metadata['journal']
+        if (journal is None or journal.strip() == ''):
+            journal = 'None'
+        else:
+            journal = journal.strip()
+        # For journals
+        if journal not in j_bucket:
+            j_bucket[journal] = score
+        else:
+            j_bucket[journal] += score
+        # For authors
+        record = [i+1,
+                  score,
+                  author,
+                  title,
+                  link,
+                  date]
+        if auth_counts.get(author, 0) < 2:
+            n_table.append(record)
+            if auth_counts.get(author, 0) == 0:
+                auth_counts[author] = 1
+            else:
+                auth_counts[author] += 1
+        # For abstracts
+        record = [i+1,
+                  title,
+                  author,
+                  submitter,
+                  journal,
+                  date,
+                  link,
+                  score
+                  ]
+        a_table.append(record)
+    del j_bucket['None']
+    j_table = sorted([[journal, round(score, 3)] for journal,
+                     score in j_bucket.items()],
+                     key=lambda x: x[1], reverse=True)
+    j_table = [[i+1, item[0], item[1]] for i, item in enumerate(j_table)]
+    j_output = gr.Dataframe.update(value=j_table, visible=True)
+    n_output = gr.Dataframe.update(value=n_table, visible=True)
+    a_output = gr.Dataframe.update(value=a_table, visible=True)
+    return [a_output, j_output, n_output]
+model_name = "biodatlab/MIReAD-Neuro-Large"
+model_kwargs = {'device': 'cpu'}
+encode_kwargs = {'normalize_embeddings': False}
+faiss_embedder = HuggingFaceEmbeddings(
+    model_name=model_name,
+    model_kwargs=model_kwargs,
+    encode_kwargs=encode_kwargs
+)
+vecdb = FAISS.load_local("nbdt_index", faiss_embedder)
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# NBDT Recommendation Engine for Editors")
+    gr.Markdown("NBDT Recommendation Engine for Editors is a tool for neuroscience authors/abstracts/journalsrecommendation built for NBDT journal editors. \
+    It aims to help an editor to find similar reviewers, abstracts, and journals to a given submitted abstract.\
+    To find a recommendation, paste a `title[SEP]abstract` or `abstract` in the text box below and click \"Find Matches\".\
+    Then, you can hover to authors/abstracts/journals tab to find a suggested list.\
+    The data in our current demo includes authors associated with the NBDT Journal. We will update the data monthly for an up-to-date publications.")
+    abst = gr.Textbox(label="Abstract", lines=10)
+    action_btn = gr.Button(value="Find Matches")
+    with gr.Tab("Authors"):
+        n_output = gr.Dataframe(
+            headers=['No.', 'Score', 'Name', 'Title', 'Link', 'Date'],
+            datatype=['number', 'number', 'str', 'str', 'str', 'str'],
+            col_count=(6, "fixed"),
+            wrap=True,
+            visible=False
+        )
+    with gr.Tab("Abstracts"):
+        a_output = gr.Dataframe(
+            headers=['No.', 'Title', 'Author', 'Corresponding Author',
+                     'Journal', 'Date', 'Link', 'Score'],
+            datatype=['number', 'str', 'str', 'str',
+                      'str', 'str', 'str', 'number'],
+            col_count=(8, "fixed"),
+            wrap=True,
+            visible=False
+        )
+    with gr.Tab("Journals"):
+        j_output = gr.Dataframe(
+            headers=['No.', 'Name', 'Score'],
+            datatype=['number', 'str', 'number'],
+            col_count=(3, "fixed"),
+            wrap=True,
+            visible=False
+        )
+    action_btn.click(fn=inference,
+                     inputs=[
+                         abst,
+                     ],
+                     outputs=[a_output, j_output, n_output],
+                     api_name="neurojane")
+demo.launch(debug=True)

nbdt_index/index.faiss ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2e618b6304914de46395f6dc334e33e6c4023f5210c76d088fa0128a7fc04b4c
+size 108625965

nbdt_index/index.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:013b06aa858e6e44ecf550bc2e7a0c0b0d77404ff995dc2e96051df6e29355fb
+size 35224532

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+sentence-transformers
+torch
+datasets
+sentencepiece
+langchain
+faiss-cpu
+accelerate