Spaces:

chrisfinlayson
/

foundry-pdf-knowledge-graph

Running

File size: 6,561 Bytes

ee9fa1c

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c49abf54-35c7-4b82-aa31-a155633c3327",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "43644952-bca3-4060-af76-3d5a8357be06",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import pandas as pd\n",
    "import bs4\n",
    "import requests\n",
    "import spacy\n",
    "from spacy import displacy\n",
    "nlp = spacy.load('en_core_web_sm')\n",
    "\n",
    "from spacy.matcher import Matcher \n",
    "from spacy.tokens import Span \n",
    "\n",
    "import networkx as nx\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "from tqdm import tqdm\n",
    "\n",
    "pd.set_option('display.max_colwidth', 200)\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1b73f085-2b8b-4f48-b26c-2da5fb22c9f2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# import wikipedia sentences\n",
    "candidate_sentences = pd.read_csv(\"../input/wiki-sentences1/wiki_sentences_v2.csv\")\n",
    "candidate_sentences.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1bd9de52-e1bc-46a6-9f52-e90969ed9f0c",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_entities(sent):\n",
    "  ## chunk 1\n",
    "  ent1 = \"\"\n",
    "  ent2 = \"\"\n",
    "\n",
    "  prv_tok_dep = \"\"    # dependency tag of previous token in the sentence\n",
    "  prv_tok_text = \"\"   # previous token in the sentence\n",
    "\n",
    "  prefix = \"\"\n",
    "  modifier = \"\"\n",
    "\n",
    "  #############################################################\n",
    "  \n",
    "  for tok in nlp(sent):\n",
    "    ## chunk 2\n",
    "    # if token is a punctuation mark then move on to the next token\n",
    "    if tok.dep_ != \"punct\":\n",
    "      # check: token is a compound word or not\n",
    "      if tok.dep_ == \"compound\":\n",
    "        prefix = tok.text\n",
    "        # if the previous word was also a 'compound' then add the current word to it\n",
    "        if prv_tok_dep == \"compound\":\n",
    "          prefix = prv_tok_text + \" \"+ tok.text\n",
    "      \n",
    "      # check: token is a modifier or not\n",
    "      if tok.dep_.endswith(\"mod\") == True:\n",
    "        modifier = tok.text\n",
    "        # if the previous word was also a 'compound' then add the current word to it\n",
    "        if prv_tok_dep == \"compound\":\n",
    "          modifier = prv_tok_text + \" \"+ tok.text\n",
    "      \n",
    "      ## chunk 3\n",
    "      if tok.dep_.find(\"subj\") == True:\n",
    "        ent1 = modifier +\" \"+ prefix + \" \"+ tok.text\n",
    "        prefix = \"\"\n",
    "        modifier = \"\"\n",
    "        prv_tok_dep = \"\"\n",
    "        prv_tok_text = \"\"      \n",
    "\n",
    "      ## chunk 4\n",
    "      if tok.dep_.find(\"obj\") == True:\n",
    "        ent2 = modifier +\" \"+ prefix +\" \"+ tok.text\n",
    "        \n",
    "      ## chunk 5  \n",
    "      # update variables\n",
    "      prv_tok_dep = tok.dep_\n",
    "      prv_tok_text = tok.text\n",
    "  #############################################################\n",
    "\n",
    "  return [ent1.strip(), ent2.strip()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "11bec388-fdb8-4823-9049-aa4cf328eba6",
   "metadata": {},
   "outputs": [],
   "source": [
    "entity_pairs = []\n",
    "\n",
    "for i in tqdm(candidate_sentences[\"sentence\"]):\n",
    "  entity_pairs.append(get_entities(i))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "02f56072-ae65-4b15-a3b6-674701040568",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_relation(sent):\n",
    "\n",
    "  doc = nlp(sent)\n",
    "\n",
    "  # Matcher class object \n",
    "  matcher = Matcher(nlp.vocab)\n",
    "\n",
    "  #define the pattern \n",
    "  pattern = [{'DEP':'ROOT'}, \n",
    "            {'DEP':'prep','OP':\"?\"},\n",
    "            {'DEP':'agent','OP':\"?\"},  \n",
    "            {'POS':'ADJ','OP':\"?\"}] \n",
    "\n",
    "  matcher.add(\"matching_1\", None, pattern) \n",
    "\n",
    "  matches = matcher(doc)\n",
    "  k = len(matches) - 1\n",
    "\n",
    "  span = doc[matches[k][1]:matches[k][2]] \n",
    "\n",
    "  return(span.text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ee3a774f-9f2d-4a4c-a77a-04bc420d4864",
   "metadata": {},
   "outputs": [],
   "source": [
    "relations = [get_relation(i) for i in tqdm(candidate_sentences['sentence'])]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c04581bb-46b5-48ce-bbe1-b465a789ad82",
   "metadata": {},
   "outputs": [],
   "source": [
    "# extract subject\n",
    "source = [i[0] for i in entity_pairs]\n",
    "\n",
    "# extract object\n",
    "target = [i[1] for i in entity_pairs]\n",
    "\n",
    "kg_df = pd.DataFrame({'source':source, 'target':target, 'edge':relations})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b0fec1f2-d370-4d79-8a92-2ebdff2be420",
   "metadata": {},
   "outputs": [],
   "source": [
    "# create a directed-graph from a dataframe\n",
    "G=nx.from_pandas_edgelist(kg_df, \"source\", \"target\", \n",
    "                          edge_attr=True, create_using=nx.MultiDiGraph())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "39b80dbe-f991-4e12-b0a1-4026344af82f",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(12,12))\n",
    "\n",
    "pos = nx.spring_layout(G)\n",
    "nx.draw(G, with_labels=True, node_color='skyblue', edge_cmap=plt.cm.Blues, pos = pos)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "be07f563-0b61-441f-bb24-a9e884eef1b8",
   "metadata": {},
   "outputs": [],
   "source": [
    "#https://www.kaggle.com/code/pavansanagapati/knowledge-graph-nlp-tutorial-bert-spacy-nltk"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}