{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "c49abf54-35c7-4b82-aa31-a155633c3327",
   "metadata": {},
   "source": [
    "# Knowledge graph from Wikipedia sentences\n",
    "\n",
    "Pipeline: load the sentences, extract a (subject, object) entity pair and a relation phrase from each sentence with spaCy, then draw the resulting triples as a directed graph with NetworkX."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "43644952-bca3-4060-af76-3d5a8357be06",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "import bs4\n",
    "import matplotlib.pyplot as plt\n",
    "import networkx as nx\n",
    "import pandas as pd\n",
    "import requests\n",
    "import spacy\n",
    "from spacy import displacy\n",
    "from spacy.matcher import Matcher\n",
    "from spacy.tokens import Span\n",
    "from tqdm import tqdm\n",
    "\n",
    "%matplotlib inline\n",
    "\n",
    "# --- configuration: everything a reader might want to tune ---\n",
    "DATA_PATH = \"../input/wiki-sentences1/wiki_sentences_v2.csv\"\n",
    "LAYOUT_SEED = 42  # fixes the spring layout so the figure is the same on every run\n",
    "\n",
    "nlp = spacy.load('en_core_web_sm')\n",
    "pd.set_option('display.max_colwidth', 200)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-load-data",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1b73f085-2b8b-4f48-b26c-2da5fb22c9f2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# import wikipedia sentences\n",
    "candidate_sentences = pd.read_csv(DATA_PATH)\n",
    "candidate_sentences.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-entity-pairs",
   "metadata": {},
   "source": [
    "## Extract entity pairs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1bd9de52-e1bc-46a6-9f52-e90969ed9f0c",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_entities(sent):\n",
    "    \"\"\"Extract a (subject, object) entity pair from one sentence.\n",
    "\n",
    "    Walks the spaCy dependency parse of ``sent`` token by token. Compound\n",
    "    words and modifiers are remembered and prepended to the next token whose\n",
    "    dependency label contains ``subj`` (subject) or ``obj`` (object).\n",
    "\n",
    "    Args:\n",
    "        sent: Sentence text. A value that is not a string (for example the\n",
    "            NaN pandas produces for a missing CSV field) yields two empty\n",
    "            entities instead of an error.\n",
    "\n",
    "    Returns:\n",
    "        list[str]: ``[subject, object]``. An entry is ``\"\"`` when no matching\n",
    "        token was found; when several tokens match, the last one wins.\n",
    "    \"\"\"\n",
    "    if not isinstance(sent, str):\n",
    "        return [\"\", \"\"]\n",
    "\n",
    "    ent1 = \"\"  # subject entity\n",
    "    ent2 = \"\"  # object entity\n",
    "\n",
    "    prv_tok_dep = \"\"   # dependency tag of previous token in the sentence\n",
    "    prv_tok_text = \"\"  # previous token in the sentence\n",
    "\n",
    "    prefix = \"\"    # pending compound word(s)\n",
    "    modifier = \"\"  # pending modifier word(s)\n",
    "\n",
    "    for tok in nlp(sent):\n",
    "        # punctuation is skipped entirely: it neither contributes text nor\n",
    "        # becomes the \"previous token\"\n",
    "        if tok.dep_ == \"punct\":\n",
    "            continue\n",
    "\n",
    "        # compound word: keep it, joined with the previous token when that\n",
    "        # one was a compound as well (only the immediately preceding token\n",
    "        # is kept, not a longer chain)\n",
    "        if tok.dep_ == \"compound\":\n",
    "            prefix = tok.text\n",
    "            if prv_tok_dep == \"compound\":\n",
    "                prefix = prv_tok_text + \" \" + tok.text\n",
    "\n",
    "        # modifier: any dependency label ending in 'mod'\n",
    "        if tok.dep_.endswith(\"mod\"):\n",
    "            modifier = tok.text\n",
    "            if prv_tok_dep == \"compound\":\n",
    "                modifier = prv_tok_text + \" \" + tok.text\n",
    "\n",
    "        # subject: `in` instead of `.find(...) == True`, which only held when\n",
    "        # the substring happened to start at index 1\n",
    "        if \"subj\" in tok.dep_:\n",
    "            # join only the non-empty parts so no double space ends up\n",
    "            # inside the entity text\n",
    "            ent1 = \" \".join(part for part in (modifier, prefix, tok.text) if part)\n",
    "            prefix = \"\"\n",
    "            modifier = \"\"\n",
    "\n",
    "        # object\n",
    "        if \"obj\" in tok.dep_:\n",
    "            ent2 = \" \".join(part for part in (modifier, prefix, tok.text) if part)\n",
    "\n",
    "        # update variables\n",
    "        prv_tok_dep = tok.dep_\n",
    "        prv_tok_text = tok.text\n",
    "\n",
    "    return [ent1.strip(), ent2.strip()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "11bec388-fdb8-4823-9049-aa4cf328eba6",
   "metadata": {},
   "outputs": [],
   "source": [
    "entity_pairs = [get_entities(i) for i in tqdm(candidate_sentences[\"sentence\"])]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-relations",
   "metadata": {},
   "source": [
    "## Extract relations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "02f56072-ae65-4b15-a3b6-674701040568",
   "metadata": {},
   "outputs": [],
   "source": [
    "# the ROOT token, optionally followed by a preposition, an agent and an adjective\n",
    "RELATION_PATTERN = [{'DEP': 'ROOT'},\n",
    "                    {'DEP': 'prep', 'OP': \"?\"},\n",
    "                    {'DEP': 'agent', 'OP': \"?\"},\n",
    "                    {'POS': 'ADJ', 'OP': \"?\"}]\n",
    "\n",
    "# built once and reused for every sentence; the patterns are passed as a list,\n",
    "# which is the signature spaCy v3 requires\n",
    "relation_matcher = Matcher(nlp.vocab)\n",
    "relation_matcher.add(\"matching_1\", [RELATION_PATTERN])\n",
    "\n",
    "\n",
    "def get_relation(sent):\n",
    "    \"\"\"Return the relation phrase of one sentence.\n",
    "\n",
    "    Runs ``relation_matcher`` over the parsed sentence and returns the text of\n",
    "    the last match it reports.\n",
    "\n",
    "    Args:\n",
    "        sent: Sentence text. A value that is not a string yields ``\"\"``.\n",
    "\n",
    "    Returns:\n",
    "        str: Text of the last matched span, or ``\"\"`` when nothing matched.\n",
    "    \"\"\"\n",
    "    if not isinstance(sent, str):\n",
    "        return \"\"\n",
    "\n",
    "    doc = nlp(sent)\n",
    "    matches = relation_matcher(doc)\n",
    "    if not matches:\n",
    "        # e.g. an empty sentence: there is no span to index\n",
    "        return \"\"\n",
    "\n",
    "    _, start, end = matches[-1]\n",
    "    return doc[start:end].text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ee3a774f-9f2d-4a4c-a77a-04bc420d4864",
   "metadata": {},
   "outputs": [],
   "source": [
    "relations = [get_relation(i) for i in tqdm(candidate_sentences['sentence'])]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-build-graph",
   "metadata": {},
   "source": [
    "## Build and draw the graph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c04581bb-46b5-48ce-bbe1-b465a789ad82",
   "metadata": {},
   "outputs": [],
   "source": [
    "# extract subject\n",
    "source = [i[0] for i in entity_pairs]\n",
    "\n",
    "# extract object\n",
    "target = [i[1] for i in entity_pairs]\n",
    "\n",
    "kg_df = pd.DataFrame({'source':source, 'target':target, 'edge':relations})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b0fec1f2-d370-4d79-8a92-2ebdff2be420",
   "metadata": {},
   "outputs": [],
   "source": [
    "# create a directed-graph from a dataframe\n",
    "G = nx.from_pandas_edgelist(kg_df, \"source\", \"target\",\n",
    "                            edge_attr=True, create_using=nx.MultiDiGraph())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "39b80dbe-f991-4e12-b0a1-4026344af82f",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(12,12))\n",
    "\n",
    "# seeded so that re-running the notebook reproduces the same node positions\n",
    "pos = nx.spring_layout(G, seed=LAYOUT_SEED)\n",
    "nx.draw(G, with_labels=True, node_color='skyblue', edge_cmap=plt.cm.Blues, pos = pos)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "be07f563-0b61-441f-bb24-a9e884eef1b8",
   "metadata": {},
   "source": [
    "## Reference\n",
    "\n",
    "https://www.kaggle.com/code/pavansanagapati/knowledge-graph-nlp-tutorial-bert-spacy-nltk"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}