import gradio as gr
import fitz  # PyMuPDF, for the optional PDF-extraction helper below
import pandas as pd
import spacy
import spacy.cli
from spacy.matcher import Matcher
import networkx as nx
import matplotlib.pyplot as plt
from tqdm import tqdm

# Load the spaCy English model, downloading it first if it is missing.
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    print("Model not found. Downloading...")
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load('en_core_web_sm')
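

# A minimal sketch of the PDF path implied by `import fitz`: pull the raw
# text from each page with PyMuPDF and let spaCy segment it into sentences.
# This helper is an assumption about intended usage; it is not yet wired
# into execute_process, which expects a CSV with a 'sentence' column.
def pdf_to_sentences(pdf_path):
    with fitz.open(pdf_path) as pdf:
        text = " ".join(page.get_text() for page in pdf)
    # Keep only non-empty sentences.
    return [sent.text.strip() for sent in nlp(text).sents if sent.text.strip()]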


def read_csv(file):
    # Small helper: load a CSV of candidate sentences and report its shape.
    candidate_sentences = pd.read_csv(file)
    return candidate_sentences.shape


def get_entities(sent):
    # Extract a candidate (subject, object) entity pair from a sentence
    # by walking its dependency parse.
    ent1 = ""
    ent2 = ""

    prv_tok_dep = ""   # dependency tag of the previous token
    prv_tok_text = ""  # text of the previous token

    prefix = ""
    modifier = ""

    for tok in nlp(sent):
        # Skip punctuation.
        if tok.dep_ != "punct":
            # Collect compound words so multi-word entities stay intact.
            if tok.dep_ == "compound":
                prefix = tok.text
                # If the previous token was also a compound, chain them.
                if prv_tok_dep == "compound":
                    prefix = prv_tok_text + " " + tok.text

            # Collect modifiers (amod, nmod, ...) that describe the entity.
            if tok.dep_.endswith("mod"):
                modifier = tok.text
                if prv_tok_dep == "compound":
                    modifier = prv_tok_text + " " + tok.text

            # A subject token completes the first entity.
            if "subj" in tok.dep_:
                ent1 = modifier + " " + prefix + " " + tok.text
                prefix = ""
                modifier = ""
                prv_tok_dep = ""
                prv_tok_text = ""

            # An object token completes the second entity.
            if "obj" in tok.dep_:
                ent2 = modifier + " " + prefix + " " + tok.text

            prv_tok_dep = tok.dep_
            prv_tok_text = tok.text

    return [ent1.strip(), ent2.strip()]


def get_relation(sent):
    # Extract the predicate linking the entities: the ROOT verb plus an
    # optional preposition, agent, or adjective immediately after it.
    doc = nlp(sent)

    matcher = Matcher(nlp.vocab)
    pattern = [{'DEP': 'ROOT'},
               {'DEP': 'prep', 'OP': "?"},
               {'DEP': 'agent', 'OP': "?"},
               {'POS': 'ADJ', 'OP': "?"}]
    matcher.add("matching_1", [pattern])

    matches = matcher(doc)
    if not matches:
        # Every parsed sentence should have a ROOT, but guard anyway.
        return ""

    # Use the last match returned.
    span = doc[matches[-1][1]:matches[-1][2]]
    return span.text


def ulify(elements):
    # Render an iterable as an HTML unordered list.
    string = "<ul>\n"
    string += "\n".join("<li>" + str(s) + "</li>" for s in elements)
    string += "\n</ul>"
    return string
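

# Quick sanity check; exact output depends on the spaCy model version,
# but something like this is expected:
#   get_entities("the film had 200 patents")  ->  ['film', '200 patents']
#   get_relation("the film had 200 patents")  ->  'had'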


def execute_process(file, edge):
    # Build the knowledge graph: extract an entity pair and a relation from
    # every sentence, assemble an edge list, and plot the directed graph.
    candidate_sentences = pd.read_csv(file)

    entity_pairs = []
    for i in tqdm(candidate_sentences["sentence"]):
        entity_pairs.append(get_entities(i))
    relations = [get_relation(i) for i in tqdm(candidate_sentences['sentence'])]

    source = [i[0] for i in entity_pairs]
    target = [i[1] for i in entity_pairs]
    kg_df = pd.DataFrame({'source': source, 'target': target, 'edge': relations})

    # Tabulate each unique edge label with its frequency.
    edge_counts = kg_df['edge'].value_counts()
    unique_edges_df = pd.DataFrame({'edge': edge_counts.index, 'count': edge_counts.values})

    # An empty Textbox arrives as "", so truthiness (not `is not None`)
    # distinguishes "no filter" from a specific edge label.
    if edge:
        G = nx.from_pandas_edgelist(kg_df[kg_df['edge'] == edge], "source", "target",
                                    edge_attr=True, create_using=nx.MultiDiGraph())
        plt.figure(figsize=(12, 12))
        pos = nx.spring_layout(G)
        nx.draw(G, with_labels=True, node_color='skyblue', edge_cmap=plt.cm.Blues, pos=pos)
    else:
        G = nx.from_pandas_edgelist(kg_df, "source", "target",
                                    edge_attr=True, create_using=nx.MultiDiGraph())
        plt.figure(figsize=(12, 12))
        pos = nx.spring_layout(G, k=0.5)
        nx.draw(G, with_labels=True, node_color='skyblue', node_size=1500,
                edge_cmap=plt.cm.Blues, pos=pos)
    plt.savefig("graph.png")
    plt.close()

    return "graph.png", unique_edges_df
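

# Example invocation outside the UI (hypothetical file name; any CSV with
# a 'sentence' column works):
#   image_path, edges_df = execute_process("wiki_sentences_v2.csv", "")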


inputs = [
    gr.File(label="Upload CSV of sentences"),
    gr.Textbox(label="Graph a particular edge", type="text")
]

outputs = [
    gr.Image(label="Generated graph"),
    gr.Dataframe(label="Unique edges", type="pandas")
]

description = ('This app reads sentences from an uploaded CSV and generates a knowledge '
               'graph illustrating the concepts and relationships within them.')
iface = gr.Interface(fn=execute_process, inputs=inputs, outputs=outputs,
                     title="Knowledge graph", description=description)
iface.launch()