import gradio as gr
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import spacy
import spacy.cli
from spacy.matcher import Matcher
from tqdm import tqdm
# import fitz  # PyMuPDF; only needed if read_pdf below is re-enabled

# Load the spaCy English model, downloading it first if it is not installed
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    print("Model not found. Downloading...")
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load('en_core_web_sm')

# def read_pdf(file):
#     doc = fitz.open(file)
#     text = ""
#     for page in doc:
#         text += page.get_text("text")
#     return text.split('\n')

def read_csv(file):
    candidate_sentences = pd.read_csv(file)
    return candidate_sentences.shape

def get_entities(sent):
    """Extract a (subject, object) entity pair from a sentence."""
    ent1 = ""
    ent2 = ""

    prv_tok_dep = ""   # dependency tag of the previous token in the sentence
    prv_tok_text = ""  # text of the previous token in the sentence

    prefix = ""
    modifier = ""

    for tok in nlp(sent):
        # if the token is a punctuation mark, move on to the next token
        if tok.dep_ != "punct":
            # check whether the token is a compound word
            if tok.dep_ == "compound":
                prefix = tok.text
                # if the previous word was also a compound, chain them together
                if prv_tok_dep == "compound":
                    prefix = prv_tok_text + " " + tok.text

            # check whether the token is a modifier
            if tok.dep_.endswith("mod"):
                modifier = tok.text
                # if the previous word was a compound, chain them together
                if prv_tok_dep == "compound":
                    modifier = prv_tok_text + " " + tok.text

            # subject token: assemble the first entity and reset the buffers
            if "subj" in tok.dep_:
                ent1 = modifier + " " + prefix + " " + tok.text
                prefix = ""
                modifier = ""
                prv_tok_dep = ""
                prv_tok_text = ""

            # object token: assemble the second entity
            if "obj" in tok.dep_:
                ent2 = modifier + " " + prefix + " " + tok.text

            # remember this token for the next iteration
            prv_tok_dep = tok.dep_
            prv_tok_text = tok.text

    return [ent1.strip(), ent2.strip()]

def get_relation(sent):
    """Extract the main relation (the ROOT verb phrase) from a sentence."""
    doc = nlp(sent)

    # Matcher class object
    matcher = Matcher(nlp.vocab)

    # match the root verb, optionally followed by a preposition,
    # an agent, or an adjective
    pattern = [{'DEP': 'ROOT'},
               {'DEP': 'prep', 'OP': "?"},
               {'DEP': 'agent', 'OP': "?"},
               {'POS': 'ADJ', 'OP': "?"}]
    matcher.add("matching_1", [pattern])

    matches = matcher(doc)
    k = len(matches) - 1  # take the last (longest) match
    span = doc[matches[k][1]:matches[k][2]]
    return span.text
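# A quick illustration of the two helpers above (a sketch: the exact output
# depends on how en_core_web_sm parses the sentence):
#
#   get_entities("the film was directed by john smith")
#   # -> ['film', 'john smith']   (passive subject; compound + object of "by")
#   get_relation("the film was directed by john smith")
#   # -> 'directed by'            (ROOT verb plus its agent)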
def ulify(elements):
    # currently an unused stub; returns an empty string
    string = ""
    return string

def execute_process(file, edge):
    # candidate_sentences = pd.DataFrame(read_pdf(file), columns=['Sentences'])
    candidate_sentences = pd.read_csv(file)

    # extract an entity pair and a relation from every sentence
    entity_pairs = []
    for i in tqdm(candidate_sentences["sentence"]):
        entity_pairs.append(get_entities(i))
    relations = [get_relation(i) for i in tqdm(candidate_sentences['sentence'])]

    source = [i[0] for i in entity_pairs]  # subjects
    target = [i[1] for i in entity_pairs]  # objects
    kg_df = pd.DataFrame({'source': source, 'target': target, 'edge': relations})

    # dataframe of all unique edges and their counts
    edge_counts = kg_df['edge'].value_counts()
    unique_edges_df = pd.DataFrame({'edge': edge_counts.index,
                                    'count': edge_counts.values})

    if edge:
        # graph only the rows carrying the requested edge label
        G = nx.from_pandas_edgelist(kg_df[kg_df['edge'] == edge],
                                    "source", "target",
                                    edge_attr=True, create_using=nx.MultiDiGraph())
        plt.figure(figsize=(12, 12))
        pos = nx.spring_layout(G)
        nx.draw(G, with_labels=True, node_color='skyblue',
                edge_cmap=plt.cm.Blues, pos=pos)
    else:
        # graph the full edge list
        G = nx.from_pandas_edgelist(kg_df, "source", "target",
                                    edge_attr=True, create_using=nx.MultiDiGraph())
        plt.figure(figsize=(12, 12))
        pos = nx.spring_layout(G, k=0.5)  # k regulates the distance between nodes
        nx.draw(G, with_labels=True, node_color='skyblue', node_size=1500,
                edge_cmap=plt.cm.Blues, pos=pos)

    plt.savefig("graph.png")
    return "graph.png", unique_edges_df

# NOTE: the PDF reader is currently disabled, so the uploaded file is parsed
# as a CSV with a "sentence" column.
inputs = [
    gr.File(label="Upload PDF"),
    gr.Textbox(label="Graph a particular edge", type="text")
]

outputs = [
    gr.Image(label="Generated graph"),
    gr.Dataframe(label="Unique edges", type="pandas")
]

description = ('This app reads all text from a PDF document and lets the user '
               'generate a knowledge graph illustrating the concepts and '
               'relationships within it.')

iface = gr.Interface(fn=execute_process, inputs=inputs, outputs=outputs,
                     title="PDF Knowledge graph", description=description)
iface.launch()
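# Expected input (a sketch, since the PDF path is disabled): a CSV with one
# sentence per row under a "sentence" column, for example:
#
#   sentence
#   marie curie discovered radium
#   the keynote was delivered by the cto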