import gradio as gr
import fitz  # PyMuPDF, used by the (currently disabled) read_pdf helper below
import pandas as pd
import spacy
import spacy.cli
from spacy.matcher import Matcher
import networkx as nx
import matplotlib.pyplot as plt
from tqdm import tqdm
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    print("Model not found. Downloading...")
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load('en_core_web_sm')
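# en_core_web_sm bundles the tagger and dependency parser that the helpers
# below rely on: get_entities and get_relation read dep_ values such as
# 'compound', 'nsubj', 'dobj', and 'ROOT' from its parse.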
# def read_pdf(file):
#     doc = fitz.open(file)
#     lines = []
#     for page in doc:
#         # get_text("text") returns one string per page; split it into lines
#         lines.extend(page.get_text("text").split('\n'))
#     return lines
def read_csv(file):
    # read the uploaded file rather than a hard-coded local path
    candidate_sentences = pd.read_csv(file)
    return candidate_sentences.shape
def get_entities(sent):
    ## chunk 1
    ent1 = ""
    ent2 = ""
    prv_tok_dep = ""   # dependency tag of previous token in the sentence
    prv_tok_text = ""  # previous token in the sentence
    prefix = ""
    modifier = ""

    #############################################################
    for tok in nlp(sent):
        ## chunk 2
        # if token is a punctuation mark then move on to the next token
        if tok.dep_ != "punct":
            # check: token is a compound word or not
            if tok.dep_ == "compound":
                prefix = tok.text
                # if the previous word was also a 'compound' then add the current word to it
                if prv_tok_dep == "compound":
                    prefix = prv_tok_text + " " + tok.text

            # check: token is a modifier or not
            if tok.dep_.endswith("mod"):
                modifier = tok.text
                # if the previous word was also a 'compound' then add the current word to it
                if prv_tok_dep == "compound":
                    modifier = prv_tok_text + " " + tok.text

            ## chunk 3
            # subject token found (dependency tag contains 'subj'):
            # assemble entity 1 and reset the accumulators
            if "subj" in tok.dep_:
                ent1 = modifier + " " + prefix + " " + tok.text
                prefix = ""
                modifier = ""
                prv_tok_dep = ""
                prv_tok_text = ""

            ## chunk 4
            # object token found (dependency tag contains 'obj'): assemble entity 2
            if "obj" in tok.dep_:
                ent2 = modifier + " " + prefix + " " + tok.text

            ## chunk 5
            # update variables
            prv_tok_dep = tok.dep_
            prv_tok_text = tok.text
    #############################################################

    return [ent1.strip(), ent2.strip()]
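# Quick sanity check (illustrative; the exact output depends on the parse
# produced by this spaCy model version):
#   get_entities("the film had 200 patents")  ->  something like ['film', '200 patents']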
def get_relation(sent):
    doc = nlp(sent)

    # Matcher class object
    matcher = Matcher(nlp.vocab)

    # define the pattern: the ROOT verb, optionally followed by a
    # preposition, an agent, or an adjective
    pattern = [{'DEP': 'ROOT'},
               {'DEP': 'prep', 'OP': "?"},
               {'DEP': 'agent', 'OP': "?"},
               {'POS': 'ADJ', 'OP': "?"}]

    matcher.add("matching_1", [pattern])
    matches = matcher(doc)

    # take the last (typically longest) match; every parsed sentence has a
    # ROOT, so at least one match is guaranteed
    k = len(matches) - 1
    span = doc[matches[k][1]:matches[k][2]]
    return span.text
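# Illustrative example (hedged; the extracted span depends on the parse):
#   get_relation("John completed the task")  ->  "completed"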
def ulify(elements):
    string = "<ul>\n"
    string += "\n".join(["<li>" + str(s) + "</li>" for s in elements])
    string += "\n</ul>"
    return string
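# Currently unused helper; for example, ulify(["a", "b"]) returns
# "<ul>\n<li>a</li>\n<li>b</li>\n</ul>".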
def execute_process(file, edge):
    # candidate_sentences = pd.DataFrame(read_pdf(file), columns=['Sentences'])
    candidate_sentences = pd.read_csv(file)

    # extract an (entity1, entity2) pair from every sentence
    entity_pairs = []
    for i in tqdm(candidate_sentences["sentence"]):
        entity_pairs.append(get_entities(i))
    relations = [get_relation(i) for i in tqdm(candidate_sentences['sentence'])]

    # extract subject
    source = [i[0] for i in entity_pairs]
    # extract object
    target = [i[1] for i in entity_pairs]
    kg_df = pd.DataFrame({'source': source, 'target': target, 'edge': relations})

    # create a dataframe of all unique edges and their counts
    edge_counts = kg_df['edge'].value_counts()
    unique_edges_df = pd.DataFrame({'edge': edge_counts.index, 'count': edge_counts.values})

    # an empty Gradio textbox arrives as "" rather than None, so test truthiness
    if edge:
        # graph only the rows whose relation matches the requested edge
        G = nx.from_pandas_edgelist(kg_df[kg_df['edge'] == edge], "source", "target",
                                    edge_attr=True, create_using=nx.MultiDiGraph())
        plt.figure(figsize=(12, 12))
        pos = nx.spring_layout(G)
        nx.draw(G, with_labels=True, node_color='skyblue', edge_cmap=plt.cm.Blues, pos=pos)
    else:
        # no edge requested: graph the full knowledge graph
        G = nx.from_pandas_edgelist(kg_df, "source", "target",
                                    edge_attr=True, create_using=nx.MultiDiGraph())
        plt.figure(figsize=(12, 12))
        pos = nx.spring_layout(G, k=0.5)  # k regulates the distance between nodes
        nx.draw(G, with_labels=True, node_color='skyblue', node_size=1500,
                edge_cmap=plt.cm.Blues, pos=pos)

    plt.savefig("graph.png")
    return "graph.png", unique_edges_df
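# Expected input: a CSV with a 'sentence' column, one sentence per row, e.g.
#
#   sentence
#   "the film had 200 patents"
#   "john completed the task"
#
# (illustrative rows; any CSV with that column works)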
inputs = [
    # PDF ingestion (read_pdf) is stubbed out above; the app currently expects a CSV
    gr.File(label="Upload CSV of sentences"),
    gr.Textbox(label="Graph a particular edge", type="text")
]
outputs = [
    gr.Image(label="Generated graph"),
    gr.Dataframe(label="Unique edges", type="pandas")
]

description = ('This app reads sentences from an uploaded CSV and generates a knowledge graph '
               'illustrating the concepts and relationships within them.')
iface = gr.Interface(fn=execute_process, inputs=inputs, outputs=outputs,
                     title="Knowledge graph generator", description=description)
iface.launch()