import gradio as gr                # Web interface
import os                          # Filesystem helpers (output directory creation)
import fitz                        # PyMuPDF, for PDF text extraction
import spacy                       # Natural language processing
import spacy.cli                   # Programmatic model download
import pandas as pd                # Data manipulation
from spacy.matcher import Matcher  # Rule-based pattern matching over tokens
import networkx as nx              # Graph construction
from tqdm import tqdm              # Progress bars
try:
    nlp = spacy.load('en_core_web_sm')  # Try to load the small English model
except OSError:
    print("Model not found. Downloading...")
    spacy.cli.download("en_core_web_sm")  # Download the model on first run
    nlp = spacy.load('en_core_web_sm')
def read_pdf(file):
    """Extract non-empty lines of text from a PDF into a one-column DataFrame."""
    # gradio may hand over a file path or a tempfile-like object, depending on version
    path = file.name if hasattr(file, "name") else file
    doc = fitz.open(path)
    text = []
    for page in doc:
        for sentence in page.get_text("text").split('\n'):
            if len(sentence) > 0:  # Drop blank lines
                text.append(sentence)
    return pd.DataFrame(text, columns=["sentence"])
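# Illustrative usage (hypothetical file name): read_pdf("paper.pdf") yields a
# one-column DataFrame where each row is one non-empty line of extracted text,
# which the pipeline below treats as a candidate "sentence".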
def read_csv(file):
    """Alternative loader: read pre-split sentences from a CSV file."""
    candidate_sentences = pd.read_csv(file)  # Read the uploaded file rather than a hard-coded path
    return candidate_sentences
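# read_csv is an alternative loader for a CSV that already contains one
# sentence per row (e.g. a "sentence" column); it is not wired into the
# gradio interface below, which only accepts PDF uploads.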
def get_entities(sent):
    """Extract a (subject, object) pair from a sentence using dependency tags."""
    ent1 = ""          # First entity (subject side)
    ent2 = ""          # Second entity (object side)
    prv_tok_dep = ""   # Dependency tag of the previous token
    prv_tok_text = ""  # Text of the previous token
    prefix = ""        # Compound words preceding an entity
    modifier = ""      # Modifier words preceding an entity

    for tok in nlp(sent):
        if tok.dep_ != "punct":  # Ignore punctuation
            if tok.dep_ == "compound":  # Part of a compound noun
                prefix = tok.text
                if prv_tok_dep == "compound":  # Chain consecutive compounds
                    prefix = prv_tok_text + " " + tok.text
            if tok.dep_.endswith("mod"):  # Modifier (amod, nmod, ...)
                modifier = tok.text
                if prv_tok_dep == "compound":
                    modifier = prv_tok_text + " " + tok.text
            if "subj" in tok.dep_:  # Subject token: build the first entity
                ent1 = modifier + " " + prefix + " " + tok.text
                prefix = ""
                modifier = ""
                prv_tok_dep = ""
                prv_tok_text = ""
            if "obj" in tok.dep_:  # Object token: build the second entity
                ent2 = modifier + " " + prefix + " " + tok.text

            prv_tok_dep = tok.dep_
            prv_tok_text = tok.text

    return [ent1.strip(), ent2.strip()]
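# Illustrative behaviour (the exact output depends on the spaCy model version,
# so treat the expected pair as indicative rather than exact):
#   get_entities("the film had 200 patents")  ->  ["film", "200 patents"]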
def get_relation(sent):
    """Extract the main relation (predicate) of a sentence via a ROOT-based pattern."""
    doc = nlp(sent)
    matcher = Matcher(nlp.vocab)
    pattern = [{'DEP': 'ROOT'},
               {'DEP': 'prep', 'OP': "?"},
               {'DEP': 'agent', 'OP': "?"},
               {'POS': 'ADJ', 'OP': "?"}]  # ROOT verb, optionally followed by prep/agent/adjective
    matcher.add("matching_1", [pattern])
    matches = matcher(doc)
    if matches:
        k = len(matches) - 1  # Use the last (longest) match
        span = doc[matches[k][1]:matches[k][2]]
        return span.text
    return ""  # No match found
def render_graph(G):
    """Render a networkx graph as interactive pyvis HTML, embedded in an iframe."""
    from pyvis.network import Network

    graph_output_directory = "./docs/index.html"
    os.makedirs(os.path.dirname(graph_output_directory), exist_ok=True)  # Ensure ./docs exists

    net = Network(
        notebook=False,
        cdn_resources="remote",
        height="900px",
        width="100%",
        select_menu=True,
        filter_menu=False,
    )
    net.from_nx(G)
    net.force_atlas_2based(central_gravity=0.015, gravity=-31)  # Force-directed layout
    net.show_buttons(filter_=["physics"])
    net.save_graph(graph_output_directory)  # Write the HTML without opening a browser

    with open(graph_output_directory, 'r') as file:
        html_content = file.read()
    html_content = html_content.replace("'", "&#39;")  # Escape single quotes so srcdoc='...' stays intact

    iframe = f"""<iframe style="width: 100%; height: 480px" name="result" allow="midi; geolocation; microphone; camera;
    display-capture; encrypted-media;" sandbox="allow-modals allow-forms
    allow-scripts allow-same-origin allow-popups
    allow-top-navigation-by-user-activation allow-downloads" allowfullscreen=""
    allowpaymentrequest="" frameborder="0" srcdoc='{html_content}'></iframe>"""
    return iframe
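# Design note: returning the pyvis page through the iframe's srcdoc attribute
# keeps the visualisation self-contained inside the gradio output, so
# ./docs/index.html never needs to be served separately; the sandbox attributes
# grant only the capabilities the embedded scripts need.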
def execute_process(file, edge):
    """Build (and cache) the knowledge graph for a PDF, then render it."""
    candidate_sentences = read_pdf(file)

    # Recompute only when there is no cached result or a different file was uploaded
    if 'kg_df' not in globals() or 'kg_file' not in globals() or file != globals()['kg_file']:
        entity_pairs = []
        for i in tqdm(candidate_sentences["sentence"]):  # Subject/object pair per sentence
            entity_pairs.append(get_entities(i))
        relations = [get_relation(i) for i in tqdm(candidate_sentences['sentence'])]  # Predicate per sentence
        source = [i[0] for i in entity_pairs]  # Subjects
        target = [i[1] for i in entity_pairs]  # Objects
        globals()['kg_df'] = pd.DataFrame({'source': source, 'target': target, 'edge': relations})
        globals()['kg_file'] = file  # Remember which file the cache belongs to

    kg_df = globals()['kg_df']
    edge_counts = kg_df['edge'].value_counts()  # Frequency of each relation
    unique_edges_df = pd.DataFrame({'edge': edge_counts.index, 'count': edge_counts.values})

    if not edge:  # No filter: graph every extracted triple
        G = nx.from_pandas_edgelist(kg_df, "source", "target",
                                    edge_attr=True, create_using=nx.MultiDiGraph())
    else:  # Graph only triples whose relation matches the requested edge
        G = nx.from_pandas_edgelist(kg_df[kg_df['edge'] == edge], "source", "target",
                                    edge_attr=True, create_using=nx.MultiDiGraph())
    iframe = render_graph(G)
    return iframe, unique_edges_df  # Embedded graph HTML plus the edge-frequency table
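# The returned unique_edges_df doubles as a legend: it lists every extracted
# relation with its frequency, which is how a user discovers valid values for
# the "Graph a particular edge" textbox.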
inputs = [
    gr.File(label="Upload PDF"),  # PDF upload
    gr.Textbox(label="Graph a particular edge", type="text")  # Optional relation filter
]
outputs = [
    gr.HTML(label="Generated graph"),  # Embedded interactive graph
    gr.Dataframe(label="Unique edges", type="pandas")  # Relation frequency table
]
description = ('This Python script generates a knowledge graph from a PDF document. It uses several libraries, '
               'including gradio for the web interface, spacy for natural language processing, networkx and pyvis '
               'for graph generation, and fitz (PyMuPDF) for PDF processing.')
iface = gr.Interface(fn=execute_process, inputs=inputs, outputs=outputs,
                     title="PDF - NLP Knowledge graph - Interactive", description=description)
iface.launch()
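# For demos, gradio can expose a temporary public URL with iface.launch(share=True);
# launch() also accepts server_name/server_port arguments for self-hosting.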