import gradio as gr  # Web interface
import os  # Filesystem helpers (used to create the graph output directory)
import fitz  # PyMuPDF, for PDF text extraction
import spacy  # Natural language processing
import spacy.cli  # Programmatic model download
import pandas as pd  # Data manipulation
import networkx as nx  # Graph construction and analysis
from spacy.matcher import Matcher  # Rule-based token matching
from tqdm import tqdm  # Progress bars
try:
    nlp = spacy.load('en_core_web_sm')  # Load the small English pipeline
except OSError:
    print("Model not found. Downloading...")
    spacy.cli.download("en_core_web_sm")  # Fetch the model, then load it
    nlp = spacy.load('en_core_web_sm')
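# A quick look at the dependency tags that drive the extraction below
# (illustrative sentence; exact tags depend on the model version):
#   for tok in nlp("Nagal won the first set"):
#       print(tok.text, tok.dep_)  # e.g. Nagal/nsubj, won/ROOT, set/dobj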
def read_pdf(file):
    """Extract non-empty lines from a PDF and return them as a one-column DataFrame."""
    doc = fitz.open(file)
    text = []
    for page in doc:
        for line in page.get_text("text").split('\n'):  # get_text("text") returns the page as plain text, one line per row
            if len(line) > 0:  # Skip blank lines
                text.append(line)
    return pd.DataFrame(text, columns=["sentence"])
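# Usage sketch (assumes a local file named "document.pdf"; hypothetical path):
#   df = read_pdf("document.pdf")
#   df.head()  # one extracted line per row, in the "sentence" column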
def read_csv(file):
    """Read candidate sentences from a CSV file with a "sentence" column."""
    candidate_sentences = pd.read_csv(file)  # Read the uploaded file rather than a hardcoded local path
    return candidate_sentences
def get_entities(sent):
    """Extract a (subject, object) entity pair from a sentence using dependency tags."""
    ent1 = ""  # Subject entity
    ent2 = ""  # Object entity
    prv_tok_dep = ""  # Dependency tag of the previous token
    prv_tok_text = ""  # Text of the previous token
    prefix = ""  # Accumulates compound words preceding an entity
    modifier = ""  # Accumulates modifiers preceding an entity
    for tok in nlp(sent):
        if tok.dep_ != "punct":  # Skip punctuation
            if tok.dep_ == "compound":
                prefix = tok.text
                if prv_tok_dep == "compound":  # Chain consecutive compound tokens
                    prefix = prv_tok_text + " " + tok.text
            if tok.dep_.endswith("mod"):  # Modifier (amod, nmod, advmod, ...)
                modifier = tok.text
                if prv_tok_dep == "compound":
                    modifier = prv_tok_text + " " + tok.text
            if "subj" in tok.dep_:  # Subject (nsubj, nsubjpass, ...); the original `.find("subj") == True` only matched by accident
                ent1 = modifier + " " + prefix + " " + tok.text
                prefix = ""  # Reset the accumulators for the next entity
                modifier = ""
                prv_tok_dep = ""
                prv_tok_text = ""
            if "obj" in tok.dep_:  # Object (dobj, pobj, ...)
                ent2 = modifier + " " + prefix + " " + tok.text
            prv_tok_dep = tok.dep_  # Remember this token for the next iteration
            prv_tok_text = tok.text
    return [ent1.strip(), ent2.strip()]
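# Illustrative call (output is model-dependent, so treat it as approximate):
#   get_entities("the film had 200 patents")
#   # => ['film', '200 patents']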
def get_relation(sent):
    """Extract the main relation (predicate) from a sentence with a rule-based pattern."""
    doc = nlp(sent)
    matcher = Matcher(nlp.vocab)
    # Match the root verb, optionally followed by a preposition, agent, or adjective
    pattern = [{'DEP': 'ROOT'},
               {'DEP': 'prep', 'OP': "?"},
               {'DEP': 'agent', 'OP': "?"},
               {'POS': 'ADJ', 'OP': "?"}]
    matcher.add("matching_1", [pattern])
    matches = matcher(doc)
    if matches:
        k = len(matches) - 1  # Use the last (longest) match
        span = doc[matches[k][1]:matches[k][2]]
        return span.text
    else:
        return ""  # No match found
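# Illustrative call (hypothetical sentence; the parser's ROOT determines the output):
#   get_relation("John completed the task")  # likely "completed"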
def render_graph(G):
    """Render a networkx graph as an interactive pyvis network embedded in an iframe."""
    from pyvis.network import Network
    graph_output_directory = "./docs/index.html"
    os.makedirs("./docs", exist_ok=True)  # Ensure the output directory exists before writing
    net = Network(
        notebook=False,
        # bgcolor="#1a1a1a",
        cdn_resources="remote",
        height="900px",
        width="100%",
        select_menu=True,
        # font_color="#cccccc",
        filter_menu=False,
    )
    net.from_nx(G)  # Import nodes and edges from the networkx graph
    # net.repulsion(node_distance=150, spring_length=400)
    net.force_atlas_2based(central_gravity=0.015, gravity=-31)  # Physics layout
    # net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
    net.show_buttons(filter_=["physics"])  # Expose physics controls in the rendered page
    net.show(graph_output_directory)  # Write the interactive HTML file
    with open(graph_output_directory, 'r') as file:
        html_content = file.read()
    # Swap single quotes for double quotes so the HTML survives inside srcdoc='...'
    html_content = html_content.replace("'", "\"")
    iframe = f"""<iframe style="width: 100%; height: 480px" name="result" allow="midi; geolocation; microphone; camera;
    display-capture; encrypted-media;" sandbox="allow-modals allow-forms
    allow-scripts allow-same-origin allow-popups
    allow-top-navigation-by-user-activation allow-downloads" allowfullscreen=""
    allowpaymentrequest="" frameborder="0" srcdoc='{html_content}'></iframe>"""
    return iframe
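# Quick sanity check with a built-in networkx sample graph (writes ./docs/index.html):
#   html = render_graph(nx.karate_club_graph())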
def execute_process(file, edge):
    """Build the knowledge graph from a PDF and render it, optionally filtered to one edge label."""
    candidate_sentences = read_pdf(file)
    # Only re-extract if there is no cached result or a different file was uploaded
    if 'kg_df' not in globals() or 'file' not in globals() or file != globals()['file']:
        entity_pairs = []
        for i in tqdm(candidate_sentences["sentence"]):  # Extract an entity pair per sentence
            entity_pairs.append(get_entities(i))
        relations = [get_relation(i) for i in tqdm(candidate_sentences['sentence'])]  # Extract the relation per sentence
        source = [i[0] for i in entity_pairs]  # Subjects
        target = [i[1] for i in entity_pairs]  # Objects
        globals()['kg_df'] = pd.DataFrame({'source': source, 'target': target, 'edge': relations})  # Cache the triples
        globals()['file'] = file  # Remember which file produced the cache
    edge_counts = kg_df['edge'].value_counts()
    unique_edges_df = pd.DataFrame({'edge': edge_counts.index, 'count': edge_counts.values})  # Unique edge labels with counts
    if len(edge) == 0:  # No edge filter: graph every triple
        G = nx.from_pandas_edgelist(kg_df, "source", "target",
                                    edge_attr=True, create_using=nx.MultiDiGraph())
    else:  # Graph only the triples with the requested edge label
        G = nx.from_pandas_edgelist(kg_df[kg_df['edge'] == edge], "source", "target",
                                    edge_attr=True, create_using=nx.MultiDiGraph())
    iframe = render_graph(G)
    return iframe, unique_edges_df  # Return the embedded graph HTML and the edge table
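# Minimal programmatic usage sketch (hypothetical file name; an empty edge string graphs everything):
#   iframe_html, edges_df = execute_process("document.pdf", "")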
inputs = [
    gr.File(label="Upload PDF"),  # File input for the PDF
    gr.Textbox(label="Graph a particular edge", type="text")  # Optional edge-label filter
]
outputs = [
    gr.HTML(label="Generated graph"),  # HTML output for the interactive graph
    gr.Dataframe(label="Unique edges", type="pandas")  # Table of edge labels and counts
]
description = 'This app generates a knowledge graph from a PDF document. It uses gradio for the web interface, spacy for natural language processing, networkx and pyvis for graph construction and rendering, and fitz (PyMuPDF) for PDF text extraction.'
iface = gr.Interface(fn=execute_process, inputs=inputs, outputs=outputs, title="PDF - NLP Knowledge graph - Interactive", description=description)
iface.launch()  # Launch the web app