import gradio as gr  # web interface
import os  # filesystem helpers (output directory for the graph HTML)
import fitz  # PyMuPDF, for PDF text extraction
import spacy  # natural language processing
import spacy.cli  # programmatic model download
import pandas as pd  # data manipulation
import networkx as nx  # graph construction
from spacy.matcher import Matcher  # rule-based pattern matching
from tqdm import tqdm  # progress bars

# Load the small English pipeline, downloading it on first run if necessary.
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    print("Model not found. Downloading...")
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load('en_core_web_sm')

def read_pdf(file):
    """Read a PDF and return one line of text per DataFrame row."""
    doc = fitz.open(file)
    text = []
    for page in doc:  # collect every non-empty line on every page
        for sentence in page.get_text("text").split('\n'):
            if len(sentence) > 0:
                text.append(sentence)
    return pd.DataFrame(text, columns=["sentence"])

def read_csv(file):
    """Read a CSV of candidate sentences (expects a 'sentence' column)."""
    candidate_sentences = pd.read_csv(file)
    return candidate_sentences

def get_entities(sent):
    """Extract a [subject, object] pair from a sentence via its dependency parse."""
    ent1 = ""
    ent2 = ""
    prv_tok_dep = ""   # dependency tag of the previous token
    prv_tok_text = ""  # text of the previous token
    prefix = ""        # accumulated compound-noun prefix
    modifier = ""      # accumulated modifier

    for tok in nlp(sent):
        if tok.dep_ != "punct":  # skip punctuation
            if tok.dep_ == "compound":  # build up compound nouns
                prefix = tok.text
                if prv_tok_dep == "compound":  # chain consecutive compounds
                    prefix = prv_tok_text + " " + tok.text
            if tok.dep_.endswith("mod"):  # build up modifiers
                modifier = tok.text
                if prv_tok_dep == "compound":
                    modifier = prv_tok_text + " " + tok.text
            if "subj" in tok.dep_:  # subject found: assemble the first entity
                ent1 = modifier + " " + prefix + " " + tok.text
                prefix = ""
                modifier = ""
                prv_tok_dep = ""
                prv_tok_text = ""
            if "obj" in tok.dep_:  # object found: assemble the second entity
                ent2 = modifier + " " + prefix + " " + tok.text
            prv_tok_dep = tok.dep_
            prv_tok_text = tok.text

    return [ent1.strip(), ent2.strip()]

def get_relation(sent):
    """Extract the main relation (root verb plus optional prep/agent/adjective)."""
    doc = nlp(sent)
    matcher = Matcher(nlp.vocab)
    pattern = [
        {'DEP': 'ROOT'},
        {'DEP': 'prep', 'OP': "?"},
        {'DEP': 'agent', 'OP': "?"},
        {'POS': 'ADJ', 'OP': "?"},
    ]
    matcher.add("matching_1", [pattern])
    matches = matcher(doc)
    if matches:
        k = len(matches) - 1  # take the last (longest) match
        span = doc[matches[k][1]:matches[k][2]]
        return span.text
    else:
        return ""  # no relation found
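# Illustrative sanity checks for the two helpers above (made-up sentences, not
# app data; exact output depends on the spaCy model version):
#
#     get_entities("the film had 200 patents")   # roughly ['film', '200 patents']
#     get_relation("John completed the task")    # roughly 'completed'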
def render_graph(G):
    """Render a networkx graph to interactive HTML with pyvis, wrapped in an iframe."""
    from pyvis.network import Network

    graph_output_directory = "./docs/index.html"
    os.makedirs(os.path.dirname(graph_output_directory), exist_ok=True)

    net = Network(
        notebook=False,
        # bgcolor="#1a1a1a",
        cdn_resources="remote",
        height="900px",
        width="100%",
        select_menu=True,
        # font_color="#cccccc",
        filter_menu=False,
    )
    net.from_nx(G)
    net.force_atlas_2based(central_gravity=0.015, gravity=-31)
    # Alternative layouts:
    # net.repulsion(node_distance=150, spring_length=400)
    # net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
    net.show_buttons(filter_=["physics"])
    net.save_graph(graph_output_directory)  # write the HTML without opening a browser tab

    with open(graph_output_directory, 'r') as file:
        html_content = file.read()

    # Escape double quotes so the page can be embedded via the iframe's srcdoc attribute.
    html_content = html_content.replace('"', '&quot;')
    iframe = f'<iframe srcdoc="{html_content}" width="100%" height="900px" frameborder="0"></iframe>'
    return iframe

def execute_process(file, edge):
    """Build (or reuse) the knowledge-graph triples and render them as an interactive graph."""
    candidate_sentences = read_pdf(file)

    # Recompute the triples only when there is no cached result for this file.
    if 'kg_df' not in globals() or 'file' not in globals() or file != globals()['file']:
        entity_pairs = []
        for i in tqdm(candidate_sentences["sentence"]):
            entity_pairs.append(get_entities(i))
        relations = [get_relation(i) for i in tqdm(candidate_sentences['sentence'])]
        source = [i[0] for i in entity_pairs]  # subjects
        target = [i[1] for i in entity_pairs]  # objects
        globals()['kg_df'] = pd.DataFrame({'source': source, 'target': target, 'edge': relations})
        globals()['file'] = file  # remember which file the cache belongs to

    edge_counts = kg_df['edge'].value_counts()
    unique_edges_df = pd.DataFrame({'edge': edge_counts.index, 'count': edge_counts.values})

    if len(edge) == 0:  # no edge filter: graph every triple
        G = nx.from_pandas_edgelist(kg_df, "source", "target",
                                    edge_attr=True, create_using=nx.MultiDiGraph())
    else:  # graph only the triples with the requested edge label
        G = nx.from_pandas_edgelist(kg_df[kg_df['edge'] == edge], "source", "target",
                                    edge_attr=True, create_using=nx.MultiDiGraph())
    iframe = render_graph(G)
    return iframe, unique_edges_df  # the embedded graph and the edge summary
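# A minimal sketch of driving the pipeline headlessly, without the Gradio UI
# (assumes a local "sample.pdf"; the filename is hypothetical):
#
#     iframe_html, edges_df = execute_process("sample.pdf", edge="")
#     print(edges_df.head())          # most frequent relations first
#     edges_df.to_csv("edges.csv")    # or persist the edge summary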
inputs = [
    gr.File(label="Upload PDF"),  # PDF upload
    gr.Textbox(label="Graph a particular edge", type="text"),  # optional edge filter
]

outputs = [
    gr.HTML(label="Generated graph"),  # interactive pyvis graph, embedded as HTML
    gr.Dataframe(label="Unique edges", type="pandas"),  # edge labels and their counts
]

description = ('This app generates a knowledge graph from a PDF document. It uses several '
               'libraries: gradio for the web interface, spacy for natural language processing, '
               'networkx and pyvis for graph generation, and fitz (PyMuPDF) for PDF processing.')

iface = gr.Interface(fn=execute_process, inputs=inputs, outputs=outputs,
                     title="PDF - NLP Knowledge graph - Interactive",
                     description=description)
iface.launch()
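# launch() serves on localhost by default; Gradio can also create a temporary
# public URL with iface.launch(share=True) if the app needs to be shared.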