import gradio as gr  # Web interface
import fitz  # PyMuPDF, for PDF text extraction
import spacy  # Natural language processing
import spacy.cli  # For downloading spaCy models
import pandas as pd  # Data manipulation
from spacy.matcher import Matcher  # Rule-based pattern matching over tokens
import networkx as nx  # Graph construction
import matplotlib.pyplot as plt  # Graph rendering
from tqdm import tqdm  # Progress bars

try:
    nlp = spacy.load('en_core_web_sm')  # Load the small English pipeline
except OSError:
    print("Model not found. Downloading...")
    spacy.cli.download("en_core_web_sm")  # Fetch the model, then load it
    nlp = spacy.load('en_core_web_sm')

def read_pdf(file):  # Extract text from a PDF, one line per row
    doc = fitz.open(file)  # Open the PDF file
    text = []
    for page in doc:
        # Splitting the page text on newlines yields lines rather than true sentences
        for sentence in page.get_text("text").split('\n'):
            if len(sentence) > 0:  # Skip empty lines
                text.append(sentence)
    return pd.DataFrame(text, columns=["sentence"])
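
# A rough usage sketch (assuming a hypothetical local file "document.pdf"):
#   df = read_pdf("document.pdf")
#   df.head()  # One extracted line of text per row, in a "sentence" column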

def read_csv(file):  # Alternative input path: read pre-extracted sentences from a CSV
    candidate_sentences = pd.read_csv(file)  # Read the uploaded file, not a hardcoded local path
    return candidate_sentences  # Note: not wired into the Gradio interface below

def get_entities(sent):  # Extract a (subject, object) pair from a sentence
    ent1 = ""  # First entity (subject)
    ent2 = ""  # Second entity (object)

    prv_tok_dep = ""  # Dependency tag of the previous token
    prv_tok_text = ""  # Text of the previous token

    prefix = ""  # Accumulates compound words
    modifier = ""  # Accumulates modifiers

    for tok in nlp(sent):  # For each token in the parsed sentence
        if tok.dep_ != "punct":  # Skip punctuation
            if tok.dep_ == "compound":
                prefix = tok.text
                if prv_tok_dep == "compound":  # Chain consecutive compounds
                    prefix = prv_tok_text + " " + tok.text

            if tok.dep_.endswith("mod"):  # Modifier (e.g. amod, nummod)
                modifier = tok.text
                if prv_tok_dep == "compound":
                    modifier = prv_tok_text + " " + tok.text

            if "subj" in tok.dep_:  # Subject (e.g. nsubj, nsubjpass)
                ent1 = modifier + " " + prefix + " " + tok.text
                prefix = ""  # Reset the accumulators for the next entity
                modifier = ""
                prv_tok_dep = ""
                prv_tok_text = ""

            if "obj" in tok.dep_:  # Object (e.g. dobj, pobj)
                ent2 = modifier + " " + prefix + " " + tok.text

            prv_tok_dep = tok.dep_  # Remember this token for the next iteration
            prv_tok_text = tok.text

    return [ent1.strip(), ent2.strip()]  # Strip the padding introduced above
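
# A rough usage sketch (exact output depends on the spaCy model version):
#   get_entities("John completed the task")
#   => ['John', 'task']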

def get_relation(sent):  # Define a function to get the relation from a sentence
    doc = nlp(sent)  # Process the sentence
    matcher = Matcher(nlp.vocab)  # Initialize a Matcher with the vocabulary
    # Match the root verb, optionally followed by a preposition, an agent, and an adjective
    pattern = [{'DEP': 'ROOT'},
               {'DEP': 'prep', 'OP': "?"},
               {'DEP': 'agent', 'OP': "?"},
               {'POS': 'ADJ', 'OP': "?"}]
    matcher.add("matching_1", [pattern])  # Register the pattern
    matches = matcher(doc)  # Match the pattern in the document
    if matches:  # If there are matches
        k = len(matches) - 1  # Get the index of the last match
        span = doc[matches[k][1]:matches[k][2]]  # Get the span of the match
        return span.text  # Return the text of the span
    else:
        return ""  # If there are no matches, return an empty string

def execute_process(file, edge):  # Build the knowledge graph and render it to PNG
    candidate_sentences = read_pdf(file)  # One line of text per row

    entity_pairs = []
    for i in tqdm(candidate_sentences["sentence"]):  # Subject/object pair per sentence
        entity_pairs.append(get_entities(i))
    relations = [get_relation(i) for i in tqdm(candidate_sentences['sentence'])]  # Relation per sentence

    source = [i[0] for i in entity_pairs]  # Subjects
    target = [i[1] for i in entity_pairs]  # Objects
    kg_df = pd.DataFrame({'source': source, 'target': target, 'edge': relations})

    edge_counts = kg_df['edge'].value_counts()  # Frequency of each relation
    unique_edges_df = pd.DataFrame({'edge': edge_counts.index, 'count': edge_counts.values})

    # If an edge is specified, graph only that relation; otherwise graph everything
    plot_df = kg_df if len(edge) == 0 else kg_df[kg_df['edge'] == edge]
    G = nx.from_pandas_edgelist(plot_df, "source", "target",
                                edge_attr=True, create_using=nx.MultiDiGraph())
    plt.figure(figsize=(12, 12))
    pos = nx.spring_layout(G)  # Force-directed node layout
    nx.draw(G, with_labels=True, node_color='skyblue', node_size=1500,
            edge_cmap=plt.cm.Blues, pos=pos)
    plt.savefig("graph.png")
    plt.close()  # Release the figure so repeated calls don't accumulate memory
    return "graph.png", unique_edges_df
    
inputs = [
    gr.File(label="Upload PDF"),  # Create a file input for uploading a PDF
    gr.Textbox(label="Graph a particular edge", type="text")  # Create a textbox input for specifying an edge
]

outputs = [
    gr.Image(label="Generated graph"),  # Create an image output for the generated graph
    gr.Dataframe(label="Unique edges", type="pandas")  # Create a DataFrame output for the unique edges
]

description = ('Generates a knowledge graph from a PDF document. Uses gradio for the web interface, '
               'spacy for natural language processing, networkx and matplotlib for graph generation, '
               'and fitz (PyMuPDF) for PDF processing.')
iface = gr.Interface(fn=execute_process, inputs=inputs, outputs=outputs, title="PDF Knowledge graph", description=description)  # Create an interface
iface.launch()  # Launch the interface