File size: 9,882 Bytes
02cc2d7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7b07841
399c03f
02cc2d7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
import gradio as gr  # Importing gradio for creating web interface
import os  # Importing os for operating system related tasks
import fitz  # Importing fitz for PDF processing
import re  # Importing re for regular expressions
import spacy  # Importing spacy for natural language processing
import spacy.cli  # Importing spacy's command line interface
import pandas as pd  # Importing pandas for data manipulation
import bs4  # Importing bs4 for web scraping
import requests  # Importing requests for making HTTP requests
from spacy import displacy  # Importing displacy from spacy for visualizing NLP results
from spacy.matcher import Matcher  # Importing Matcher from spacy for matching patterns
from spacy.tokens import Span  # Importing Span from spacy for handling spans of tokens
import networkx as nx  # Importing networkx for creating, manipulating, and studying graphs
import matplotlib.pyplot as plt  # Importing matplotlib.pyplot for data visualization
from tqdm import tqdm  # Importing tqdm for progress bars

# Load spacy's small English pipeline, fetching it first if it is not
# installed locally (e.g. on a fresh environment).
_MODEL = 'en_core_web_sm'
try:
    nlp = spacy.load(_MODEL)
except OSError:
    print("Model not found. Downloading...")
    spacy.cli.download(_MODEL)
    nlp = spacy.load(_MODEL)

def read_pdf(file):
    """Read a PDF and return a one-column DataFrame of its non-empty lines.

    Parameters
    ----------
    file : str | path-like
        Path to the PDF file, as supplied by the gradio File input.

    Returns
    -------
    pd.DataFrame
        One "sentence" column, one row per non-empty text line extracted
        from the document.
    """
    lines = []
    # Use a context manager so the underlying file handle is released even
    # if text extraction raises (the original never closed the document).
    with fitz.open(file) as doc:
        for page in doc:
            # get_text("text") yields plain text; keep non-empty physical lines.
            lines.extend(
                line for line in page.get_text("text").split('\n') if line
            )
    return pd.DataFrame(lines, columns=["sentence"])

def read_csv(file):
    """Read a CSV of candidate sentences.

    Parameters
    ----------
    file : str | path-like | file-like
        The CSV to read (anything ``pd.read_csv`` accepts).

    Returns
    -------
    pd.DataFrame
        The parsed CSV contents.
    """
    # Bug fix: the original ignored the `file` argument and read a
    # hard-coded local path that only existed on the author's machine.
    return pd.read_csv(file)

def get_entities(sent):
    """Extract a (subject, object) entity pair from a single sentence.

    Walks the dependency parse once, accumulating compound-word prefixes and
    modifiers, and records the last subject-like token as entity 1 and the
    last object-like token as entity 2.

    Parameters
    ----------
    sent : str
        Sentence to parse with the module-level spacy pipeline ``nlp``.

    Returns
    -------
    list[str]
        ``[entity1, entity2]``; either may be "" when no subject/object
        is found.
    """
    ent1 = ""
    ent2 = ""

    prv_tok_dep = ""   # dependency tag of the previous (non-punct) token
    prv_tok_text = ""  # text of the previous (non-punct) token

    prefix = ""        # accumulated compound-word prefix
    modifier = ""      # accumulated modifier

    for tok in nlp(sent):
        if tok.dep_ == "punct":  # skip punctuation entirely
            continue

        if tok.dep_ == "compound":
            prefix = tok.text
            # Chain consecutive compounds ("ice cream truck") into one prefix.
            if prv_tok_dep == "compound":
                prefix = prv_tok_text + " " + tok.text

        if tok.dep_.endswith("mod"):  # amod / nmod / advmod / ...
            modifier = tok.text
            if prv_tok_dep == "compound":
                modifier = prv_tok_text + " " + tok.text

        # Bug fix: the original tested `tok.dep_.find("subj") == True`, which
        # compares a string index with a boolean and is only true when "subj"
        # starts at index 1 (e.g. "nsubj"); it silently missed other
        # subject-like tags. A substring test is the evident intent.
        if "subj" in tok.dep_:
            ent1 = modifier + " " + prefix + " " + tok.text
            # Reset accumulators so the object phrase starts fresh.
            prefix = ""
            modifier = ""
            prv_tok_dep = ""
            prv_tok_text = ""

        if "obj" in tok.dep_:  # same fix as above for object-like tags
            ent2 = modifier + " " + prefix + " " + tok.text

        prv_tok_dep = tok.dep_
        prv_tok_text = tok.text

    return [ent1.strip(), ent2.strip()]

def get_relation(sent):
    """Return the relation (predicate) phrase of a sentence.

    Matches the syntactic ROOT optionally followed by a preposition, an
    agent marker, and an adjective, and returns the text of the last such
    match, or "" when nothing matches.
    """
    doc = nlp(sent)

    matcher = Matcher(nlp.vocab)
    matcher.add("matching_1", [[
        {'DEP': 'ROOT'},
        {'DEP': 'prep', 'OP': "?"},
        {'DEP': 'agent', 'OP': "?"},
        {'POS': 'ADJ', 'OP': "?"},
    ]])

    matches = matcher(doc)
    if not matches:
        return ""
    # Keep only the final match, mirroring the original's choice of k.
    _, start, end = matches[-1]
    return doc[start:end].text

def render_graph(G):
    """Render a networkx graph to an interactive pyvis page inside an iframe.

    Parameters
    ----------
    G : networkx.Graph
        The graph to visualize.

    Returns
    -------
    str
        An ``<iframe>`` snippet whose ``srcdoc`` embeds the generated
        pyvis HTML page.
    """
    from pyvis.network import Network

    graph_output_directory = "./docs/index.html"
    # pyvis writes the page to disk; make sure the target directory exists
    # (net.show fails outright when ./docs is missing).
    os.makedirs(os.path.dirname(graph_output_directory), exist_ok=True)

    net = Network(
        notebook=False,
        cdn_resources="remote",
        height="900px",
        width="100%",
        select_menu=True,
        filter_menu=False,
    )
    net.from_nx(G)
    net.force_atlas_2based(central_gravity=0.015, gravity=-31)
    net.show_buttons(filter_=["physics"])
    net.show(graph_output_directory)

    # Read the page back with an explicit encoding so non-ASCII node labels
    # do not crash on platforms whose default locale is not UTF-8.
    with open(graph_output_directory, 'r', encoding='utf-8') as fh:
        html_content = fh.read()

    # The iframe below quotes srcdoc with single quotes, so single quotes in
    # the embedded document must be escaped. The original replaced them with
    # double quotes, which corrupts JavaScript string literals in the pyvis
    # page; the HTML character reference keeps the page intact.
    html_content = html_content.replace("'", "&#39;")

    iframe = f"""<iframe style="width: 100%; height: 480px" name="result" allow="midi; geolocation; microphone; camera; 
    display-capture; encrypted-media;" sandbox="allow-modals allow-forms 
    allow-scripts allow-same-origin allow-popups 
    allow-top-navigation-by-user-activation allow-downloads" allowfullscreen="" 
    allowpaymentrequest="" frameborder="0" srcdoc='{html_content}'></iframe>"""

    return iframe

def execute_process(file, edge):
    """Build a knowledge graph from a PDF and render it as an iframe.

    Parameters
    ----------
    file : str | path-like
        PDF file to process (from the gradio File input).
    edge : str
        When non-empty, only triples whose relation equals this value are
        graphed; when empty, the full graph is drawn.

    Returns
    -------
    tuple[str, pd.DataFrame]
        The iframe HTML of the rendered graph and a DataFrame of unique
        edges with their counts.
    """
    # Cache the extracted triples on the function object so re-running on the
    # same file (e.g. to filter by a different edge) skips the expensive NLP
    # pass. This replaces the original's fragile globals() juggling, which
    # stored a global literally named `file` and re-read the PDF every call.
    cache = getattr(execute_process, "_cache", None)
    if cache is None or cache[0] != file:
        candidate_sentences = read_pdf(file)
        entity_pairs = [get_entities(s) for s in tqdm(candidate_sentences["sentence"])]
        relations = [get_relation(s) for s in tqdm(candidate_sentences["sentence"])]
        kg_df = pd.DataFrame({
            'source': [pair[0] for pair in entity_pairs],
            'target': [pair[1] for pair in entity_pairs],
            'edge': relations,
        })
        execute_process._cache = (file, kg_df)
    else:
        kg_df = cache[1]

    # Table of distinct relations and how often each occurs.
    edge_counts = kg_df['edge'].value_counts()
    unique_edges_df = pd.DataFrame({'edge': edge_counts.index,
                                    'count': edge_counts.values})

    # Graph either every triple or only those matching the requested edge.
    plot_df = kg_df if len(edge) == 0 else kg_df[kg_df['edge'] == edge]
    G = nx.from_pandas_edgelist(plot_df, "source", "target",
                                edge_attr=True, create_using=nx.MultiDiGraph())
    iframe = render_graph(G)
    return iframe, unique_edges_df
    
# Gradio wiring: a PDF upload plus an optional edge filter go in; the
# rendered HTML graph and a table of unique edges come out.
inputs = [
    gr.File(label="Upload PDF"),
    gr.Textbox(label="Graph a particular edge", type="text"),
]

outputs = [
    gr.HTML(label="Generated graph"),
    gr.Dataframe(label="Unique edges", type="pandas"),
]

description = 'This Python script generates a knowledge graph from a PDF document. It uses several libraries including gradio for the web interface, spacy for natural language processing, networkx and pyvis for graph generation, and fitz for PDF processing.'

iface = gr.Interface(
    fn=execute_process,
    inputs=inputs,
    outputs=outputs,
    title="PDF - NLP Knowledge graph - Interactive",
    description=description,
)
iface.launch()