Chris Finlayson committed · Commit 26f7059 · Parent(s): ee9fa1c

Functional app
app.py
CHANGED
@@ -1,172 +1,138 @@
- import gradio as gr
- import os
- import fitz
- import re
- import spacy
- import spacy.cli
- import
- import
- import
- import
- import spacy
- from spacy import
-
- from
-
- from tqdm import tqdm

  try:
-     nlp = spacy.load('en_core_web_sm')
  except OSError:
-     print("Model not found. Downloading...")
-     spacy.cli.download("en_core_web_sm")
-     nlp = spacy.load('en_core_web_sm')
-
- #
-
- #
- #
-
-     ent1 = ""
-     ent2 = ""
-
-     prv_tok_dep = ""
-     prv_tok_text = ""
-
-     prefix = ""
-     modifier = ""
-
-             # check: token is a compound word or not
-             if tok.dep_ == "compound":
-                 prefix = tok.text
-                 # if the previous word was also a 'compound' then add the current word to it
-                 if prv_tok_dep == "compound":
-                     prefix = prv_tok_text + " "+ tok.text

-                 if prv_tok_dep == "compound":
-                     modifier = prv_tok_text + " "+ tok.text

-             if tok.dep_.find("obj") == True:
-                 ent2 = modifier +" "+ prefix +" "+ tok.text

-
- #
-
- #
-
- #
-
-     matcher.add("matching_1", [pattern])
-
-     matches = matcher(doc)
-     k = len(matches) - 1
-
-     span = doc[matches[k][1]:matches[k][2]]
-
-     return(span.text)
-
- def ulify(elements):
-     string = "<ul>\n"
-     string += "\n".join(["<li>" + str(s) + "</li>" for s in elements])
-     string += "\n</ul>"
-     return string
-
- def execute_process(file, edge):
-     # candidate_sentences = pd.DataFrame(read_pdf(file), columns=['Sentences'])
-     candidate_sentences = pd.read_csv(file)
-
-     entity_pairs = []
-     for i in tqdm(candidate_sentences["sentence"]):
-         entity_pairs.append(get_entities(i))
-     relations = [get_relation(i) for i in tqdm(candidate_sentences['sentence'])]
-     # extract subject
-     source = [i[0] for i in entity_pairs]
-
-     # extract object
-     target = [i[1] for i in entity_pairs]
-     kg_df = pd.DataFrame({'source':source, 'target':target, 'edge':relations})

-     # create a dataframe of all unique edges and their counts
-     edge_counts = kg_df['edge'].value_counts()
-     unique_edges_df = pd.DataFrame({'edge': edge_counts.index, 'count': edge_counts.values})

-     if edge is
          G=nx.from_pandas_edgelist(kg_df[kg_df['edge']==edge], "source", "target",
-                                   edge_attr=True, create_using=nx.MultiDiGraph())
-         plt.figure(figsize=(12,12))
-         pos = nx.spring_layout(G)
-         nx.draw(G, with_labels=True, node_color='skyblue', edge_cmap=plt.cm.Blues, pos = pos)
-         plt.savefig("graph.png")
-
-         return "graph.png", unique_edges_df
-
-     else:
-         plt.figure(figsize=(12,12))
-         pos = nx.spring_layout(G, k = 0.5) # k regulates the distance between nodes
-         nx.draw(G, with_labels=True, node_color='skyblue', node_size=1500, edge_cmap=plt.cm.Blues, pos = pos)
-         plt.savefig("graph.png")
-         # return "graph.png", "\n".join(unique_edges)
-         return "graph.png", unique_edges_df

  inputs = [
-     gr.File(label="Upload PDF"),
-     gr.Textbox(label="Graph a particular edge", type="text")
  ]

  outputs = [
-     gr.Image(label="Generated graph"),
-     gr.Dataframe(label="Unique edges", type="pandas")
  ]

- description = 'This
- iface = gr.Interface(fn=execute_process, inputs=inputs, outputs=outputs, title="PDF Knowledge graph", description=description)
- iface.launch()
+ import gradio as gr  # Importing gradio for creating the web interface
+ import os  # Importing os for operating system related tasks
+ import fitz  # Importing fitz (PyMuPDF) for PDF processing
+ import re  # Importing re for regular expressions
+ import spacy  # Importing spacy for natural language processing
+ import spacy.cli  # Importing spacy's command line interface
+ import pandas as pd  # Importing pandas for data manipulation
+ import bs4  # Importing bs4 for web scraping
+ import requests  # Importing requests for making HTTP requests
+ from spacy import displacy  # Importing displacy from spacy for visualizing NLP results
+ from spacy.matcher import Matcher  # Importing Matcher from spacy for matching patterns
+ from spacy.tokens import Span  # Importing Span from spacy for handling spans of tokens
+ import networkx as nx  # Importing networkx for creating, manipulating, and studying graphs
+ import matplotlib.pyplot as plt  # Importing matplotlib.pyplot for data visualization
+ from tqdm import tqdm  # Importing tqdm for progress bars
+
  try:
+     nlp = spacy.load('en_core_web_sm')  # Trying to load the English model
  except OSError:
+     print("Model not found. Downloading...")  # If the model is not found, print a message
+     spacy.cli.download("en_core_web_sm")  # Download the English model
+     nlp = spacy.load('en_core_web_sm')  # Load the English model
+
+ def read_pdf(file):  # Define a function to read a PDF file
+     doc = fitz.open(file)  # Open the PDF file
+     text = []  # Initialize an empty list to store the text
+     for page in doc:  # For each page in the document
+         for sentence in page.get_text("text").split('\n'):  # For each line of text on the page
+             if len(sentence) > 0:  # If the line is not empty
+                 text.append(sentence)  # Append it to the list
+     return pd.DataFrame(text, columns=["sentence"])  # Return a DataFrame of the sentences
+
+ def read_csv(file):  # Define a function to read a CSV file
+     candidate_sentences = pd.read_csv(file)  # Read the CSV file
+     return candidate_sentences  # Return the DataFrame
+
+ def get_entities(sent):  # Define a function to get the entities from a sentence
+     ent1 = ""  # Initialize an empty string for the first entity
+     ent2 = ""  # Initialize an empty string for the second entity
+
+     prv_tok_dep = ""  # Initialize an empty string for the dependency tag of the previous token
+     prv_tok_text = ""  # Initialize an empty string for the previous token
+
+     prefix = ""  # Initialize an empty string for the prefix
+     modifier = ""  # Initialize an empty string for the modifier
+
+     for tok in nlp(sent):  # For each token in the sentence
+         if tok.dep_ != "punct":  # If the token is not a punctuation mark
+             if tok.dep_ == "compound":  # If the token is a compound word
+                 prefix = tok.text  # Set the prefix to the token text
+                 if prv_tok_dep == "compound":  # If the previous token was also a compound word
+                     prefix = prv_tok_text + " " + tok.text  # Add the current word to the prefix
+
+             if tok.dep_.endswith("mod"):  # If the token is a modifier
+                 modifier = tok.text  # Set the modifier to the token text
+                 if prv_tok_dep == "compound":  # If the previous token was also a compound word
+                     modifier = prv_tok_text + " " + tok.text  # Add the current word to the modifier
+
+             if "subj" in tok.dep_:  # If the token is a subject
+                 ent1 = modifier + " " + prefix + " " + tok.text  # Build the first entity from the modifier, prefix, and token text
+                 prefix = ""  # Reset the prefix
+                 modifier = ""  # Reset the modifier
+                 prv_tok_dep = ""  # Reset the dependency tag of the previous token
+                 prv_tok_text = ""  # Reset the previous token
+
+             if "obj" in tok.dep_:  # If the token is an object
+                 ent2 = modifier + " " + prefix + " " + tok.text  # Build the second entity from the modifier, prefix, and token text
+
+             prv_tok_dep = tok.dep_  # Update the dependency tag of the previous token
+             prv_tok_text = tok.text  # Update the previous token
+
+     return [ent1.strip(), ent2.strip()]  # Return the two entities
+
+ def get_relation(sent):  # Define a function to get the relation from a sentence
+     doc = nlp(sent)  # Process the sentence
+     matcher = Matcher(nlp.vocab)  # Initialize a Matcher with the vocabulary
+     pattern = [{'DEP':'ROOT'},
+                {'DEP':'prep','OP':"?"},
+                {'DEP':'agent','OP':"?"},
+                {'POS':'ADJ','OP':"?"}]  # Define the pattern: the root verb plus optional preposition, agent, and adjective
+     matcher.add("matching_1", [pattern])  # Add the pattern to the matcher
+     matches = matcher(doc)  # Match the pattern against the document
+     if matches:  # If there are matches
+         k = len(matches) - 1  # Get the index of the last match
+         span = doc[matches[k][1]:matches[k][2]]  # Get the matched span
+         return span.text  # Return the text of the span
+     else:
+         return ""  # If there are no matches, return an empty string

+ def execute_process(file, edge):  # Define a function to execute the whole process
+     candidate_sentences = read_pdf(file)  # Read the PDF file into a DataFrame of sentences
+
+     entity_pairs = []  # Initialize an empty list for the entity pairs
+     for i in tqdm(candidate_sentences["sentence"]):  # For each sentence in the DataFrame
+         entity_pairs.append(get_entities(i))  # Append its entity pair to the list
+     relations = [get_relation(i) for i in tqdm(candidate_sentences['sentence'])]  # Get the relation for each sentence
+
+     source = [i[0] for i in entity_pairs]  # Extract the subjects
+     target = [i[1] for i in entity_pairs]  # Extract the objects
+     kg_df = pd.DataFrame({'source':source, 'target':target, 'edge':relations})  # Create a DataFrame of sources, targets, and edges
+
+     unique_edges = kg_df['edge'].unique() if kg_df['edge'].nunique() != 0 else None  # Get the unique edges
+     edge_counts = kg_df['edge'].value_counts()  # Count the occurrences of each edge
+     unique_edges_df = pd.DataFrame({'edge': edge_counts.index, 'count': edge_counts.values})  # Create a DataFrame of unique edges and their counts

+     if len(edge) == 0:  # If no edge is specified
+         G = nx.from_pandas_edgelist(kg_df, "source", "target",
+                                     edge_attr=True, create_using=nx.MultiDiGraph())  # Create a graph from the full DataFrame
+         plt.figure(figsize=(12,12))  # Create a figure
+         pos = nx.spring_layout(G)  # Compute positions for the nodes
+         nx.draw(G, with_labels=True, node_color='skyblue', node_size=1500, edge_cmap=plt.cm.Blues, pos=pos)  # Draw the graph
+         plt.savefig("graph.png")  # Save the graph as a PNG
+         return "graph.png", unique_edges_df  # Return the path to the PNG and the DataFrame of unique edges
+
+     else:  # If an edge is specified
          G=nx.from_pandas_edgelist(kg_df[kg_df['edge']==edge], "source", "target",
+                                   edge_attr=True, create_using=nx.MultiDiGraph())  # Create a graph from only the rows with that edge
+         plt.figure(figsize=(12,12))  # Create a figure
+         pos = nx.spring_layout(G)  # Compute positions for the nodes
+         nx.draw(G, with_labels=True, node_color='skyblue', node_size=1500, edge_cmap=plt.cm.Blues, pos=pos)  # Draw the graph
+         plt.savefig("graph.png")  # Save the graph as a PNG
+         return "graph.png", unique_edges_df  # Return the path to the PNG and the DataFrame of unique edges

  inputs = [
+     gr.File(label="Upload PDF"),  # A file input for uploading a PDF
+     gr.Textbox(label="Graph a particular edge", type="text")  # A textbox for specifying an edge
  ]

  outputs = [
+     gr.Image(label="Generated graph"),  # An image output for the generated graph
+     gr.Dataframe(label="Unique edges", type="pandas")  # A DataFrame output for the unique edges
  ]

+ description = 'This Python script generates a knowledge graph from a PDF document. It uses several libraries including gradio for the web interface, spacy for natural language processing, networkx and matplotlib for graph generation, and fitz for PDF processing.'
+ iface = gr.Interface(fn=execute_process, inputs=inputs, outputs=outputs, title="PDF Knowledge graph", description=description)  # Create the interface
+ iface.launch()  # Launch the interface
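For reference, here is a minimal standalone sketch of the Matcher pattern that get_relation relies on, runnable outside the app. The input sentence and the expected output noted in the final comment are illustrative assumptions, not data from this repository:

# Hypothetical demo of the relation pattern used by get_relation above.
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
doc = nlp("John completed the report")  # illustrative sentence (assumption)

matcher = Matcher(nlp.vocab)
pattern = [{'DEP': 'ROOT'},             # the sentence's root verb
           {'DEP': 'prep', 'OP': "?"},  # optional trailing preposition
           {'DEP': 'agent', 'OP': "?"}, # optional agent (e.g. "by" in passives)
           {'POS': 'ADJ', 'OP': "?"}]   # optional trailing adjective
matcher.add("matching_1", [pattern])

matches = matcher(doc)  # each match is a (match_id, start, end) tuple
if matches:
    start, end = matches[-1][1], matches[-1][2]
    print(doc[start:end].text)  # expected: the root verb, e.g. "completed"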
graph.png
CHANGED
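The regenerated graph.png comes from the networkx/matplotlib drawing step in execute_process. Below is a tiny sketch of that step on a hand-made edge list; the sample rows are illustrative assumptions, not extracted data:

# Minimal sketch of the graph-drawing step, using the same calls as execute_process.
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

# Hypothetical edge list in the same source/target/edge shape as kg_df.
kg_df = pd.DataFrame({'source': ['john', 'john', 'report'],
                      'target': ['report', 'task', 'deadline'],
                      'edge':   ['wrote', 'completed', 'has']})

G = nx.from_pandas_edgelist(kg_df, "source", "target",
                            edge_attr=True, create_using=nx.MultiDiGraph())
plt.figure(figsize=(6, 6))
nx.draw(G, pos=nx.spring_layout(G), with_labels=True,
        node_color='skyblue', node_size=1500)
plt.savefig("graph.png")  # writes the rendered figure to disk, as the app does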