Chris Finlayson committed · Commit 26f7059 · Parent(s): ee9fa1c

Functional app
app.py
CHANGED
@@ -1,172 +1,138 @@
- import gradio as gr
- import os
- import fitz
- import re
- import spacy
- import spacy.cli
- import
- import
- import
- import
- import spacy
- from spacy import
-
- from
-
- from tqdm import tqdm

  try:
-     nlp = spacy.load('en_core_web_sm')
  except OSError:
-     print("Model not found. Downloading...")
-     spacy.cli.download("en_core_web_sm")
-     nlp = spacy.load('en_core_web_sm')
-
- #
-
- #
- #
-
-     ent1 = ""
-     ent2 = ""
-
-     prv_tok_dep = ""
-     prv_tok_text = ""
-
-     prefix = ""
-     modifier = ""
-
-             # check: token is a compound word or not
-             if tok.dep_ == "compound":
-                 prefix = tok.text
-                 # if the previous word was also a 'compound' then add the current word to it
-                 if prv_tok_dep == "compound":
-                     prefix = prv_tok_text + " "+ tok.text

-                 if prv_tok_dep == "compound":
-                     modifier = prv_tok_text + " "+ tok.text

-             if tok.dep_.find("obj") == True:
-                 ent2 = modifier +" "+ prefix +" "+ tok.text

-
- #
-
- #
-
- #
-
-     matcher.add("matching_1", [pattern])
-
-     matches = matcher(doc)
-     k = len(matches) - 1
-
-     span = doc[matches[k][1]:matches[k][2]]
-
-     return(span.text)
-
- def ulify(elements):
-     string = "<ul>\n"
-     string += "\n".join(["<li>" + str(s) + "</li>" for s in elements])
-     string += "\n</ul>"
-     return string
-
- def execute_process(file, edge):
-     # candidate_sentences = pd.DataFrame(read_pdf(file), columns=['Sentences'])
-     candidate_sentences = pd.read_csv(file)
-
-     entity_pairs = []
-     for i in tqdm(candidate_sentences["sentence"]):
-         entity_pairs.append(get_entities(i))
-     relations = [get_relation(i) for i in tqdm(candidate_sentences['sentence'])]
-     # extract subject
-     source = [i[0] for i in entity_pairs]
-
-     # extract object
-     target = [i[1] for i in entity_pairs]
-     kg_df = pd.DataFrame({'source':source, 'target':target, 'edge':relations})

-     # create a dataframe of all unique edges and their counts
-     edge_counts = kg_df['edge'].value_counts()
-     unique_edges_df = pd.DataFrame({'edge': edge_counts.index, 'count': edge_counts.values})

-     if edge is
          G=nx.from_pandas_edgelist(kg_df[kg_df['edge']==edge], "source", "target",
-                                   edge_attr=True, create_using=nx.MultiDiGraph())
-         plt.figure(figsize=(12,12))
-         pos = nx.spring_layout(G)
-         nx.draw(G, with_labels=True, node_color='skyblue', edge_cmap=plt.cm.Blues, pos = pos)
-         plt.savefig("graph.png")
-
-         return "graph.png", unique_edges_df
-
-     else:
-         plt.figure(figsize=(12,12))
-         pos = nx.spring_layout(G, k = 0.5) # k regulates the distance between nodes
-         nx.draw(G, with_labels=True, node_color='skyblue', node_size=1500, edge_cmap=plt.cm.Blues, pos = pos)
-         plt.savefig("graph.png")
-         # return "graph.png", "\n".join(unique_edges)
-         return "graph.png", unique_edges_df

  inputs = [
-     gr.File(label="Upload PDF"),
-     gr.Textbox(label="Graph a particular edge", type="text")
  ]

  outputs = [
-     gr.Image(label="Generated graph"),
-     gr.Dataframe(label="Unique edges", type="pandas")
  ]

- description = 'This
- iface = gr.Interface(fn=execute_process, inputs=inputs, outputs=outputs, title="PDF Knowledge graph", description=description)
- iface.launch()
+ import gradio as gr  # Importing gradio for creating the web interface
+ import os  # Importing os for operating system related tasks
+ import fitz  # Importing fitz (PyMuPDF) for PDF processing
+ import re  # Importing re for regular expressions
+ import spacy  # Importing spacy for natural language processing
+ import spacy.cli  # Importing spacy's command line interface
+ import pandas as pd  # Importing pandas for data manipulation
+ import bs4  # Importing bs4 for web scraping
+ import requests  # Importing requests for making HTTP requests
+ from spacy import displacy  # Importing displacy from spacy for visualizing NLP results
+ from spacy.matcher import Matcher  # Importing Matcher from spacy for matching patterns
+ from spacy.tokens import Span  # Importing Span from spacy for handling spans of tokens
+ import networkx as nx  # Importing networkx for creating, manipulating, and studying graphs
+ import matplotlib.pyplot as plt  # Importing matplotlib.pyplot for data visualization
+ from tqdm import tqdm  # Importing tqdm for progress bars
+
  try:
+     nlp = spacy.load('en_core_web_sm')  # Trying to load the English model
  except OSError:
+     print("Model not found. Downloading...")  # If the model is not found, print a message
+     spacy.cli.download("en_core_web_sm")  # Download the English model
+     nlp = spacy.load('en_core_web_sm')  # Load the English model
+
+ def read_pdf(file):  # Define a function to read a PDF file
+     doc = fitz.open(file)  # Open the PDF file
+     text = []  # Initialize an empty list to store the text
+     for page in doc:  # For each page in the document
+         for sentence in page.get_text("text").split('\n'):  # For each line of text on the page
+             if len(sentence) > 0:  # If the line is not empty
+                 text.append(sentence)  # Append it to the list
+     return pd.DataFrame(text, columns=["sentence"])  # Return a DataFrame of the sentences
+
+ def read_csv(file):  # Define a function to read a CSV file
+     candidate_sentences = pd.read_csv(file)  # Read the CSV file
+     return candidate_sentences  # Return the DataFrame
+
+ def get_entities(sent):  # Define a function to get the entities from a sentence
+     ent1 = ""  # Initialize an empty string for the first entity
+     ent2 = ""  # Initialize an empty string for the second entity
+
+     prv_tok_dep = ""  # Initialize an empty string for the dependency tag of the previous token
+     prv_tok_text = ""  # Initialize an empty string for the previous token
+
+     prefix = ""  # Initialize an empty string for the prefix
+     modifier = ""  # Initialize an empty string for the modifier
+
+     for tok in nlp(sent):  # For each token in the sentence
+         if tok.dep_ != "punct":  # If the token is not a punctuation mark
+             if tok.dep_ == "compound":  # If the token is a compound word
+                 prefix = tok.text  # Set the prefix to the token text
+                 if prv_tok_dep == "compound":  # If the previous token was also a compound word
+                     prefix = prv_tok_text + " " + tok.text  # Add the current word to the prefix
+
+             if tok.dep_.endswith("mod"):  # If the token is a modifier
+                 modifier = tok.text  # Set the modifier to the token text
+                 if prv_tok_dep == "compound":  # If the previous token was also a compound word
+                     modifier = prv_tok_text + " " + tok.text  # Add the current word to the modifier
+
+             if "subj" in tok.dep_:  # If the token is a subject
+                 ent1 = modifier + " " + prefix + " " + tok.text  # Build the first entity from the modifier, prefix, and token text
+                 prefix = ""  # Reset the prefix
+                 modifier = ""  # Reset the modifier
+                 prv_tok_dep = ""  # Reset the dependency tag of the previous token
+                 prv_tok_text = ""  # Reset the previous token
+
+             if "obj" in tok.dep_:  # If the token is an object
+                 ent2 = modifier + " " + prefix + " " + tok.text  # Build the second entity from the modifier, prefix, and token text
+
+             prv_tok_dep = tok.dep_  # Update the dependency tag of the previous token
+             prv_tok_text = tok.text  # Update the previous token
+
+     return [ent1.strip(), ent2.strip()]  # Return the two entities
+
+ def get_relation(sent):  # Define a function to get the relation from a sentence
+     doc = nlp(sent)  # Process the sentence
+     matcher = Matcher(nlp.vocab)  # Initialize a Matcher with the vocabulary
+     pattern = [{'DEP':'ROOT'},
+                {'DEP':'prep','OP':"?"},
+                {'DEP':'agent','OP':"?"},
+                {'POS':'ADJ','OP':"?"}]  # Define the pattern: the root verb plus optional preposition, agent, and adjective
+     matcher.add("matching_1", [pattern])  # Add the pattern to the matcher
+     matches = matcher(doc)  # Match the pattern against the document
+     if matches:  # If there are matches
+         k = len(matches) - 1  # Get the index of the last match
+         span = doc[matches[k][1]:matches[k][2]]  # Get the matched span
+         return span.text  # Return the text of the span
+     else:
+         return ""  # If there are no matches, return an empty string

+ def execute_process(file, edge):  # Define a function to execute the whole process
+     candidate_sentences = read_pdf(file)  # Read the PDF file into a DataFrame of sentences
+
+     entity_pairs = []  # Initialize an empty list for the entity pairs
+     for i in tqdm(candidate_sentences["sentence"]):  # For each sentence in the DataFrame
+         entity_pairs.append(get_entities(i))  # Append its entity pair to the list
+     relations = [get_relation(i) for i in tqdm(candidate_sentences['sentence'])]  # Get the relation for each sentence
+
+     source = [i[0] for i in entity_pairs]  # Extract the subjects
+     target = [i[1] for i in entity_pairs]  # Extract the objects
+     kg_df = pd.DataFrame({'source':source, 'target':target, 'edge':relations})  # Create a DataFrame of sources, targets, and edges
+
+     unique_edges = kg_df['edge'].unique() if kg_df['edge'].nunique() != 0 else None  # Get the unique edges
+     edge_counts = kg_df['edge'].value_counts()  # Count the occurrences of each edge
+     unique_edges_df = pd.DataFrame({'edge': edge_counts.index, 'count': edge_counts.values})  # Create a DataFrame of unique edges and their counts

+     if len(edge) == 0:  # If no edge is specified
+         G = nx.from_pandas_edgelist(kg_df, "source", "target",
+                                     edge_attr=True, create_using=nx.MultiDiGraph())  # Create a graph from the full DataFrame
+         plt.figure(figsize=(12,12))  # Create a figure
+         pos = nx.spring_layout(G)  # Compute positions for the nodes
+         nx.draw(G, with_labels=True, node_color='skyblue', node_size=1500, edge_cmap=plt.cm.Blues, pos=pos)  # Draw the graph
+         plt.savefig("graph.png")  # Save the graph as a PNG
+         return "graph.png", unique_edges_df  # Return the path to the PNG and the DataFrame of unique edges
+
+     else:  # If an edge is specified
          G=nx.from_pandas_edgelist(kg_df[kg_df['edge']==edge], "source", "target",
+                                   edge_attr=True, create_using=nx.MultiDiGraph())  # Create a graph from only the rows with that edge
+         plt.figure(figsize=(12,12))  # Create a figure
+         pos = nx.spring_layout(G)  # Compute positions for the nodes
+         nx.draw(G, with_labels=True, node_color='skyblue', node_size=1500, edge_cmap=plt.cm.Blues, pos=pos)  # Draw the graph
+         plt.savefig("graph.png")  # Save the graph as a PNG
+         return "graph.png", unique_edges_df  # Return the path to the PNG and the DataFrame of unique edges

  inputs = [
+     gr.File(label="Upload PDF"),  # A file input for uploading a PDF
+     gr.Textbox(label="Graph a particular edge", type="text")  # A textbox for specifying an edge
  ]

  outputs = [
+     gr.Image(label="Generated graph"),  # An image output for the generated graph
+     gr.Dataframe(label="Unique edges", type="pandas")  # A DataFrame output for the unique edges
  ]

+ description = 'This Python script generates a knowledge graph from a PDF document. It uses several libraries including gradio for the web interface, spacy for natural language processing, networkx and matplotlib for graph generation, and fitz for PDF processing.'
+ iface = gr.Interface(fn=execute_process, inputs=inputs, outputs=outputs, title="PDF Knowledge graph", description=description)  # Create the interface
+ iface.launch()  # Launch the interface
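For reference, here is a minimal standalone sketch of the Matcher pattern that get_relation relies on, runnable outside the app. The input sentence and the expected output noted in the final comment are illustrative assumptions, not data from this repository:

# Hypothetical demo of the relation pattern used by get_relation above.
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
doc = nlp("John completed the report")  # illustrative sentence (assumption)

matcher = Matcher(nlp.vocab)
pattern = [{'DEP': 'ROOT'},             # the sentence's root verb
           {'DEP': 'prep', 'OP': "?"},  # optional trailing preposition
           {'DEP': 'agent', 'OP': "?"}, # optional agent (e.g. "by" in passives)
           {'POS': 'ADJ', 'OP': "?"}]   # optional trailing adjective
matcher.add("matching_1", [pattern])

matches = matcher(doc)  # each match is a (match_id, start, end) tuple
if matches:
    start, end = matches[-1][1], matches[-1][2]
    print(doc[start:end].text)  # expected: the root verb, e.g. "completed"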
graph.png
CHANGED
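The regenerated graph.png comes from the networkx/matplotlib drawing step in execute_process. Below is a tiny sketch of that step on a hand-made edge list; the sample rows are illustrative assumptions, not extracted data:

# Minimal sketch of the graph-drawing step, using the same calls as execute_process.
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

# Hypothetical edge list in the same source/target/edge shape as kg_df.
kg_df = pd.DataFrame({'source': ['john', 'john', 'report'],
                      'target': ['report', 'task', 'deadline'],
                      'edge':   ['wrote', 'completed', 'has']})

G = nx.from_pandas_edgelist(kg_df, "source", "target",
                            edge_attr=True, create_using=nx.MultiDiGraph())
plt.figure(figsize=(6, 6))
nx.draw(G, pos=nx.spring_layout(G), with_labels=True,
        node_color='skyblue', node_size=1500)
plt.savefig("graph.png")  # writes the rendered figure to disk, as the app does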