Spaces:

jhatchett
/

Words2Wisdom

Running

App Files Files Community

johaunh committed on Feb 21, 2024

Commit

3273f67

1 Parent(s): 34ecdc2

Add streamlit app

Browse files

Files changed (7) hide show

config/validation.yml +8 -8
src/words2wisdom/cli.py +19 -9
src/words2wisdom/gui.py +17 -14
src/words2wisdom/gui_streamlit.py +186 -0
src/words2wisdom/pipeline.py +8 -8
src/words2wisdom/validate.py +1 -1
writeup/words2wisdom_poster.pdf +0 -0

config/validation.yml CHANGED Viewed

@@ -6,19 +6,19 @@ instruction: >
   an object entity o (format: [s, r, o]). We emphasize that the order of these
   components is significant; the subject s relates to the object o via the
   relation r. That is, the relation points from the subject to the object. Our
-  AI agent extracts a collection of triplets from each passage provided. The
-  evaluation task has 5 subtasks:
 questions:
   Q1:
     title: Specificity (subject entity).
-    text: Does the subject entity represent a specific term/concept referenced in the passage?
     additional:
     options:
       - 1 = Specific
       - 0 = Not specific
   Q2:
     title: Specificity (object entity).
-    text: Does the object entity represent a specific term/concept referenced in the passage?
     additional:
     options:
       - 1 = Specific
@@ -54,8 +54,8 @@ example:
     system).
   triplet: "['organ', 'such as', 'heart']"
   answers:
-    Q1: 1 - The entity 'organ' is specific and is mentioned in the passage.
-    Q2: 1 - The entity 'heart' is specific and is mentioned in the passage.
-    Q3: 1 - The relation 'such as' is unclear. A better relation would be 'superclass of'.
     Q4: 1 - The triplet is relatively important as it is used as a parenthetical example in the passage.
-    Q5: 0 - The provided triplet is unclear as is. "Organ such as heart" doesn't make sense.

   an object entity o (format: [s, r, o]). We emphasize that the order of these
   components is significant; the subject s relates to the object o via the
   relation r. That is, the relation points from the subject to the object. Our
+  AI agent extracts a collection of triplets from each passage provided. Please
+  answer the following questions for each triplet provided.
 questions:
   Q1:
     title: Specificity (subject entity).
+    text: Is the subject entity a suitable glossary term for a textbook?
     additional:
     options:
       - 1 = Specific
       - 0 = Not specific
   Q2:
     title: Specificity (object entity).
+    text: Is the object entity a suitable glossary term for a textbook?
     additional:
     options:
       - 1 = Specific
     system).
   triplet: "['organ', 'such as', 'heart']"
   answers:
+    Q1: 1 - The entity 'organ' is specific and would likely appear in a textbook glossary.
+    Q2: 1 - The entity 'heart' is specific and would likely appear in a textbook glossary.
+    Q3: 0 - The relation 'such as' is unclear. A better relation would be 'superclass of'.
     Q4: 1 - The triplet is relatively important as it is used as a parenthetical example in the passage.
+    Q5: 0 - The provided triplet is unclear as is. "Organ such as heart" does not make sense.

src/words2wisdom/cli.py CHANGED Viewed

@@ -16,21 +16,25 @@ default_config_path = os.path.join(CONFIG_DIR, "default_config.ini")
 def main():
     parser = argparse.ArgumentParser(
         prog="words2wisdom",
-        #description="Generate a knowledge graph from a given text using OpenAI LLMs"
     )
     subparsers = parser.add_subparsers(dest="command",
                                        help="Available commands")
     # init
     parser_init = subparsers.add_parser("init",
-                                        help="Return the default config.ini file",
-                                        description="Return the default config.ini file")
     parser_init.set_defaults(func=get_default_config)
     # gui
     parser_gui = subparsers.add_parser("gui",
-                                       help="run Words2Wisdom using Gradio interface",
-                                       description="run Words2Wisdom using Gradio interface")
     parser_gui.set_defaults(func=gui)
     # run
@@ -39,10 +43,11 @@ def main():
                                        description="Generate a knowledge graph from a given text using OpenAI LLMs")
     parser_run.add_argument("text",
                             help="Path to text file")
-    parser_run.add_argument("--config",
                             help="Path to config.ini file",
                             default=default_config_path)
-    parser_run.add_argument("--output-dir",
                             help="Path to save outputs to",
                             default=OUTPUT_DIR)
     parser_run.set_defaults(func=run)
@@ -68,8 +73,13 @@ def get_default_config(args):
 def gui(args):
-    """Run Gradio interface"""
-    subprocess.run(["python", "-m", "text2kg.gui"])
 def run(args):

 def main():
     parser = argparse.ArgumentParser(
         prog="words2wisdom",
+        description="Knowledge graph generation utilities using OpenAI LLMs"
     )
     subparsers = parser.add_subparsers(dest="command",
                                        help="Available commands")
     # init
     parser_init = subparsers.add_parser("init",
+                                        usage="words2wisdom init [> PATH/TO/WRITE/CONFIG.INI]",
+                                        help="Initialize a template config.ini file",
+                                        description="Initialize a template config.ini file. Redirect to a new file using the '>' symbol.")
     parser_init.set_defaults(func=get_default_config)
     # gui
     parser_gui = subparsers.add_parser("gui",
+                                       help="Use Words2Wisdom via Gradio interface",
+                                       description="use Words2Wisdom using Gradio interface")
+    parser_gui.add_argument("-s", "--streamlit",
+                            action="store_true",
+                            help="Use Streamlit GUI instead of Gradio GUI")
     parser_gui.set_defaults(func=gui)
     # run
                                        description="Generate a knowledge graph from a given text using OpenAI LLMs")
     parser_run.add_argument("text",
                             help="Path to text file")
+    parser_run.add_argument("-c", "--config",
                             help="Path to config.ini file",
                             default=default_config_path)
+    parser_run.add_argument("-o", "--output-dir",
+                            metavar="OUTPUT_PATH",
                             help="Path to save outputs to",
                             default=OUTPUT_DIR)
     parser_run.set_defaults(func=run)
 def gui(args):
+    """Run interface"""
+    if args.streamlit:
+        cmd = "streamlit run words2wisdom/gui_streamlit.py".split()
+    else:
+        cmd = "python -m words2wisdom.gui".split()
+    subprocess.run(cmd)
 def run(args):

src/words2wisdom/gui.py CHANGED Viewed

@@ -1,18 +1,21 @@
 import os
 import gradio as gr
-from . import CONFIG_DIR, ROOT
-from .config import Config
-from .pipeline import Pipeline
-from .utils import dump_all
-example_file = (os.path.join(ROOT, "demo", "prokaryotes.txt"))
 example_text = "The quick brown fox jumps over the lazy dog. The cat sits on the mat."
-def text2kg_from_string(openai_api_key: str, input_text: str):
     config = Config.read_ini(os.path.join(CONFIG_DIR, "default_config.ini"))
     config.llm["openai_api_key"] = openai_api_key
@@ -25,15 +28,15 @@ def text2kg_from_string(openai_api_key: str, input_text: str):
     return knowledge_graph, zip_path
-def text2kg_from_file(openai_api_key: str, input_file):
     with open(input_file.name) as f:
         input_text = f.read()
-    return text2kg_from_string(openai_api_key, input_text)
-with gr.Blocks(title="Text2KG") as demo:
-    gr.Markdown("# 🧞📖 Text2KG")
     with gr.Column(variant="panel"):
         openai_api_key = gr.Textbox(label="OpenAI API Key", placeholder="sk-...", type="password")
@@ -64,7 +67,7 @@ with gr.Blocks(title="Text2KG") as demo:
             examples=[[None, example_text]],
             inputs=[openai_api_key, text_string],
             outputs=[output_graph, output_zip],
-            fn=text2kg_from_string,
             preprocess=False,
             postprocess=False
         )
@@ -74,13 +77,13 @@ with gr.Blocks(title="Text2KG") as demo:
             examples=[[None, example_file]],
             inputs=[openai_api_key, text_file],
             outputs=[output_graph, output_zip],
-            fn=text2kg_from_file,
             preprocess=False,
             postprocess=False
         )
-    submit_str.click(fn=text2kg_from_string, inputs=[openai_api_key, text_string], outputs=[output_graph, output_zip])
-    submit_file.click(fn=text2kg_from_file, inputs=[openai_api_key, text_file], outputs=[output_graph, output_zip])
 demo.launch(inbrowser=True, width="75%")

 import os
+import sys
+sys.path.append(os.path.dirname(os.path.dirname(__file__)))
 import gradio as gr
+from words2wisdom import CONFIG_DIR, ROOT
+from words2wisdom.config import Config
+from words2wisdom.pipeline import Pipeline
+from words2wisdom.utils import dump_all
+example_file = (os.path.join(ROOT, "demo", "example.txt"))
 example_text = "The quick brown fox jumps over the lazy dog. The cat sits on the mat."
+def w2w_from_string(openai_api_key: str, input_text: str):
     config = Config.read_ini(os.path.join(CONFIG_DIR, "default_config.ini"))
     config.llm["openai_api_key"] = openai_api_key
     return knowledge_graph, zip_path
+def w2w_from_file(openai_api_key: str, input_file):
     with open(input_file.name) as f:
         input_text = f.read()
+    return w2w_from_string(openai_api_key, input_text)
+with gr.Blocks(title="Words2Wisdom") as demo:
+    gr.Markdown("# 🧞📖 Words2Wisdom")
     with gr.Column(variant="panel"):
         openai_api_key = gr.Textbox(label="OpenAI API Key", placeholder="sk-...", type="password")
             examples=[[None, example_text]],
             inputs=[openai_api_key, text_string],
             outputs=[output_graph, output_zip],
+            fn=w2w_from_string,
             preprocess=False,
             postprocess=False
         )
             examples=[[None, example_file]],
             inputs=[openai_api_key, text_file],
             outputs=[output_graph, output_zip],
+            fn=w2w_from_file,
             preprocess=False,
             postprocess=False
         )
+    submit_str.click(fn=w2w_from_string, inputs=[openai_api_key, text_string], outputs=[output_graph, output_zip])
+    submit_file.click(fn=w2w_from_file, inputs=[openai_api_key, text_file], outputs=[output_graph, output_zip])
 demo.launch(inbrowser=True, width="75%")

src/words2wisdom/gui_streamlit.py ADDED Viewed

	@@ -0,0 +1,186 @@

+import io
+import os
+import sys
+from zipfile import ZipFile
+sys.path.append(os.path.dirname(os.path.dirname(__file__)))
+import pandas as pd
+import streamlit as st
+import streamlit.components.v1 as st_components
+from pandas import DataFrame
+from pyvis.network import Network
+from words2wisdom import CONFIG_DIR
+from words2wisdom.config import Config as W2WConfig
+from words2wisdom.pipeline import Pipeline
+def create_graph(df: DataFrame):
+    graph = Network(directed=True)
+    entities = pd.concat([df.subject, df.object]).unique()
+    graph.add_nodes(entities, label=entities, title=entities)
+    df_iterable = (
+        df.drop_duplicates(
+            subset=["subject", "relation", "object"]
+        )
+        .iterrows()
+    )
+    for _, row in df_iterable:
+        graph.add_edge(row.subject, row.object, label=row.relation)
+    graph.save_graph("/tmp/graph.html")
+    HtmlFile = open("/tmp/graph.html")
+    return st_components.html(HtmlFile.read(), height=625)
+@st.cache_data
+def create_zip_bytes(file_contents):
+    buffer = io.BytesIO()
+    with ZipFile(buffer, 'w') as zip_file:
+        for filename, content in file_contents.items():
+            zip_file.writestr(filename, content)
+    return buffer.getvalue()
+st.set_page_config(page_title="Words2Wisdom",  # page title and icon for the app
+                   page_icon="📖")
+st.title("📖 Words2Wisdom")
+st.write("Generate knowledge graphs from unstructured text using GPT.")
+# Sidebar: collects the API key plus the pipeline and LLM parameters.
+with st.sidebar:
+    st.title("Parameters")
+    st.write("The API Key is required. Feel free to customize the other parameters, if you'd like!")
+    openai_api_key = st.text_input(
+        label="🔐 **OpenAI API Key**",
+        type="password",
+        help="Learn how to get your own [here](https://platform.openai.com/docs/api-reference/authentication)."
+    )
+    st.divider()
+    with st.expander("🚰 **Pipeline parameters**"):
+        formatter = lambda x: x.replace("_", " ").title()  # e.g. "clause_deconstruction" -> "Clause Deconstruction"
+        words_per_batch = st.number_input(
+            label="Words per Batch",
+            min_value=0,
+            max_value=200,
+            value=150,
+            help="Batch text into paragraphs containing at least N words, if possible."
+        )
+        preprocess = st.selectbox(
+            label="Preprocess",
+            options=("None", "clause_deconstruction"),  # the string "None" means: no preprocessing step
+            index=1,
+            format_func=formatter,
+            help="Method for text simplification."
+        )
+        extraction = st.selectbox(
+            label="Generation",
+            options=("triplet_extraction",),
+            index=0,
+            format_func=formatter,
+            help="Method for KG generation."
+        )
+    with st.expander("🤖 **LLM parameters**"):
+        model = st.selectbox(  # NOTE(review): not written into w2w_config below - confirm whether intended
+            label="Model",
+            options=("gpt-3.5-turbo",),
+            index=0,
+            help="ID of the model to use."
+        )
+        temperature = st.slider(  # NOTE(review): not written into w2w_config below - confirm whether intended
+            label="Temperature",
+            min_value=0.0,
+            max_value=2.0,
+            value=1.0,
+            step=0.1,
+            format="%.1f",
+            help=(
+                "What sampling temperature to use."
+                " Higher values will make the output more random;"
+                " lower values will make it more focused/deterministic."
+            )
+        )
+# Show a warning until the entered key starts with the "sk-" prefix.
+if not openai_api_key.startswith('sk-'):
+    st.warning('Please enter your OpenAI API key.', icon='⚠️')
+# Two input tabs: pasted text and an uploaded .txt file.
+tab1, tab2 = st.tabs(["Input Text", "File Upload"])
+# Text input tab: free-form text area plus its own submit button.
+with tab1:
+    text1 = st.text_area(label="Enter text:")
+    submitted1 = tab1.button(label="Generate!", use_container_width=True)
+# File upload tab: .txt uploader plus its own submit button.
+with tab2:
+    file2 = tab2.file_uploader(label="Upload text file:", type="txt")
+    submitted2 = tab2.button(key="filebtn", label="Generate!", use_container_width=True)  # presumably keyed to distinguish it from the identically labelled button above - confirm
+# Run config: defaults from default_config.ini, overridden by the sidebar choices.
+w2w_config = W2WConfig.read_ini(os.path.join(CONFIG_DIR, "default_config.ini"))
+w2w_config.pipeline = {  # replaces the whole pipeline section read from the ini file
+    "words_per_batch": words_per_batch,
+    "preprocess": [] if preprocess == "None" else [preprocess],  # "None" -> empty list of preprocess modules
+    "extraction": extraction
+}
+w2w_config.llm["openai_api_key"] = openai_api_key
+# main logic
+if (submitted1 or submitted2) and openai_api_key.startswith("sk-"):
+    with st.status("Generating knowledge graph..."):
+        st.write("Initializing pipeline...")
+        pipe = Pipeline(w2w_config)
+        st.write("Executing pipeline...")
+        if submitted1:
+            text = text1
+        elif submitted2:
+            text = file2.read().decode()
+        text_batches, knowledge_graph = pipe.run(text)
+        st.write("Complete.")
+    st.divider()
+    kg_viz = create_graph(knowledge_graph)
+    st.error("**Warning:** The page will refresh when you download the data!", icon="🚨")
+    download = st.download_button(
+        label="Download data",
+        data=create_zip_bytes({
+            "text_batches.csv": (
+                DataFrame(text_batches, columns=["text"])
+                .to_csv(index_label="batch_id")
+            ),
+            "kg.csv": knowledge_graph.to_csv(index=False),
+            "config.ini": pipe.serialize()
+        }),
+        file_name="output.zip",
+        use_container_width=True,
+        type="primary"
+    )

src/words2wisdom/pipeline.py CHANGED Viewed

@@ -23,7 +23,7 @@ PARSERS = {
 class Module:
-    """Text2KG module class."""
     def __init__(self, name: str) -> None:
         self.name = name
         self.parser = self.get_parser()
@@ -44,7 +44,7 @@ class Module:
 class Pipeline:
-    """Text2KG pipeline class."""
     def __init__(self, config: Config):
@@ -57,7 +57,7 @@ class Pipeline:
     def __repr__(self) -> str:
-        return f"Text2KG(\n\tconfig.pipeline={self.config.pipeline}\n\tconfig.llm={self.config.llm}\n)"
     def __str__(self) -> str:
@@ -72,7 +72,7 @@ class Pipeline:
     def initialize(self, config: Config):
-        """Initialize Text2KG pipeline from config."""
         # validate preprocess
         preprocess_modules = [Module(name) for name in config.pipeline["preprocess"]]
@@ -110,12 +110,12 @@ class Pipeline:
             self.pipeline = {"text": self.pipeline} | chains[i]
         # print pipeline
-        print("Initialized Text2KG pipeline:")
         print(str(self))
     def run(self, text: str, clean=True) -> tuple[List[str], pd.DataFrame]:
-        """Run Text2KG pipeline on passed text.
         Args:
             *texts (str): The text inputs
@@ -126,7 +126,7 @@ class Pipeline:
             knowledge_graph (DataFrame): A dataframe containing the extracted KG triplets,
                 indexed by batch
         """
-        print("Running Text2KG pipeline:")
         # split text into batches
         text_batches = list(partition_sentences(
             sentences=sent_tokenize(text),
@@ -150,7 +150,7 @@ class Pipeline:
     def _clean(self, kg: pd.DataFrame) -> pd.DataFrame:
-        """Text2KG post-processing."""
         print("Cleaning knowledge graph components...", end=' ')
         drop_list = []

 class Module:
+    """Words2Wisdom module class."""
     def __init__(self, name: str) -> None:
         self.name = name
         self.parser = self.get_parser()
 class Pipeline:
+    """Words2Wisdom pipeline class."""
     def __init__(self, config: Config):
     def __repr__(self) -> str:
+        return f"Words2Wisdom(\n\tconfig.pipeline={self.config.pipeline}\n\tconfig.llm={self.config.llm}\n)"
     def __str__(self) -> str:
     def initialize(self, config: Config):
+        """Initialize Words2Wisdom pipeline from config."""
         # validate preprocess
         preprocess_modules = [Module(name) for name in config.pipeline["preprocess"]]
             self.pipeline = {"text": self.pipeline} | chains[i]
         # print pipeline
+        print("Initialized Words2Wisdom pipeline:")
         print(str(self))
     def run(self, text: str, clean=True) -> tuple[List[str], pd.DataFrame]:
+        """Run Words2Wisdom pipeline on passed text.
         Args:
             *texts (str): The text inputs
             knowledge_graph (DataFrame): A dataframe containing the extracted KG triplets,
                 indexed by batch
         """
+        print("Running Words2Wisdom pipeline:")
         # split text into batches
         text_batches = list(partition_sentences(
             sentences=sent_tokenize(text),
     def _clean(self, kg: pd.DataFrame) -> pd.DataFrame:
+        """Words2Wisdom post-processing."""
         print("Cleaning knowledge graph components...", end=' ')
         drop_list = []

src/words2wisdom/validate.py CHANGED Viewed

@@ -37,7 +37,7 @@ def format_system_prompt():
             + " "
             + (question["additional"] + " " if question["additional"] else "")
             + "("
-            + ";".join(question["options"])
             + ")\n"
         )
         return formatted

             + " "
             + (question["additional"] + " " if question["additional"] else "")
             + "("
+            + "; ".join(question["options"])
             + ")\n"
         )
         return formatted

writeup/words2wisdom_poster.pdf ADDED Viewed

Binary file (426 kB). View file