import io
import os
import sys
from zipfile import ZipFile

# Make the parent directory importable so `words2wisdom` resolves when this
# script is run from inside the repository.
sys.path.append(os.path.dirname(os.path.dirname(__file__)))

import pandas as pd
import streamlit as st
import streamlit.components.v1 as st_components
from pandas import DataFrame
from pyvis.network import Network

from words2wisdom import CONFIG_DIR
from words2wisdom.config import Config as W2WConfig
from words2wisdom.pipeline import Pipeline


def create_graph(df: DataFrame):
    """Render a knowledge-graph DataFrame with (subject, relation, object)
    columns as an interactive pyvis network embedded in the page."""
    graph = Network(directed=True)

    # Every unique subject or object becomes a node.
    entities = pd.concat([df.subject, df.object]).unique().tolist()
    graph.add_nodes(entities, label=entities, title=entities)

    # Each unique (subject, relation, object) triplet becomes a labeled edge.
    triplets = df.drop_duplicates(subset=["subject", "relation", "object"])
    for _, row in triplets.iterrows():
        graph.add_edge(row.subject, row.object, label=row.relation)

    # pyvis can only write to disk, so save the HTML and read it back.
    graph.save_graph("/tmp/graph.html")
    with open("/tmp/graph.html") as html_file:
        return st_components.html(html_file.read(), height=625)


@st.cache_data
def create_zip_bytes(file_contents: dict) -> bytes:
    """Pack a {filename: content} mapping into an in-memory ZIP archive."""
    buffer = io.BytesIO()
    with ZipFile(buffer, "w") as zip_file:
        for filename, content in file_contents.items():
            zip_file.writestr(filename, content)
    return buffer.getvalue()


st.set_page_config(page_title="Words2Wisdom", page_icon="📖")
st.title("📖 Words2Wisdom")
st.write("Generate knowledge graphs from unstructured text using GPT.")

# sidebar parameters
with st.sidebar:
    st.title("Parameters")
    st.write("The API key is required. Feel free to customize the other parameters, if you'd like!")

    openai_api_key = st.text_input(
        label="🔐 **OpenAI API Key**",
        type="password",
        help="Learn how to get your own [here](https://platform.openai.com/docs/api-reference/authentication)."
    )
    st.divider()

    with st.expander("🚰 **Pipeline parameters**"):

        def formatter(option: str) -> str:
            """Turn snake_case option names into Title Case labels."""
            return option.replace("_", " ").title()

        words_per_batch = st.number_input(
            label="Words per Batch",
            min_value=0,
            max_value=200,
            value=150,
            help="Batch text into paragraphs containing at least N words, if possible."
        )
        preprocess = st.selectbox(
            label="Preprocess",
            options=("None", "clause_deconstruction"),
            index=1,
            format_func=formatter,
            help="Method for text simplification."
        )
        extraction = st.selectbox(
            label="Generation",
            options=("triplet_extraction",),
            index=0,
            format_func=formatter,
            help="Method for KG generation."
        )

    with st.expander("🤖 **LLM parameters**"):
        model = st.selectbox(
            label="Model",
            options=("gpt-3.5-turbo",),
            index=0,
            help="ID of the model to use."
        )
        temperature = st.slider(
            label="Temperature",
            min_value=0.0,
            max_value=2.0,
            value=1.0,
            step=0.1,
            format="%.1f",
            help=(
                "What sampling temperature to use."
                " Higher values will make the output more random;"
                " lower values will make it more focused/deterministic."
            )
        )

# API key warning
if not openai_api_key.startswith("sk-"):
    st.warning("Please enter your OpenAI API key.", icon="⚠️")

# input tabs
tab1, tab2 = st.tabs(["Input Text", "File Upload"])

# text input tab
with tab1:
    text1 = st.text_area(label="Enter text:")
    submitted1 = st.button(label="Generate!", use_container_width=True)

# file upload tab
with tab2:
    file2 = st.file_uploader(label="Upload text file:", type="txt")
    submitted2 = st.button(key="filebtn", label="Generate!", use_container_width=True)

# w2w config: start from the packaged defaults, then apply the sidebar selections
w2w_config = W2WConfig.read_ini(os.path.join(CONFIG_DIR, "default_config.ini"))
w2w_config.pipeline = {
    "words_per_batch": words_per_batch,
    "preprocess": [] if preprocess == "None" else [preprocess],
    "extraction": extraction
}
w2w_config.llm["openai_api_key"] = openai_api_key

# main logic
if (submitted1 or submitted2) and openai_api_key.startswith("sk-"):
    with st.status("Generating knowledge graph..."):
        st.write("Initializing pipeline...")
        pipe = Pipeline(w2w_config)

        st.write("Executing pipeline...")
        if submitted1:
            text = text1
        else:  # submitted2
            if file2 is None:
                st.error("Please upload a text file first.")
                st.stop()
            text = file2.read().decode()

        text_batches, knowledge_graph = pipe.run(text)
        st.write("Complete.")

    st.divider()
    create_graph(knowledge_graph)

    # Downloading triggers a Streamlit rerun, which clears the generated output.
    st.error("**Warning:** The page will refresh when you download the data!", icon="🚨")
    st.download_button(
        label="Download data",
        data=create_zip_bytes({
            "text_batches.csv": (
                DataFrame(text_batches, columns=["text"])
                .to_csv(index_label="batch_id")
            ),
            "kg.csv": knowledge_graph.to_csv(index=False),
            "config.ini": pipe.serialize()
        }),
        file_name="output.zip",
        use_container_width=True,
        type="primary"
    )
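
# To try the app locally, install the third-party dependencies and launch it
# with the Streamlit CLI. The file name app.py below is just an assumption;
# substitute whatever this script is saved as:
#
#   pip install streamlit pyvis pandas
#   streamlit run app.py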