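# NOTE(review): this appears to be page-header residue from a Hugging Face Space ("Spaces: Running"); it is not part of the program.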
from typing import Optional | |
import spacy | |
from spacy import displacy | |
from spacy.language import Language | |
import streamlit as st | |
from spacy_streamlit import visualize_parser | |
from spacy_streamlit import visualize_tokens | |
import base64 | |
from PIL import Image | |
#import pandas | |
# --- Page chrome -----------------------------------------------------------
# set_page_config is issued before any other Streamlit call in this script.
st.set_page_config(layout="wide")
st.image("logo.png", use_column_width=False, width=150)
st.title("Ancient Greek Syntax Analyzer")
# NOTE(review): the contact address at the end of the text below reads
# "[email protected]", which looks like an obfuscation placeholder rather
# than a real address -- confirm and restore the intended one.
st.markdown("Welcome to our analyzer. Here you can parse the parts of speech (POS) and the syntactic relationships of any ancient Greek sentence. This analysis is done by our language models trained with transformers and the NLP library spaCy. Below, you can choose which model do you want to use (each model may produce a different analysis). Documentation about the linguistic terms used by our models to annotate your sentences can be found here. If you have any questions, please contact us at [email protected]")

# --- User input ------------------------------------------------------------
st.header("Select a model:")
spacy_model = st.selectbox("Model", ["grc_proiel_trf","grc_proiel_lg","grc_proiel_sm","grc_perseus_trf"])
st.header("Enter text:")
text = st.text_area("Greek text","φύσει μὲν οὖν αἴσθησιν ἔχοντα γίγνεται τὰ ζῷα, ἐκ δὲ ταύτης τοῖς μὲν αὐτῶν οὐκ ἐγγίγνεται μνήμη, τοῖς δʼ ἐγγίγνεται.")

# --- Analysis --------------------------------------------------------------
# The selected pipeline is loaded without its "senter" component; the
# rule-based sentencizer alternative below is kept disabled.
#config = {"punct_chars": [".", ";", "·"]}
nlp = spacy.load(spacy_model, exclude="senter")
#nlp.add_pipe("sentencizer", config=config, before="parser")
# Run the pipeline over the text entered above.
doc = nlp(text)
def get_html(html: str) -> str:
    """Convert HTML so it can be rendered.

    Args:
        html: The markup fragment to embed.

    Returns:
        The fragment with every newline replaced by a space, wrapped in a
        bordered, horizontally scrollable ``<div>``.
    """
    WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""
    # Newlines seem to mess with the rendering
    html = html.replace("\n", " ")
    return WRAPPER.format(html)
def get_svg(svg: str, style: str = "", wrap: bool = True) -> str:
    """Convert an SVG to a base64-encoded image.

    Args:
        svg: SVG markup; it is UTF-8 encoded before base64 encoding.
        style: Text inserted verbatim into the ``style`` attribute of the
            generated ``<img>`` tag.
        wrap: When true, the ``<img>`` tag is passed through ``get_html``;
            otherwise the bare tag is returned.

    Returns:
        An ``<img>`` tag whose ``src`` is a ``data:image/svg+xml;base64``
        URI, optionally wrapped by ``get_html``.
    """
    b64 = base64.b64encode(svg.encode("utf-8")).decode("utf-8")
    html = f'<img src="data:image/svg+xml;base64,{b64}" style="{style}"/>'
    return get_html(html) if wrap else html
def visualize_parser(
    doc: spacy.tokens.Doc,
    *,
    title: Optional[str] = "Dependency parse & part of speech:",
    key: Optional[str] = None,
) -> None:
    """Visualizer for dependency parses.

    Renders a row of checkboxes followed by one displaCy dependency diagram
    per sentence (or a single diagram for the whole document).

    Args:
        doc: The parsed document to draw. Iterating ``doc.sents`` is only
            done when "Split sentences" is ticked.
        title: Header shown above the controls; a falsy value skips it.
        key: Prefix interpolated into the widget keys. The default ``None``
            is formatted as-is, giving keys such as
            ``"None_parser_split_sents"``.
    """
    if title:
        st.header(title)

    # Four columns are created but only indices 0, 1 and 3 receive a widget;
    # the third column stays empty.
    cols = st.columns(4)
    split_sents = cols[0].checkbox(
        "Split sentences", value=True, key=f"{key}_parser_split_sents"
    )
    options = {
        "collapse_punct": cols[1].checkbox(
            "Collapse punct", value=True, key=f"{key}_parser_collapse_punct"
        ),
        "compact": cols[3].checkbox("Compact mode", value=True, key=f"{key}_parser_compact"),
    }

    # Each sentence span becomes its own Doc so it is rendered separately.
    docs = [span.as_doc() for span in doc.sents] if split_sents else [doc]
    for sent in docs:
        html = displacy.render(sent, options=options, style="dep")
        # Double newlines seem to mess with the rendering
        html = html.replace("\n\n", "\n")
        # Quote the sentence text above its diagram only when several
        # diagrams are shown.
        if split_sents and len(docs) > 1:
            st.markdown(f"> {sent.text}")
        st.write(get_svg(html), unsafe_allow_html=True)
# Dependency diagrams, drawn by the visualize_parser defined in this file
# (that definition replaces the spacy_streamlit import of the same name).
visualize_parser(doc)
#pd.set_option('display.max_colwidth', None)
# Token table restricted to surface form, lemma, POS tag and dependency label.
visualize_tokens(doc, attrs=["text", "lemma_", "pos_", "dep_"], title="Table view:", key="tokens")