Spaces:

Cicciokr
/

AIGenMaskedFillLatinText

Sleeping

App Files Files Community

Cicciokr committed on Feb 13

Commit

82e05bc

verified ·

1 Parent(s): 6553df3

Create app.py

Browse files

Files changed (1) hide show

app.py +115 -0

app.py ADDED Viewed

	@@ -0,0 +1,115 @@

+import streamlit as st
+from transformers import pipeline, AutoModelForMaskedLM, AutoTokenizer
+from cltk.data.fetch import FetchCorpus
+import builtins
+import os
+import json
+DATA_FILE = "data.json"  # relative path of the JSON file that load_data/save_data read and write
+def load_data():
+    """Carica i dati salvati (token e frasi) dal file JSON."""
+    if os.path.exists(DATA_FILE):
+        with open(DATA_FILE, "r", encoding="utf-8") as f:
+            return json.load(f)
+    return {"tokens": [], "phrases": {}}
+def save_data(data):
+    """Salva i dati (token e frasi) nel file JSON."""
+    with open(DATA_FILE, "w", encoding="utf-8") as f:
+        json.dump(data, f, indent=4)
+data = load_data()  # module-level store of saved tokens/phrases, read and mutated by the functions below
+def save_token_and_phrase(token, phrase):
+    if phrase not in data["phrases"]:
+        data["phrases"][phrase] = token
+        save_data(data)
+def get_valid_predictions(sentence, max_attempts=3, top_k=5):
+    """Verifica se la frase è già salvata e usa il token corrispondente."""
+    if sentence in data["phrases"]:
+        return [{"token_str": data["phrases"][sentence], "score": 1.0, "sequence": sentence.replace("[MASK]", data["phrases"][sentence])}]
+    attempt = 0
+    filtered_predictions = []
+    while attempt < max_attempts:
+        predictions = fill_mask_roberta(sentence, top_k=top_k)
+        filtered_predictions = [
+            pred for pred in predictions if pred["token_str"] not in punctuation_marks
+        ]
+        if filtered_predictions:
+            break
+        attempt += 1
+    return filtered_predictions
+# UI per l'inserimento del token e delle frasi
+st.sidebar.header("Gestione Token e Frasi")
+token_input = st.sidebar.text_input("Inserisci il token:")
+phrase_input = st.sidebar.text_area("Inserisci la frase:")
+if st.sidebar.button("Salva Token e Frase"):
+    if token_input and phrase_input:
+        save_token_and_phrase(token_input, phrase_input)
+        st.sidebar.success("Token e frase salvati con successo!")
+    else:
+        st.sidebar.warning("Inserisci sia un token che una frase validi.")
+existing_phrases = data.get("phrases", {})
+st.sidebar.subheader("Frasi salvate:")
+st.sidebar.write("\n".join(existing_phrases.keys()) if existing_phrases else "Nessuna frase salvata.")
+_original_input = builtins.input
+def _always_yes(prompt=""):
+    print(prompt, "Y")  # per far vedere a log che abbiamo risposto 'Y'
+    return "Y"
+builtins.input = _always_yes
+corpus_downloader = FetchCorpus(language="lat")
+corpus_downloader.import_corpus("lat_models_cltk")
+try:
+    from cltk import NLP
+    nlp_lat = NLP(language="lat")
+except ImportError:
+    nlp_lat = None
+if "input_text_value" not in st.session_state:
+    st.session_state["input_text_value"] = "Lorem ipsum dolor sit amet, [MASK] adipiscing elit."
+tokenizer_roberta = AutoTokenizer.from_pretrained("Cicciokr/Roberta-Base-Latin-Uncased")
+model_roberta = AutoModelForMaskedLM.from_pretrained("Cicciokr/Roberta-Base-Latin-Uncased")
+fill_mask_roberta = pipeline("fill-mask", model=model_roberta, tokenizer=tokenizer_roberta)
+punctuation_marks = {".", ",", ";", ":", "!", "?"}
+input_text = st.text_area(
+    label="Testo:",
+    height=150,
+    key="input_text_value"
+)
+if input_text:
+    input_text_roberta = input_text.replace("[MASK]", "<mask>")
+    predictions_roberta = get_valid_predictions(input_text_roberta)
+    st.subheader("Risultati delle previsioni:")
+    for pred in predictions_roberta:
+        st.write(f" Token: {pred['token_str']}")
+        st.write(f" Probabilità: {pred['score']:.4f}")
+        st.write(f" Sequence: {pred['sequence']}")
+        st.write("---")
+    if nlp_lat is not None:
+        st.subheader("Analisi Morfologica con CLTK")
+        for pred in predictions_roberta:
+            doc = nlp_lat(pred['token_str'])
+            st.write(f"Frase: {pred['token_str']}")
+            for w in doc.words:
+                st.write(
+                    f"- **Token**: {w.string}\n"
+                    f"  - Lemma: {w.lemma}\n"
+                    f"  - UPOS: {w.upos}\n"
+                    f"  - Morph: {w.features}\n"
+                )
+            st.write("---")
+    else:
+        st.warning("CLTK non installato. Esegui 'pip install cltk' per abilitare l'analisi.")