Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
import streamlit as st
|
2 |
-
from transformers import AutoTokenizer
|
3 |
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
|
4 |
from huggingface_hub import login
|
5 |
from PyPDF2 import PdfReader
|
@@ -7,6 +7,7 @@ from docx import Document
|
|
7 |
import csv
|
8 |
import json
|
9 |
import os
|
|
|
10 |
|
11 |
huggingface_token = os.getenv('HUGGINGFACE_TOKEN')
|
12 |
|
@@ -27,17 +28,57 @@ def load_llm():
|
|
27 |
|
28 |
llm_engine_hf, tokenizer = load_llm()
|
29 |
|
30 |
-
|
31 |
-
st.
|
|
|
|
|
|
|
|
|
32 |
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
|
38 |
-
def
|
39 |
-
|
40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
|
42 |
def handle_uploaded_file(uploaded_file):
|
43 |
try:
|
@@ -65,9 +106,23 @@ def handle_uploaded_file(uploaded_file):
|
|
65 |
except Exception as e:
|
66 |
return str(e)
|
67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
# Entrada del usuario
|
69 |
user_input = st.text_input("T煤: ", "")
|
70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
# Manejo de archivos subidos
|
72 |
uploaded_files = st.file_uploader("Sube un archivo", type=["txt", "pdf", "docx", "csv", "json"], accept_multiple_files=True)
|
73 |
|
@@ -76,13 +131,28 @@ if st.button("Enviar"):
|
|
76 |
response = generate_response(user_input)
|
77 |
st.session_state.generated.append({"user": user_input, "bot": response})
|
78 |
|
79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
80 |
for chat in st.session_state["generated"]:
|
81 |
st.write(f"T煤: {chat['user']}")
|
82 |
-
st.write(f"Chatbot: {chat['bot']}")
|
83 |
-
|
84 |
-
if uploaded_files:
|
85 |
-
for uploaded_file in uploaded_files:
|
86 |
-
st.write(f"Archivo subido: {uploaded_file.name}")
|
87 |
-
file_content = handle_uploaded_file(uploaded_file)
|
88 |
-
st.write(file_content)
|
|
|
1 |
import streamlit as st
|
2 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
3 |
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
|
4 |
from huggingface_hub import login
|
5 |
from PyPDF2 import PdfReader
|
|
|
7 |
import csv
|
8 |
import json
|
9 |
import os
|
10 |
+
import torch
|
11 |
|
12 |
huggingface_token = os.getenv('HUGGINGFACE_TOKEN')
|
13 |
|
|
|
28 |
|
29 |
llm_engine_hf, tokenizer = load_llm()
|
30 |
|
31 |
+
# Configuraci贸n del modelo de clasificaci贸n
|
32 |
+
@st.cache_resource
|
33 |
+
def load_classification_model():
|
34 |
+
tokenizer = AutoTokenizer.from_pretrained("mrm8488/legal-longformer-base-8192-spanish")
|
35 |
+
model = AutoModelForSequenceClassification.from_pretrained("mrm8488/legal-longformer-base-8192-spanish")
|
36 |
+
return model, tokenizer
|
37 |
# Materialize the cached classifier once at module import.
classification_model, classification_tokenizer = load_classification_model()

# Numeric class index -> human-readable document category.
id2label = dict(
    enumerate(["multas", "politicas_de_privacidad", "contratos", "denuncias", "otros"])
)
def classify_text(text):
    """Classify *text* with the legal-document model and return a labeled report.

    The returned string shows the predicted category followed by the original
    document, ready to be displayed to the user.
    """
    encoded = classification_tokenizer(
        text,
        return_tensors="pt",
        max_length=4096,
        truncation=True,
        padding="max_length",
    )
    classification_model.eval()  # disable dropout etc. for inference
    with torch.no_grad():
        model_output = classification_model(**encoded)
    class_id = model_output.logits.argmax(dim=-1).item()
    predicted_label = id2label[class_id]
    return f"Clasificaci贸n: {predicted_label}\n\nDocumento:\n{text}"
51 |
|
def translate(text, target_language):
    """Translate *text* into *target_language* using the chat LLM.

    Parameters:
        text: document content to translate.
        target_language: target language name, interpolated into the prompt.

    Returns:
        str: the model's translation (the ``.content`` of the LLM response).
    """
    template = '''
Por favor, traduzca el siguiente documento al {LANGUAGE}:
<document>
{TEXT}
</document>
Aseg煤rese de que la traducci贸n sea precisa y conserve el significado original del documento.
'''

    # Fill placeholders with str.replace: str.format would break on any
    # literal braces inside the user-supplied document text.
    formatted_prompt = template.replace("{TEXT}", text).replace("{LANGUAGE}", target_language)
    # FIX: removed the dead `inputs = tokenizer(formatted_prompt, ...)` call —
    # its result was never used; the endpoint receives the raw prompt string.
    outputs = llm_engine_hf.invoke(formatted_prompt)
    return outputs.content
67 |
+
|
def summarize(text, length):
    """Summarize *text* with the chat LLM.

    Parameters:
        text: document content to summarize.
        length: desired-length phrase inserted into the prompt
            (e.g. "de aproximadamente 50 palabras").

    Returns:
        str: the model's summary (the ``.content`` of the LLM response).
    """
    prompt = f'''
Por favor, haga un resumen {length} del siguiente documento:
<document>
{text}
</document>
Aseg煤rese de que el resumen sea conciso y conserve el significado original del documento.
'''

    # FIX: removed the dead `inputs = tokenizer(template, ...)` call the
    # original made before invoking the endpoint — its result was never read.
    outputs = llm_engine_hf.invoke(prompt)
    return outputs.content
82 |
|
83 |
def handle_uploaded_file(uploaded_file):
|
84 |
try:
|
|
|
106 |
except Exception as e:
|
107 |
return str(e)
|
108 |
|
st.title("LexAIcon")
st.write("Puedes conversar con este chatbot basado en Mistral7B-Instruct y subir archivos para que el chatbot los procese.")

# Initialize chat-history containers on first run.
for history_key in ("generated", "past"):
    if history_key not in st.session_state:
        st.session_state[history_key] = []

# Free-text user message.
user_input = st.text_input("T煤: ", "")

# Target language for the translation operation.
target_language = st.selectbox("Selecciona el idioma de traducci贸n", ["espa帽ol", "ingl茅s", "franc茅s", "alem谩n"])

# Desired summary length for the summarize operation.
summary_length = st.selectbox("Selecciona la longitud del resumen", ["corto", "medio", "largo"])

# Documents the selected operation will run on.
uploaded_files = st.file_uploader("Sube un archivo", type=["txt", "pdf", "docx", "csv", "json"], accept_multiple_files=True)
128 |
|
|
|
131 |
response = generate_response(user_input)
|
132 |
st.session_state.generated.append({"user": user_input, "bot": response})
|
133 |
|
# Operation selector: summarize, translate, or explain (classify).
operation = st.radio("Selecciona una operaci贸n", ["Resumir", "Traducir", "Explicar"])

if st.button("Ejecutar"):
    if uploaded_files:
        # Length phrase injected into the summary prompt for each UI choice.
        summary_phrases = {
            "corto": "de aproximadamente 50 palabras",
            "medio": "de aproximadamente 100 palabras",
            "largo": "de aproximadamente 500 palabras",
        }
        for uploaded_file in uploaded_files:
            file_content = handle_uploaded_file(uploaded_file)
            if operation == "Resumir":
                result = summarize(file_content, summary_phrases[summary_length])
            elif operation == "Traducir":
                result = translate(file_content, target_language)
            elif operation == "Explicar":
                result = classify_text(file_content)
            st.write(result)

# Replay the chat transcript.
if st.session_state.get("generated"):
    for chat in st.session_state["generated"]:
        st.write(f"T煤: {chat['user']}")
        st.write(f"Chatbot: {chat['bot']}")
|
|
|
|
|
|
|
|
|
|
|