Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
@@ -8,6 +8,10 @@ import csv
|
|
8 |
import json
|
9 |
import os
|
10 |
import torch
|
|
|
|
|
|
|
|
|
11 |
|
12 |
huggingface_token = os.getenv('HUGGINGFACE_TOKEN')
|
13 |
|
@@ -15,7 +19,7 @@ huggingface_token = os.getenv('HUGGINGFACE_TOKEN')
|
|
15 |
if huggingface_token:
|
16 |
login(token=huggingface_token)
|
17 |
|
18 |
-
# Configuración del modelo
|
19 |
@st.cache_resource
|
20 |
def load_llm():
|
21 |
llm = HuggingFaceEndpoint(
|
@@ -39,6 +43,32 @@ classification_model, classification_tokenizer = load_classification_model()
|
|
39 |
|
40 |
id2label = {0: "multas", 1: "politicas_de_privacidad", 2: "contratos", 3: "denuncias", 4: "otros"}
|
41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
def classify_text(text):
|
43 |
inputs = classification_tokenizer(text, return_tensors="pt", max_length=4096, truncation=True, padding="max_length")
|
44 |
classification_model.eval()
|
@@ -47,7 +77,7 @@ def classify_text(text):
|
|
47 |
logits = outputs.logits
|
48 |
predicted_class_id = logits.argmax(dim=-1).item()
|
49 |
predicted_label = id2label[predicted_class_id]
|
50 |
-
return
|
51 |
|
52 |
def translate(text, target_language):
|
53 |
template = '''
|
@@ -105,61 +135,62 @@ def handle_uploaded_file(uploaded_file):
|
|
105 |
return str(e)
|
106 |
|
107 |
def main():
|
108 |
-
st.title("LexAIcon: Traduce, resume y explica textos legales")
|
109 |
-
|
110 |
-
st.header("Puedes conversar con este chatbot basado en Mistral7B-Instruct y subir archivos para que ser traducidos resumidos o explicados.")
|
111 |
-
|
112 |
st.image("./icon.jpg", width=100)
|
|
|
|
|
113 |
|
|
|
|
|
114 |
|
115 |
-
|
116 |
-
st.
|
|
|
117 |
|
118 |
-
|
119 |
-
|
120 |
|
121 |
-
|
122 |
-
|
|
|
123 |
|
124 |
-
|
125 |
-
|
|
|
126 |
|
127 |
-
|
128 |
-
|
129 |
|
130 |
-
|
131 |
-
|
132 |
|
133 |
-
|
134 |
-
uploaded_files = st.file_uploader("Sube un archivo", type=["txt", "pdf", "docx", "csv", "json"], accept_multiple_files=True)
|
135 |
-
|
136 |
-
if st.button("Enviar"):
|
137 |
-
if user_input:
|
138 |
-
response = llm_engine_hf.invoke(user_input)
|
139 |
-
st.session_state.generated.append({"user": user_input, "bot": response['generated_text']})
|
140 |
-
|
141 |
-
if st.button("Ejecutar"):
|
142 |
-
if uploaded_files:
|
143 |
for uploaded_file in uploaded_files:
|
144 |
file_content = handle_uploaded_file(uploaded_file)
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
163 |
|
164 |
if __name__ == "__main__":
|
165 |
main()
|
|
|
8 |
import json
|
9 |
import os
|
10 |
import torch
|
11 |
+
from langchain.document_loaders import JSONLoader
|
12 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
13 |
+
from langchain.embeddings import HuggingFaceEmbeddings
|
14 |
+
from langchain.vectorstores import FAISS
|
15 |
|
16 |
huggingface_token = os.getenv('HUGGINGFACE_TOKEN')
|
17 |
|
|
|
19 |
if huggingface_token:
|
20 |
login(token=huggingface_token)
|
21 |
|
22 |
+
# Configuración del modelo de generación de texto
|
23 |
@st.cache_resource
|
24 |
def load_llm():
|
25 |
llm = HuggingFaceEndpoint(
|
|
|
43 |
|
44 |
id2label = {0: "multas", 1: "politicas_de_privacidad", 2: "contratos", 3: "denuncias", 4: "otros"}
|
45 |
|
46 |
+
# Cargar documentos JSON para cada categoría
|
47 |
+
@st.cache_resource
def load_json_documents():
    """Load the per-category question/answer corpora from disk.

    For each of the five category names, reads ``./<category>.json`` as
    UTF-8 JSON and takes the list stored under its
    ``"questions_and_answers"`` key.

    Returns:
        dict: category name -> list of strings, one per entry, each made of
        the entry's ``"question"`` and ``"answer"`` joined by a single space.
    """
    category_names = (
        "multas",
        "politicas_de_privacidad",
        "contratos",
        "denuncias",
        "otros",
    )
    corpus = {}
    for name in category_names:
        with open(f"./{name}.json", "r", encoding="utf-8") as handle:
            payload = json.load(handle)
        qa_entries = payload["questions_and_answers"]
        corpus[name] = [
            " ".join((item["question"], item["answer"])) for item in qa_entries
        ]
    return corpus
|
56 |
+
|
57 |
+
json_documents = load_json_documents()
|
58 |
+
|
59 |
+
# Configuración de Embeddings y Vector Stores
|
60 |
+
@st.cache_resource
def create_vector_store():
    """Build one FAISS vector store per document category.

    Iterates over the module-level ``json_documents`` mapping
    (category -> list of strings), splits every document into chunks of up
    to 1000 characters with a 150-character overlap, embeds the chunks with
    the ``sentence-transformers/all-MiniLM-l6-v2`` model on CPU, and indexes
    them with FAISS.

    Returns:
        dict: category name -> FAISS vector store built from that
        category's chunks.
    """
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-l6-v2",
        model_kwargs={"device": "cpu"},
    )
    # The splitter settings do not vary by category, so build it once.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)

    vector_stores = {}
    for category, docs in json_documents.items():
        # split_text() takes a single string, while `docs` is a list of
        # strings: split each document on its own and flatten the chunks.
        split_docs = [
            chunk
            for doc in docs
            for chunk in text_splitter.split_text(doc)
        ]
        vector_stores[category] = FAISS.from_texts(split_docs, embeddings)
    return vector_stores
|
69 |
+
|
70 |
+
vector_stores = create_vector_store()
|
71 |
+
|
72 |
def classify_text(text):
|
73 |
inputs = classification_tokenizer(text, return_tensors="pt", max_length=4096, truncation=True, padding="max_length")
|
74 |
classification_model.eval()
|
|
|
77 |
logits = outputs.logits
|
78 |
predicted_class_id = logits.argmax(dim=-1).item()
|
79 |
predicted_label = id2label[predicted_class_id]
|
80 |
+
return predicted_label
|
81 |
|
82 |
def translate(text, target_language):
|
83 |
template = '''
|
|
|
135 |
return str(e)
|
136 |
|
137 |
def main():
|
|
|
|
|
|
|
|
|
138 |
st.image("./icon.jpg", width=100)
|
139 |
+
st.title("LexAIcon")
|
140 |
+
st.write("Puedes conversar con este chatbot basado en Mistral7B-Instruct y subir archivos para que el chatbot los procese.")
|
141 |
|
142 |
+
if "messages" not in st.session_state:
|
143 |
+
st.session_state["messages"] = [{"role": "assistant", "content": "驴C贸mo puedo ayudarte?"}]
|
144 |
|
145 |
+
with st.sidebar:
|
146 |
+
st.text_input("HuggingFace Token", value=huggingface_token, type="password", key="huggingface_token")
|
147 |
+
st.caption("[Consigue un HuggingFace Token](https://huggingface.co/settings/tokens)")
|
148 |
|
149 |
+
for msg in st.session_state.messages:
|
150 |
+
st.chat_message(msg["role"]).write(msg["content"])
|
151 |
|
152 |
+
if prompt := st.chat_input():
|
153 |
+
st.session_state.messages.append({"role": "user", "content": prompt})
|
154 |
+
st.chat_message("user").write(prompt)
|
155 |
|
156 |
+
operation = st.radio("Selecciona una operaci贸n", ["Resumir", "Traducir", "Explicar"])
|
157 |
+
target_language = None
|
158 |
+
summary_length = None
|
159 |
|
160 |
+
if operation == "Traducir":
|
161 |
+
target_language = st.selectbox("Selecciona el idioma de traducci贸n", ["espa帽ol", "ingl茅s", "franc茅s", "alem谩n"])
|
162 |
|
163 |
+
if operation == "Resumir":
|
164 |
+
summary_length = st.selectbox("Selecciona la longitud del resumen", ["corto", "medio", "largo"])
|
165 |
|
166 |
+
if uploaded_files := st.file_uploader("Sube un archivo", type=["txt", "pdf", "docx", "csv", "json"], accept_multiple_files=True):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
167 |
for uploaded_file in uploaded_files:
|
168 |
file_content = handle_uploaded_file(uploaded_file)
|
169 |
+
classification = classify_text(file_content)
|
170 |
+
vector_store = vector_stores[classification]
|
171 |
+
search_docs = vector_store.similarity_search(prompt)
|
172 |
+
context = " ".join([doc.page_content for doc in search_docs])
|
173 |
+
prompt_with_context = f"Contexto: {context}\n\nPregunta: {prompt}"
|
174 |
+
response = llm_engine_hf.invoke(prompt_with_context)
|
175 |
+
msg = response.content
|
176 |
+
|
177 |
+
elif operation == "Resumir":
|
178 |
+
if summary_length == "corto":
|
179 |
+
length = "de aproximadamente 50 palabras"
|
180 |
+
elif summary_length == "medio":
|
181 |
+
length = "de aproximadamente 100 palabras"
|
182 |
+
elif summary_length == "largo":
|
183 |
+
length = "de aproximadamente 500 palabras"
|
184 |
+
msg = summarize(prompt, length)
|
185 |
+
|
186 |
+
elif operation == "Traducir":
|
187 |
+
msg = translate(prompt, target_language)
|
188 |
+
|
189 |
+
else:
|
190 |
+
msg = llm_engine_hf.invoke(prompt).content
|
191 |
+
|
192 |
+
st.session_state.messages.append({"role": "assistant", "content": msg})
|
193 |
+
st.chat_message("assistant").write(msg)
|
194 |
|
195 |
if __name__ == "__main__":
|
196 |
main()
|