Spaces:
Running
Running
Upload 21 files
Browse files- .gitattributes +3 -0
- Dockerfile +32 -0
- config/.env +27 -0
- config/system_prompt_deco_v2.txt +56 -0
- core/__init__.py +0 -0
- core/integrations/doc_converter.py +98 -0
- core/integrations/telegram_bot.py +237 -0
- core/integrations/templates/default-reference.docx +0 -0
- core/llm/llm_manager.py +96 -0
- core/logging/usage_logger.py +80 -0
- core/pipeline/edullm_rag_pipeline.py +83 -0
- core/pipeline/utils.py +47 -0
- core/vectorstore/distance_strategy.py +21 -0
- core/vectorstore/document_processor.py +43 -0
- core/vectorstore/embeddings.py +39 -0
- core/vectorstore/vectorstore_manager.py +136 -0
- database/edullm_store/index.faiss +3 -0
- database/edullm_store/index.pkl +3 -0
- docs/curriculo-nacional-de-la-educacion-basica.pdf +3 -0
- docs/programa-curricular-educacion-primaria_compressed.pdf +3 -0
- main.py +14 -0
- requirements.txt +17 -0
.gitattributes
CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
database/edullm_store/index.faiss filter=lfs diff=lfs merge=lfs -text
|
37 |
+
docs/curriculo-nacional-de-la-educacion-basica.pdf filter=lfs diff=lfs merge=lfs -text
|
38 |
+
docs/programa-curricular-educacion-primaria_compressed.pdf filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Base image: slim Python 3.11
FROM python:3.11-slim

# System dependencies: pandoc for Markdown->DOCX conversion, build tools for wheels
RUN apt-get update && apt-get install -y \
    pandoc \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

# Working directory
WORKDIR /app

# Copy requirements first to take advantage of Docker layer caching
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Create the expected directory structure
RUN mkdir -p config database/edullm_store docs logs

# Copy the rest of the code (this includes config/.env from the build context).
# BUGFIX: the previous `COPY config/.env.example config/.env` broke the build —
# the repository ships config/.env, not config/.env.example.
COPY . .

# Ensure the database directory survives in the image
RUN touch database/edullm_store/.gitkeep

# Run the application
CMD ["python", "main.py"]
|
config/.env
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# ==========================================
|
2 |
+
# 🔑 CONFIGURACIÓN DEL BOT DE TELEGRAM
|
3 |
+
# ==========================================
|
4 |
+
_TELEGRAM_TOKEN=your_telegram_bot_token
|
5 |
+
|
6 |
+
# ==========================================
|
7 |
+
# 🤖 CONFIGURACIÓN DEL MODELO LLM
|
8 |
+
# ==========================================
|
9 |
+
_LLM_MODEL_NAME=gpt-3.5-turbo
|
10 |
+
_LLM_API_KEY=your_openai_api_key
|
11 |
+
_LLM_BASE_URL=https://api.openai.com/v1
|
12 |
+
|
13 |
+
# ==========================================
|
14 |
+
# 📚 CONFIGURACIÓN DEL VECTORSTORE
|
15 |
+
# ==========================================
|
16 |
+
_VECTORSTORE_PATH=docs/
|
17 |
+
_VECTORSTORE_NAME=edullm_store
|
18 |
+
|
19 |
+
# ==========================================
|
20 |
+
# 📝 CONFIGURACIÓN DE PROMPTS
|
21 |
+
# ==========================================
|
22 |
+
_PATH_SYSTEM_PROMPT=config/system_prompt_deco_v2.txt
|
23 |
+
|
24 |
+
# ==========================================
|
25 |
+
# 🔄 CONFIGURACIÓN DE EMBEDDINGS
|
26 |
+
# ==========================================
|
27 |
+
_MODEL_EMBEDDINGS=sentence-transformers/distiluse-base-multilingual-cased
|
config/system_prompt_deco_v2.txt
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Generar únicamente materiales educativos alineados al CNEB, MBDD y MINEDU–Perú.
|
2 |
+
|
3 |
+
1. **Entrega en Markdown limpio**
|
4 |
+
|
5 |
+
* **Ficha**: Metadatos; Resumen; Desarrollo; Preguntas DECO; Conclusión; Recomendación; Instrumento\*
|
6 |
+
* **Resumen temático**: Metadatos; Ideas clave (≥3); Desarrollo; Conclusión
|
7 |
+
* **Banco de preguntas**: Metadatos; Preguntas DECO (≥10); Clave opcional
|
8 |
+
* **Rúbrica/Lista**: Metadatos; Criterios; Niveles; Descriptores
|
9 |
+
\*Incluir solo si se solicita
|
10 |
+
|
11 |
+
2. **Variables obligatorias** (si falta, solicitar):
|
12 |
+
Área curricular, Grado, Bimestre, Competencia, Capacidad, Desempeño esperado
|
13 |
+
|
14 |
+
3. **Ejemplo (ficha)**
|
15 |
+
|
16 |
+
```markdown
|
17 |
+
## {{Título}}
|
18 |
+
|
19 |
+
**Área:** {{}} **Grado:** {{}} **Bimestre:** {{}}
|
20 |
+
**Competencia:** {{}} **Capacidad:** {{}} **Desempeño:** {{}}
|
21 |
+
|
22 |
+
### Resumen conceptual (60–100 palabras)
|
23 |
+
|
24 |
+
### Ideas clave (≥3)
|
25 |
+
- …
|
26 |
+
- …
|
27 |
+
- …
|
28 |
+
|
29 |
+
### Desarrollo (contexto peruano)
|
30 |
+
|
31 |
+
### Preguntas DECO:
|
32 |
+
- Literal (3)
|
33 |
+
- Inferencial (≥5)
|
34 |
+
- Crítico (2)
|
35 |
+
- Integradora (si aplica)
|
36 |
+
|
37 |
+
### Conclusión (≤40 palabras)
|
38 |
+
|
39 |
+
### Recomendación (breve)
|
40 |
+
|
41 |
+
### Instrumento (tabla Markdown, si solicitado)
|
42 |
+
```
|
43 |
+
|
44 |
+
4. **Tipos DECO**
|
45 |
+
|
46 |
+
* Literal
|
47 |
+
* Inferencial
|
48 |
+
* Crítico
|
49 |
+
* Integradora
|
50 |
+
|
51 |
+
5. **Restricciones**
|
52 |
+
|
53 |
+
* No temas médicos, legales o sensibles
|
54 |
+
* Contexto peruano
|
55 |
+
* Máx. 1500 palabras
|
56 |
+
* Sin instrucciones técnicas ni comentarios meta
|
core/__init__.py
ADDED
File without changes
|
core/integrations/doc_converter.py
ADDED
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# core/integrations/doc_converter
|
2 |
+
import os
|
3 |
+
import re
|
4 |
+
import uuid
|
5 |
+
import tempfile
|
6 |
+
import pypandoc
|
7 |
+
from loguru import logger
|
8 |
+
from fastapi.responses import FileResponse
|
9 |
+
|
10 |
+
# Control de descargas (máximo 2 por archivo)
|
11 |
+
_descargas = {}
|
12 |
+
|
13 |
+
|
14 |
+
def limpiar_lineas_hr(markdown_text: str) -> str:
    """Replace horizontal-rule lines ('---') with blank lines."""
    hr_pattern = re.compile(r"^\s*---\s*$", flags=re.MULTILINE)
    return hr_pattern.sub("\n", markdown_text)
|
17 |
+
|
18 |
+
|
19 |
+
def normalizar_ecuaciones(md: str) -> str:
    """Rewrite escaped LaTeX delimiters (\\[..\\], \\(..\\)) into $-style math."""
    con_display = re.sub(r"\\\[\s*(.*?)\s*\\\]", r"$$\1$$", md, flags=re.DOTALL)
    return re.sub(r"\\\(\s*(.*?)\s*\\\)", r"$\1$", con_display, flags=re.DOTALL)
|
24 |
+
|
25 |
+
|
26 |
+
def limpiar_backticks(markdown_text: str) -> str:
    """
    Strip a triple-backtick fence that wraps the entire LLM response.

    BUGFIX: the previous slice-based version (`[3:-3]`) left the fence's
    language tag behind (e.g. ```markdown), so it leaked into the DOCX.
    Both plain fences and fences with a language tag are now handled.
    Text that is not fully fenced is returned unchanged.
    """
    markdown_text = markdown_text.strip()
    fenced = re.fullmatch(r"```(?:[\w+-]*\n)?(.*?)```", markdown_text, flags=re.DOTALL)
    if fenced:
        logger.info("🧹 Eliminando backticks triples de la respuesta LLM.")
        return fenced.group(1).strip()
    return markdown_text
|
35 |
+
|
36 |
+
|
37 |
+
def procesar_markdown(markdown_content: str) -> dict:
    """
    Convert LLM Markdown output to a temporary DOCX file via pandoc.

    Returns {"message", "file_id"} on success, {"error": ...} on failure.
    Side effect: registers the generated file in _descargas with 0 downloads.
    """
    uid = str(uuid.uuid4())
    temp_dir = tempfile.gettempdir()
    input_md = os.path.join(temp_dir, f"{uid}.md")
    output_docx = os.path.join(temp_dir, f"{uid}.docx")

    try:
        # Pre-clean the content before handing it to pandoc.
        markdown_content = limpiar_backticks(markdown_content)
        contenido_limpio = normalizar_ecuaciones(limpiar_lineas_hr(markdown_content))

        with open(input_md, "w", encoding="utf-8") as f:
            f.write(contenido_limpio)

        pypandoc.convert_file(
            source_file=input_md,
            to="docx",
            outputfile=output_docx,
            format="md",
            extra_args=["--standalone"],
        )

        _descargas[uid] = 0

        logger.success(f"✅ DOCX generado correctamente: {output_docx}")
        return {"message": "Archivo DOCX generado exitosamente.", "file_id": uid}

    except Exception as e:
        logger.error(f"❌ Error al procesar Markdown: {e}")
        return {"error": "Fallo en la conversión de Markdown a DOCX."}

    finally:
        # BUGFIX: the intermediate .md previously leaked when pandoc raised,
        # because os.remove() only ran on the success path.
        if os.path.exists(input_md):
            os.remove(input_md)
|
68 |
+
|
69 |
+
|
70 |
+
def gestionar_descarga(file_id: str):
    """
    Serve a previously generated DOCX, enforcing a 2-download limit per file.

    Returns a FileResponse on success, or a dict with "error"/"status" keys.
    """
    output_docx = os.path.join(tempfile.gettempdir(), f"{file_id}.docx")

    # The physical file must still be present on disk.
    if not os.path.exists(output_docx):
        logger.warning(f"⚠️ Archivo no encontrado: {output_docx}")
        return {"error": "El archivo no existe o fue eliminado.", "status": 404}

    # The id must have been registered by procesar_markdown.
    if file_id not in _descargas:
        logger.warning(f"⚠️ ID inválido de descarga: {file_id}")
        return {"error": "ID de archivo no válido.", "status": 400}

    # Quota exhausted: remove the file and forget its counter.
    if _descargas[file_id] >= 2:
        os.remove(output_docx)
        del _descargas[file_id]
        logger.info(f"🗑️ Archivo eliminado tras exceder descargas: {file_id}")
        return {"error": "Límite de descargas alcanzado.", "status": 410}

    _descargas[file_id] += 1
    logger.info(f"⬇️ Descarga {_descargas[file_id]} de 2 para archivo: {file_id}")

    return FileResponse(
        path=output_docx,
        filename="material_educativo.docx",
        media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    )
|
core/integrations/telegram_bot.py
ADDED
@@ -0,0 +1,237 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# core/integrations/telegram_bot.py
|
2 |
+
import os
|
3 |
+
import re
|
4 |
+
import tempfile
|
5 |
+
import time
|
6 |
+
|
7 |
+
import fitz # PyMuPDF
|
8 |
+
from docx import Document
|
9 |
+
from dotenv import load_dotenv
|
10 |
+
from telegram import InlineKeyboardButton, InlineKeyboardMarkup, InputFile, Update
|
11 |
+
from telegram.ext import (
|
12 |
+
ApplicationBuilder,
|
13 |
+
CallbackQueryHandler,
|
14 |
+
CommandHandler,
|
15 |
+
ContextTypes,
|
16 |
+
MessageHandler,
|
17 |
+
filters,
|
18 |
+
)
|
19 |
+
|
20 |
+
from core.integrations.doc_converter import gestionar_descarga, procesar_markdown
|
21 |
+
from core.logging.usage_logger import registrar_uso
|
22 |
+
from core.pipeline.edullm_rag_pipeline import edullm_rag_pipeline
|
23 |
+
|
24 |
+
# ==== CONFIGURACIÓN GENERAL ====
|
25 |
+
load_dotenv(dotenv_path="config/.env")
|
26 |
+
TELEGRAM_TOKEN = os.getenv("TELEGRAM_TOKEN")
|
27 |
+
DOCX_FILENAME = "material_educativo.docx"
|
28 |
+
FORMAT_WARNING_IMAGE = "assets/formatos_soportados.png"
|
29 |
+
|
30 |
+
if not TELEGRAM_TOKEN:
|
31 |
+
raise ValueError("❌ TELEGRAM_TOKEN no está definido en las variables de entorno.")
|
32 |
+
|
33 |
+
|
34 |
+
# ==== FUNCIONES AUXILIARES ====
|
35 |
+
def extract_text_from_pdf(file_path):
    """Extract and concatenate the plain text of every PDF page (PyMuPDF)."""
    with fitz.open(file_path) as pdf:
        pages = [page.get_text() for page in pdf]
    return "".join(pages).strip()
|
41 |
+
|
42 |
+
|
43 |
+
def extract_text_from_docx(file_path):
    """Extract non-blank paragraph text from a .docx, one paragraph per line."""
    parrafos = Document(file_path).paragraphs
    lineas = [p.text for p in parrafos if p.text.strip()]
    return "\n".join(lineas)
|
46 |
+
|
47 |
+
|
48 |
+
def extract_text_from_txt(file_path):
    """Read a UTF-8 text file and return its stripped contents."""
    with open(file_path, "r", encoding="utf-8") as handle:
        contenido = handle.read()
    return contenido.strip()
|
51 |
+
|
52 |
+
|
53 |
+
def escape_markdown(text: str) -> str:
    """Backslash-escape every character Telegram MarkdownV2 treats as special."""
    especiales = r"_*[]()~`>#+-=|{}.!"
    patron = f"([{re.escape(especiales)}])"
    return re.sub(patron, r"\\\1", text)
|
59 |
+
|
60 |
+
|
61 |
+
def detectar_tipo_entrada(user_input) -> str:
    """Classify pipeline input: str -> 'Texto', bytes -> 'Imagen', else 'Otro'."""
    if isinstance(user_input, bytes):
        return "Imagen"
    return "Texto" if isinstance(user_input, str) else "Otro"
|
68 |
+
|
69 |
+
|
70 |
+
# ==== COMANDO /start ====
|
71 |
+
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Handle /start: greet the user and list the accepted input formats."""
    bienvenida = (
        "👋 Bienvenido a *EduLLM Bot*.\n\n"
        "Acepto: *Texto*, *Imagen*, *PDF*, *DOCX* o *TXT*.\n"
        "Generaré material educativo listo para descargar en DOCX."
    )
    await update.message.reply_text(bienvenida, parse_mode="Markdown")
|
78 |
+
|
79 |
+
|
80 |
+
# ==== MANEJO DE MENSAJES ====
|
81 |
+
async def handle_message(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """
    Main message handler: extract content from whatever the user sent (plain
    text, photo, or PDF/DOCX/TXT document), run it through the RAG pipeline,
    log the usage, and offer a DOCX download of the result.

    Unsupported update types (audio, video, sticker, location, contact,
    animation) get a short explanatory reply and are dropped.
    """
    user_input = ""

    try:
        if update.message.text:
            # Plain text message: use it verbatim.
            user_input = update.message.text

        elif update.message.photo:
            # The highest-resolution variant is the last entry in the list.
            photo = update.message.photo[-1]
            file = await photo.get_file()
            with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_img:
                await file.download_to_drive(temp_img.name)
                with open(temp_img.name, "rb") as img_file:
                    # Raw bytes signal "image" to the downstream pipeline.
                    user_input = img_file.read()

        elif update.message.document:
            file = await update.message.document.get_file()
            # Route by file extension (lowercased) from the original name.
            ext = update.message.document.file_name.split(".")[-1].lower()

            with tempfile.NamedTemporaryFile(delete=False, suffix=f".{ext}") as tmp_doc:
                await file.download_to_drive(tmp_doc.name)

                if ext == "pdf":
                    extracted_text = extract_text_from_pdf(tmp_doc.name)
                elif ext == "docx":
                    extracted_text = extract_text_from_docx(tmp_doc.name)
                elif ext == "txt":
                    extracted_text = extract_text_from_txt(tmp_doc.name)
                else:
                    await enviar_mensaje_formato_no_soportado(update)
                    return

                # Prepend the user's caption (if any) to the extracted text.
                mensaje_texto = update.message.caption or ""
                user_input = f"{mensaje_texto}\n\n{extracted_text}".strip()

        elif update.message.audio or update.message.voice or update.message.video:
            await update.message.reply_text(
                "🎙️🎥 *Audios y videos no son compatibles.* Solo acepto texto, imágenes o documentos (PDF, DOCX, TXT).",
                parse_mode="Markdown",
            )
            return

        elif update.message.sticker:
            await update.message.reply_text(
                "🟢 Gracias por el sticker, pero necesito texto, imagen o documento educativo."
            )
            return

        elif update.message.location:
            await update.message.reply_text(
                "📍 He recibido tu ubicación, pero solo trabajo con contenido educativo."
            )
            return

        elif update.message.contact:
            await update.message.reply_text(
                "📞 Recibí un contacto, pero por favor envíame contenido académico (texto, imagen o documento)."
            )
            return

        elif update.message.animation:
            await update.message.reply_text(
                "🎞️ Los GIFs no son compatibles. Por favor envía texto, imagen o documentos."
            )
            return

        else:
            await enviar_mensaje_formato_no_soportado(update)
            return

    finally:
        # Best-effort cleanup of the delete=False temp files created above.
        # NOTE(review): relies on locals() introspection — fragile, but works
        # because temp_img / tmp_doc are only ever bound in this scope.
        for temp_var in ["temp_img", "tmp_doc"]:
            if temp_var in locals() and os.path.exists(locals()[temp_var].name):
                os.remove(locals()[temp_var].name)

    if not user_input:
        await update.message.reply_text("⚠️ No se pudo obtener contenido válido.")
        return

    await update.message.reply_text("⏳ Generando tu material educativo...")
    start_time = time.time()
    try:
        resultado_md = edullm_rag_pipeline(user_input)
        exito = True
    except Exception as e:
        # Surface the failure to the user as the "result"; logged via registrar_uso.
        resultado_md = f"❌ Error: {str(e)}"
        exito = False
    duracion = time.time() - start_time
    registrar_uso(
        user_id=update.effective_user.id,
        username=update.effective_user.username,
        tipo_entrada=detectar_tipo_entrada(user_input),
        duracion_segundos=duracion,
        exito=exito,
    )
    # Keep the full Markdown for the later "download DOCX" callback.
    context.user_data["ultimo_markdown"] = resultado_md

    # Preview is capped at 1000 chars and escaped for MarkdownV2.
    preview = resultado_md[:1000] + ("\n..." if len(resultado_md) > 1000 else "")
    preview_safe = escape_markdown(preview)
    await update.message.reply_text(
        f"✅ *Material generado*:\n\n```\n{preview_safe}\n```", parse_mode="MarkdownV2"
    )

    botones = [[InlineKeyboardButton("📄 Descargar DOCX", callback_data="descargar_docx")]]
    await update.message.reply_text(
        "¿Deseas descargar el material?", reply_markup=InlineKeyboardMarkup(botones)
    )
|
188 |
+
|
189 |
+
|
190 |
+
# ==== MENSAJE DE FORMATO NO SOPORTADO ====
|
191 |
+
async def enviar_mensaje_formato_no_soportado(update: Update):
    """Reply with the supported-formats warning image and its caption."""
    aviso = "⚠️ *Formato no soportado.*\n\nAcepto:\n- Texto\n- Imagen\n- PDF (.pdf)\n- Word (.docx)\n- Texto plano (.txt)"
    await update.message.reply_photo(
        photo=InputFile(FORMAT_WARNING_IMAGE),
        caption=aviso,
        parse_mode=None,
    )
|
197 |
+
|
198 |
+
|
199 |
+
# ==== CALLBACK BOTONES ====
|
200 |
+
async def button_handler(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """
    Inline-button callback handler. Currently handles only "descargar_docx":
    convert the last generated Markdown to DOCX and send the file back.
    """
    query = update.callback_query
    # Acknowledge the tap so the Telegram client stops its spinner.
    await query.answer()

    if query.data == "descargar_docx":
        markdown_content = context.user_data.get("ultimo_markdown")
        if not markdown_content:
            await query.edit_message_text("⚠️ No hay material disponible para convertir.")
            return

        resultado = procesar_markdown(markdown_content)
        if "error" in resultado:
            await query.edit_message_text("❌ Error al generar el archivo DOCX.")
            return

        file_id = resultado["file_id"]
        file_response = gestionar_descarga(file_id)

        # gestionar_descarga returns a dict on failure, a FileResponse on success.
        if isinstance(file_response, dict):
            await query.edit_message_text(f"⚠️ {file_response.get('error')}")
        else:
            await query.edit_message_text("📥 Aquí tienes tu archivo DOCX:")
            await context.bot.send_document(
                chat_id=query.message.chat_id,
                document=file_response.path,
                filename=DOCX_FILENAME,
            )
|
227 |
+
|
228 |
+
|
229 |
+
# ==== INICIAR BOT ====
|
230 |
+
def start_bot():
    """Build the Telegram application, register the handlers, and start polling."""
    app = ApplicationBuilder().token(TELEGRAM_TOKEN).build()
    # /start is registered before the catch-all message handler so commands
    # are handled first within the same handler group.
    app.add_handler(CommandHandler("start", start))
    app.add_handler(MessageHandler(filters.ALL, handle_message))
    app.add_handler(CallbackQueryHandler(button_handler))

    print("🤖 EduLLM Bot en ejecución...")
    # Blocking call: runs the event loop until the process is stopped.
    app.run_polling()
|
core/integrations/templates/default-reference.docx
ADDED
Binary file (16 kB). View file
|
|
core/llm/llm_manager.py
ADDED
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# core/llm/llm_manager.py
|
2 |
+
import os
|
3 |
+
import base64
|
4 |
+
from openai import OpenAI
|
5 |
+
from dotenv import load_dotenv
|
6 |
+
from loguru import logger
|
7 |
+
|
8 |
+
load_dotenv(dotenv_path="config/.env")
|
9 |
+
|
10 |
+
|
11 |
+
class LLMManager:
    """Manager for interaction with OpenAI-API-compatible language models."""

    def __init__(self):
        # Endpoint configuration comes from config/.env (loaded at module import).
        self.api_key = os.getenv("LLM_API_KEY")
        self.base_url = os.getenv("LLM_BASE_URL")
        self.model = os.getenv("LLM_MODEL_NAME")
        self.client = OpenAI(api_key=self.api_key, base_url=self.base_url)
        self.prompt_system = self._load_system_prompt()

    def _load_system_prompt(self) -> str:
        """Load the system prompt from the file named by PATH_SYSTEM_PROMPT."""
        path_system_prompt = os.getenv("PATH_SYSTEM_PROMPT")
        try:
            with open(path_system_prompt, "r", encoding="utf-8") as f:
                logger.info("✅ Prompt del sistema cargado correctamente.")
                return f.read().strip()
        except FileNotFoundError:
            # A missing prompt file is non-fatal: fall back to a minimal default.
            logger.warning(
                f"⚠️ No se encontró '{path_system_prompt}'. Se usará un prompt por defecto."
            )
            return "Eres un asistente educativo del MINEDU."

    def _encode_image(self, image_bytes: bytes) -> str:
        """Encode raw image bytes as a Base64 string for the data-URL payload."""
        logger.debug("🔄 Codificando imagen a Base64.")
        return base64.b64encode(image_bytes).decode("utf-8")

    def generate_response(
        self, user_query: str, context: str = "", image: bytes = None
    ) -> str:
        """
        Generate a model response.

        With `image` set, sends a multimodal message (text + base64 data URL);
        otherwise sends plain text, prefixing `context` when provided.
        Returns the model's text, or an error string on any failure.
        """
        try:
            logger.info("🔹 Generando respuesta para la consulta del usuario.")
            messages = []

            # System prompt goes first, when available.
            if self.prompt_system:
                messages.append({"role": "system", "content": self.prompt_system})

            # Multimodal branch: text part + image as an inline data URL.
            if image:
                logger.debug("🖼️ Procesando entrada multimodal con imagen.")
                base64_image = self._encode_image(image)

                messages.append(
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": user_query
                                if user_query
                                else "Describe esta imagen con enfoque educativo.",
                            },
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/png;base64,{base64_image}"
                                },
                            },
                        ],
                    }
                )

            else:
                # Text-only branch, optionally grounded with retrieved context.
                full_prompt = user_query
                if context:
                    logger.debug("➕ Añadiendo contexto al mensaje.")
                    full_prompt = f"{context}\n\nPregunta: {user_query}"

                messages.append({"role": "user", "content": full_prompt})

            # Model call.
            response = self.client.chat.completions.create(
                model=self.model,
                messages=messages,
            )

            logger.success("✅ Respuesta generada correctamente.")
            return response.choices[0].message.content

        except Exception as e:
            # Broad catch by design: callers always receive a string back.
            logger.error(f"❌ Error al generar respuesta: {str(e)}")
            return f"Error al generar respuesta: {str(e)}"
|
core/logging/usage_logger.py
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import time
|
2 |
+
import json
|
3 |
+
from pathlib import Path
|
4 |
+
from typing import Literal, Optional
|
5 |
+
|
6 |
+
# Log file locations.
LOG_FILE = Path("logs/registro_uso.json")
USER_STATS_FILE = Path("logs/usuarios.json")

# Make sure the logs/ directory exists before any write.
LOG_FILE.parent.mkdir(parents=True, exist_ok=True)

# Input kinds accepted by the bot.
TipoEntrada = Literal["Texto", "Imagen", "PDF", "DOCX", "TXT", "Otro"]


def registrar_uso(
    user_id: int,
    username: Optional[str],
    tipo_entrada: TipoEntrada,
    duracion_segundos: float,
    exito: bool,
    fuente: Optional[str] = "telegram_bot",
) -> None:
    """
    Record one usage event and refresh the accumulated per-user history.

    Both writes are best-effort: any failure is printed, never raised.
    """
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
    evento = {
        "timestamp": timestamp,
        "user_id": user_id,
        "username": username or "N/A",
        "tipo_entrada": tipo_entrada,
        "duracion_segundos": round(duracion_segundos, 2),
        "exito": exito,
        "fuente": fuente,
    }

    # --- individual event log (JSON array, rewritten on every call) ---
    try:
        eventos = (
            json.loads(LOG_FILE.read_text(encoding="utf-8"))
            if LOG_FILE.exists()
            else []
        )
        eventos.append(evento)
        LOG_FILE.write_text(
            json.dumps(eventos, indent=2, ensure_ascii=False), encoding="utf-8"
        )
    except Exception as e:
        print(f"[ERROR] No se pudo guardar el registro individual: {e}")

    # --- per-user aggregate counters ---
    try:
        resumen = (
            json.loads(USER_STATS_FILE.read_text(encoding="utf-8"))
            if USER_STATS_FILE.exists()
            else {}
        )

        uid = str(user_id)
        stats = resumen.setdefault(
            uid,
            {
                "username": username or "N/A",
                "total_usos": 0,
                "exitosos": 0,
                "fallidos": 0,
                "ultima_vez": timestamp,
            },
        )

        stats["username"] = username or "N/A"
        stats["total_usos"] += 1
        stats["ultima_vez"] = timestamp
        stats["exitosos" if exito else "fallidos"] += 1

        USER_STATS_FILE.write_text(
            json.dumps(resumen, indent=2, ensure_ascii=False), encoding="utf-8"
        )
    except Exception as e:
        print(f"[ERROR] No se pudo actualizar el resumen de usuarios: {e}")
|
core/pipeline/edullm_rag_pipeline.py
ADDED
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# edullm_rag_pipeline.py

# =========================
# 📦 IMPORTS
# =========================
import os
from typing import Union

from dotenv import load_dotenv
from loguru import logger
from pydantic import BaseModel

from core.pipeline.utils import limitar_contexto, limpiar_contexto_bruto, validar_input
# BUGFIX: these modules live under the `core` package (core/vectorstore, core/llm).
# The previous bare `vectorstore.*` / `llm.*` imports fail when the app runs from
# the repository root, which is how telegram_bot.py imports this module
# (`core.pipeline.edullm_rag_pipeline`).
from core.vectorstore.embeddings import EmbeddingManager
from core.vectorstore.distance_strategy import DistanceStrategyManager
from core.vectorstore.vectorstore_manager import VectorStoreManager
from core.llm.llm_manager import LLMManager

# =========================
# ⚙️ INITIAL CONFIGURATION
# =========================
load_dotenv(dotenv_path="config/.env")
VECTORSTORE_PATH = os.getenv("VECTORSTORE_PATH", "docs/")
VECTORSTORE_NAME = os.getenv("VECTORSTORE_NAME", "edullm_store")

# =========================
# 🚀 COMPONENT INITIALIZATION (module-level singletons)
# =========================
embeddings = EmbeddingManager.get_embeddings()
strategy_mgr = DistanceStrategyManager()
vector_mgr = VectorStoreManager(path=VECTORSTORE_PATH, name=VECTORSTORE_NAME)
llm_manager = LLMManager()
|
33 |
+
|
34 |
+
|
35 |
+
# =========================
|
36 |
+
# 📄 MODELOS
|
37 |
+
# =========================
|
38 |
+
class Documento(BaseModel):
    """A retrieved document fragment: content, source identifier, and score."""

    # NOTE(review): declared but not referenced by the visible pipeline code —
    # presumably the intended structured type for retrieval results; confirm.
    contenido: str
    fuente: str
    puntaje: float
|
42 |
+
|
43 |
+
|
44 |
+
# =========================
|
45 |
+
# 🛠️ FUNCIONES UTILITARIAS
|
46 |
+
# =========================
|
47 |
+
|
48 |
+
|
49 |
+
def init_vectorstore(force_rebuild: bool = False):
    """Create the vectorstore when it is missing, or always when force_rebuild."""
    necesita_construccion = force_rebuild or not vector_mgr.exist_vectorstore()
    if necesita_construccion:
        vector_mgr.create_vectorstore()
|
53 |
+
|
54 |
+
|
55 |
+
# =========================
|
56 |
+
# 🎯 PIPELINE PRINCIPAL
|
57 |
+
# =========================
|
58 |
+
def edullm_rag_pipeline(
    input_data: Union[str, bytes], top_k: int = 4, search_type: str = "similarity"
) -> str:
    """RAG pipeline: multimodal input -> FAISS retrieval -> LLM answer."""
    if not validar_input(input_data):
        logger.error("❌ Entrada inválida. Debes proporcionar texto o imagen válida.")
        return "Error: Entrada no válida."

    # Image bytes bypass retrieval and go straight to the multimodal LLM.
    if isinstance(input_data, bytes):
        return llm_manager.generate_response(
            user_query="Procesa la imagen adjunta y responde según el contexto educativo.",
            image=input_data,
        )

    retriever = vector_mgr.as_retriever(search_type=search_type, k=top_k)
    documentos = retriever.invoke(input_data)

    if documentos:
        bruto = "\n\n".join(doc.page_content for doc in documentos)
        contexto_final = limitar_contexto(limpiar_contexto_bruto(bruto))
    else:
        contexto_final = "No se encontró contexto relevante."
        logger.warning("⚠️ Sin resultados en FAISS para la consulta.")

    return llm_manager.generate_response(user_query=input_data, context=contexto_final)
|
core/pipeline/utils.py
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# core/pipeline/utils
|
2 |
+
import re
|
3 |
+
from typing import Union
|
4 |
+
|
5 |
+
|
6 |
+
def limpiar_contexto_bruto(contexto: str) -> str:
    """Normalize raw retrieved context into clean sentence-separated prose.

    Collapses newlines into sentence breaks, squeezes repeated whitespace,
    fixes repeated/leading periods, and guarantees a trailing period.
    """
    # Ordered regex passes; order matters (newline handling must run first).
    passes = (
        (r"\.\s*\n+", ". "),   # newline already preceded by a period -> just a space
        (r"(?<!\.)\n+", ". "),  # newline with no period before it -> add one
        (r"\s{2,}", " "),       # squeeze runs of whitespace
        (r"\.{2,}", "."),       # collapse double/triple periods
        (r"^\.\s*", ""),        # drop a stray leading period
    )

    texto = contexto.strip()
    for patron, reemplazo in passes:
        texto = re.sub(patron, reemplazo, texto)

    # Guarantee the cleaned context ends with a period.
    if not texto.endswith("."):
        texto = f"{texto}."
    return texto.strip()
|
32 |
+
|
33 |
+
|
34 |
+
def limitar_contexto(contexto: str, max_tokens: int = 1500) -> str:
    """Truncate the context when it exceeds the limit, appending a marker.

    NOTE: despite the parameter name, the limit is measured in characters
    (``len``), not model tokens.
    """
    if len(contexto) <= max_tokens:
        return contexto
    return f"{contexto[:max_tokens]}\n[Contexto truncado...]"
|
39 |
+
|
40 |
+
|
41 |
+
def validar_input(input_data: Union[str, bytes]) -> bool:
    """Return True for non-blank text or any bytes payload (image); else False."""
    # Bytes are always accepted (assumed to be an image payload).
    if isinstance(input_data, bytes):
        return True
    # Text must contain at least one non-whitespace character.
    return isinstance(input_data, str) and bool(input_data.strip())
|
core/vectorstore/distance_strategy.py
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# core/vectorstore/distance_strategy
|
2 |
+
from langchain_community.vectorstores.faiss import DistanceStrategy
|
3 |
+
|
4 |
+
|
5 |
+
class DistanceStrategyManager:
    """Singleton exposing the FAISS distance strategy used across the app.

    The strategy is permanently fixed to COSINE (embeddings are normalized).
    """

    _instance = None  # cached singleton instance

    def __new__(cls):
        # Lazily create the single shared instance on first construction.
        instance = cls._instance
        if instance is None:
            instance = super().__new__(cls)
            cls._instance = instance
        return instance

    @property
    def strategy(self) -> DistanceStrategy:
        """Fixed distance strategy applied to every vector search."""
        return DistanceStrategy.COSINE
|
core/vectorstore/document_processor.py
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# core/vectorstore/document_processor.py
|
2 |
+
import os
|
3 |
+
|
4 |
+
from langchain_community.document_loaders import (
|
5 |
+
DirectoryLoader,
|
6 |
+
Docx2txtLoader,
|
7 |
+
PyMuPDFLoader,
|
8 |
+
TextLoader,
|
9 |
+
)
|
10 |
+
|
11 |
+
|
12 |
+
class DocumentProcessor:
    """Converts every supported file in a directory into LangChain documents."""

    def __init__(self, path: str):
        """Remember the directory whose files will be converted to text."""
        self.path = path

    def files_to_texts(self) -> list:
        """Load all .pdf/.txt/.docx/.doc files under ``self.path`` as documents."""
        # Normalized mapping: glob pattern -> (loader class, loader kwargs).
        config = {
            "*.pdf": (PyMuPDFLoader, None),
            "*.txt": (TextLoader, {"encoding": "utf-8"}),
            "*.docx": (Docx2txtLoader, None),
            "*.doc": (Docx2txtLoader, None),
        }

        filenames = os.listdir(self.path)
        documents = []
        for pattern, (loader_cls, loader_kwargs) in config.items():
            suffix = pattern[1:]  # "*.pdf" -> ".pdf"
            # Only instantiate a DirectoryLoader when matching files exist.
            if not any(name.endswith(suffix) for name in filenames):
                continue
            directory_loader = DirectoryLoader(
                path=self.path,
                glob=pattern,
                loader_cls=loader_cls,
                loader_kwargs=loader_kwargs,
            )
            documents.extend(directory_loader.load())

        return documents
|
core/vectorstore/embeddings.py
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# core/vectorstore/embeddings.py
|
2 |
+
import os
|
3 |
+
|
4 |
+
from dotenv import load_dotenv
|
5 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
6 |
+
|
7 |
+
load_dotenv(dotenv_path="config/.env")
|
8 |
+
|
9 |
+
|
10 |
+
class EmbeddingManager:
    """Singleton wrapper around a HuggingFace embedding model.

    Embeddings are L2-normalized so cosine similarity can be used downstream.
    """

    _instance = None  # cached singleton instance

    def __new__(cls):
        if cls._instance is not None:
            return cls._instance
        # Model name comes from the environment, with a multilingual default.
        model_name = os.getenv(
            "MODEL_EMBEDDINGS",
            "sentence-transformers/distiluse-base-multilingual-cased",
        )
        instance = super().__new__(cls)
        # The embeddings object lives on the instance, created exactly once.
        instance.embeddings = HuggingFaceEmbeddings(
            model_name=model_name,
            encode_kwargs={"normalize_embeddings": True},
        )
        cls._instance = instance
        return instance

    @classmethod
    def get_embeddings(cls):
        """Return the shared embeddings object, creating the singleton on demand."""
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance.embeddings
|
core/vectorstore/vectorstore_manager.py
ADDED
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# core/vectorstore/vectorstore_manager.py
|
2 |
+
import os
|
3 |
+
import faiss
|
4 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
5 |
+
from langchain_community.docstore.in_memory import InMemoryDocstore
|
6 |
+
from langchain_community.vectorstores import FAISS as FAISS_STORE
|
7 |
+
from vectorstore.document_processor import DocumentProcessor
|
8 |
+
from vectorstore.embeddings import EmbeddingManager
|
9 |
+
from vectorstore.distance_strategy import DistanceStrategyManager
|
10 |
+
from loguru import logger
|
11 |
+
|
12 |
+
|
13 |
+
class VectorStoreManager:
    """
    Minimal FAISS management for EDULLM:
    - Indexes documents
    - Loads/saves the index
    - Exposes a retriever for RAG
    """

    def __init__(self, path: str, name: str):
        # Directory containing the source documents to index.
        self.path = path
        # On-disk location of the FAISS index (always under "database/").
        self.store_path = os.path.join("database", name)
        self.embeddings = EmbeddingManager.get_embeddings()
        self.strategy = DistanceStrategyManager().strategy
        self.vectorstore = None
        logger.info(f"🔹 Inicializando VectorStoreManager en ruta: {self.store_path}")
        self._initialize()

    def _initialize(self):
        """Load the persisted index if present; otherwise create an empty one."""
        if self.exist_vectorstore():
            logger.info("✅ Índice FAISS encontrado. Cargando desde disco...")
            self.vectorstore = self.load_vectorstore()
        else:
            logger.warning("⚠️ No existe índice previo. Creando índice vacío...")
            # Embed a dummy query only to discover the embedding dimensionality.
            dummy = self.embeddings.embed_query("init")
            # NOTE(review): an L2 flat index is built while self.strategy is
            # COSINE — for cosine over normalized vectors an inner-product
            # index (IndexFlatIP) is usually expected; confirm against the
            # LangChain FAISS integration before changing.
            index = faiss.IndexFlatL2(len(dummy))
            self.vectorstore = FAISS_STORE(
                embedding_function=self.embeddings,
                index=index,
                docstore=InMemoryDocstore(),
                index_to_docstore_id={},
                distance_strategy=self.strategy,
            )

    def create_vectorstore(self) -> None:
        """Index all documents under self.path, then persist the store to disk."""
        logger.info(f"🚀 Procesando documentos en '{self.path}' para indexar...")
        docs = DocumentProcessor(self.path).files_to_texts()
        # 1000-char chunks with a 400-char overlap between consecutive chunks.
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=400)
        chunks = splitter.split_documents(docs)
        self.vectorstore.add_documents(chunks)
        self.save_vectorstore()
        logger.success("🎯 Vectorstore creado y guardado correctamente.")

    def save_vectorstore(self) -> None:
        """Persist the in-memory FAISS index to self.store_path (best-effort).

        Errors are logged and swallowed — callers are not notified of failure.
        """
        try:
            os.makedirs(self.store_path, exist_ok=True)
            self.vectorstore.save_local(self.store_path)
            logger.info(f"💾 Índice guardado en '{self.store_path}'.")
        except Exception as e:
            logger.error(f"❌ Error al guardar el vectorstore: {e}")

    def load_vectorstore(self):
        """Load and return the FAISS store from disk; re-raises on failure.

        allow_dangerous_deserialization=True is required because FAISS stores
        pickle data — only load indices produced by this application.
        """
        try:
            logger.info(f"📂 Cargando vectorstore desde '{self.store_path}'.")
            return FAISS_STORE.load_local(
                folder_path=self.store_path,
                embeddings=self.embeddings,
                allow_dangerous_deserialization=True,
                distance_strategy=self.strategy,
            )
        except Exception as e:
            logger.error(f"❌ Error al cargar el vectorstore: {e}")
            raise

    def exist_vectorstore(self) -> bool:
        """Check whether the vectorstore exists, creating the base folder if needed."""
        base_dir = "database"

        if not os.path.isdir(base_dir):
            logger.warning(f"📂 Directorio base '{base_dir}' no encontrado. Creando...")
            os.makedirs(base_dir, exist_ok=True)
            return False

        # Existence is judged by the directory alone; the index files inside
        # are not verified here (load_vectorstore will fail if they're missing).
        if os.path.isdir(self.store_path):
            logger.info(f"✅ Vectorstore encontrado en '{self.store_path}'.")
            return True
        else:
            logger.info(f"ℹ️ Vectorstore no existe aún en '{self.store_path}'.")
            return False

    def as_retriever(
        self,
        search_type: str = "similarity_score_threshold",
        k: int = 4,
        score_threshold: float = 0.75,
        fallback_to_similarity: bool = True,
        **kwargs,
    ):
        """Build a retriever over the store.

        When fallback_to_similarity is True, returns a wrapper that retries
        with plain similarity search if the threshold search yields nothing.
        Note: extra **kwargs are accepted but currently ignored.
        """
        if not self.vectorstore:
            self.vectorstore = self.load_vectorstore()

        logger.debug(
            f"🔍 Configurando retriever: type={search_type}, k={k}, threshold={score_threshold}"
        )
        # score_threshold is only honored by "similarity_score_threshold" search.
        search_kwargs = {"k": k, "score_threshold": score_threshold}
        retriever = self.vectorstore.as_retriever(
            search_type=search_type, search_kwargs=search_kwargs
        )

        if fallback_to_similarity:
            logger.info(
                "🛡️ Fallback activado: Si no hay resultados, se usará búsqueda por similarity."
            )

            class SafeRetriever:
                # Minimal retriever adapter: try primary, fall back when empty.
                def __init__(self, primary, fallback):
                    self.primary = primary
                    self.fallback = fallback

                def invoke(self, query):
                    docs = self.primary.invoke(query)
                    if not docs:
                        logger.warning(
                            "⚠️ Sin resultados en threshold. Aplicando fallback a similarity."
                        )
                        return self.fallback.invoke(query)
                    return docs

            fallback_retriever = self.vectorstore.as_retriever(
                search_type="similarity", search_kwargs={"k": k}
            )

            return SafeRetriever(retriever, fallback_retriever)

        return retriever
|
database/edullm_store/index.faiss
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f6c2ede3f19b16d718dca7ce671ff05cd9b78791f7ed354008f04683c8d2ff91
|
3 |
+
size 3245613
|
database/edullm_store/index.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6bacc3eb65b08b3b873d12c628f7b5d75c121c526b79d9aa1b4850ca39f8e01b
|
3 |
+
size 2286011
|
docs/curriculo-nacional-de-la-educacion-basica.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a745ca842113f53609c435df9a13eda67371164b22ccb8492b2b02167dc2299b
|
3 |
+
size 12381710
|
docs/programa-curricular-educacion-primaria_compressed.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:161de73bc723682387ff1f3e70ec9424a14c5c652518cd49723443c423852387
|
3 |
+
size 22097096
|
main.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from core.integrations.telegram_bot import start_bot
|
2 |
+
|
3 |
+
|
4 |
+
def run_telegram_bot():
    """Start the Telegram bot (delegates to core.integrations.telegram_bot.start_bot)."""
    start_bot()
|
6 |
+
|
7 |
+
|
8 |
+
if __name__ == "__main__":
    try:
        # Start only the Telegram bot.
        # NOTE(review): if start_bot() blocks on polling (typical for bot
        # frameworks), the success message below only prints after shutdown —
        # confirm against telegram_bot.start_bot.
        run_telegram_bot()
        print("✅ Bot de Telegram está en ejecución...")
    except KeyboardInterrupt:
        print("¡Hasta pronto!")
|
requirements.txt
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
faiss-cpu>=1.10.0
|
2 |
+
langchain>=0.3.24
|
3 |
+
langchain-community>=0.3.22
|
4 |
+
langchain-huggingface>=0.1.2
|
5 |
+
langchain-openai>=0.3.14
|
6 |
+
python-telegram-bot>=22.0
|
7 |
+
loguru>=0.7.3
|
8 |
+
python-dotenv>=1.0.1
|
9 |
+
pypandoc>=1.15
|
10 |
+
pillow>=11.2.1
|
11 |
+
pymupdf>=1.25.5
|
12 |
+
pypdf2>=3.0.1
|
13 |
+
python-docx>=1.1.2
|
14 |
+
reportlab>=4.4.0
|
15 |
+
docx2txt>=0.9
|
16 |
+
pytest>=8.3.5
|
17 |
+
pytest-cov>=6.1.1
|