Merlintxu committed on
Commit
4d8597b
verified
1 Parent(s): ac2e8a6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +117 -4
app.py CHANGED
@@ -1,7 +1,120 @@
1
  import gradio as gr
 
 
 
 
 
 
 
 
 
2
 
3
def greet(name):
    """Return the greeting ``"Hello <name>!!"`` for the given name string."""
    pieces = ("Hello ", name, "!!")
    return "".join(pieces)
 
 
5
 
6
# Wrap ``greet`` in a single-function Gradio interface (text in, text out)
# and start the app as soon as the module is executed.
demo = gr.Interface(fn=greet, inputs="text", outputs="text")
demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ import yaml
3
+ import json
4
+ import uuid
5
+ from pathlib import Path
6
+ from docx import Document
7
+ import PyPDF2
8
+ from sentence_transformers import SentenceTransformer
9
+ import tiktoken
10
+ import os
11
 
12
# Load the Hugging Face sentence-embedding model used to vectorize each chunk.
# NOTE(review): all-MiniLM-L6-v2 is documented as truncating inputs beyond
# 256 word pieces, while chunk_text defaults to 500 cl100k_base tokens, so
# the tail of each chunk may not influence its embedding - confirm.
model = SentenceTransformer('all-MiniLM-L6-v2')
# Tokenizer used to split text into token-based chunks.
tokenizer = tiktoken.get_encoding("cl100k_base")
16
 
17
+ # Extrae front-matter YAML (si existe) y cuerpo
18
def extract_front_matter_and_body(text: str):
    """Split a document into its YAML front matter and the remaining body.

    A front-matter block is recognised only at the very start of ``text``
    (an optional byte-order mark is tolerated): an opening ``---`` line,
    the YAML payload, and a closing ``---`` line. Both ``\\n`` and ``\\r\\n``
    line endings are accepted, the payload may be empty, and the closing
    fence may be the last line of the file.

    Args:
        text: Full document contents.

    Returns:
        A ``(meta, body)`` tuple. ``meta`` is the parsed mapping (``{}`` when
        the block is empty) and ``body`` is everything after the closing
        fence. When no front-matter block is found, or the payload parses to
        something other than a mapping (for example a plain paragraph that
        sits between two horizontal rules), ``({}, text)`` is returned so no
        content is lost.

    Raises:
        Any exception raised by ``yaml.safe_load`` for malformed YAML
        propagates unchanged.
    """
    import re  # local import: this module does not import ``re`` at top level

    # MULTILINE lets ``^---`` anchor the closing fence to the start of a line;
    # DOTALL lets the payload and the body span several lines.
    fm_regex = r"\A\ufeff?---[ \t]*\r?\n(.*?)^---[ \t]*(?:\r?\n|\Z)(.*)\Z"
    m = re.match(fm_regex, text, re.DOTALL | re.MULTILINE)
    if m is None:
        return {}, text

    meta = yaml.safe_load(m.group(1))
    if meta is None:
        # Empty front-matter block: nothing to merge, but still strip it.
        return {}, m.group(2)
    if not isinstance(meta, dict):
        # Not real front matter; callers merge ``meta`` with ``**`` and
        # would fail on a string or list, so keep the document intact.
        return {}, text
    return meta, m.group(2)
29
+
30
+ # Chunking en base a tokens
31
def chunk_text(text: str, max_tokens: int = 500, overlap: int = 50):
    """Split ``text`` into overlapping chunks measured in tokens.

    The text is encoded with the module-level ``tokenizer``; consecutive
    windows of at most ``max_tokens`` tokens are decoded back to strings,
    each window starting ``max_tokens - overlap`` tokens after the previous
    one.

    Args:
        text: Text to split.
        max_tokens: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens shared by consecutive chunks; must satisfy
            ``0 <= overlap < max_tokens``.

    Returns:
        A list of chunk strings, empty when ``text`` encodes to no tokens.

    Raises:
        ValueError: If ``max_tokens`` or ``overlap`` is out of range. An
            overlap at or above ``max_tokens`` would otherwise keep the
            window from ever advancing.
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be a positive integer")
    if not 0 <= overlap < max_tokens:
        raise ValueError("overlap must satisfy 0 <= overlap < max_tokens")

    tokens = tokenizer.encode(text)
    step = max_tokens - overlap
    chunks = []
    for start in range(0, len(tokens), step):
        end = start + max_tokens  # slicing clamps this to len(tokens)
        chunks.append(tokenizer.decode(tokens[start:end]))
        if end >= len(tokens):
            # This window already reached the last token; another step would
            # only emit a tail that is fully contained in this chunk.
            break
    return chunks
41
+
42
+ # Procesa un archivo individual (md/docx/pdf)
43
def process_file(path: str, vertical: str, language: str):
    """Turn one document into a list of embedded, metadata-tagged chunks.

    Supported extensions (case-insensitive) are ``.md``/``.markdown`` (read
    as UTF-8, with optional YAML front matter), ``.docx`` and ``.pdf``. Any
    other extension yields an empty list without touching the file.

    Args:
        path: Path of the file to ingest.
        vertical: Value stored under the ``vertical`` metadata key.
        language: Value stored under the ``language`` metadata key.

    Returns:
        A list of ``{'vector': ..., 'metadata': ...}`` dicts, one per chunk
        produced by ``chunk_text``. ``vector`` is the result of
        ``model.encode(chunk).tolist()``. ``metadata`` holds the generated
        ``id`` and 1-based ``chunk_index`` followed by the defaults
        (``vertical``, ``language``, ``source``) overridden by front-matter
        values.
    """
    source = Path(path)
    ext = source.suffix.lower()

    # Extract the raw text (and, for Markdown, the front matter).
    meta = {}
    if ext in ('.md', '.markdown'):
        raw = source.read_text(encoding='utf-8')
        meta, body = extract_front_matter_and_body(raw)
    elif ext == '.docx':
        body = "\n".join(p.text for p in Document(path).paragraphs)
    elif ext == '.pdf':
        reader = PyPDF2.PdfReader(path)
        # extract_text() may yield nothing for a page; treat that as empty.
        body = "\n".join(page.extract_text() or "" for page in reader.pages)
    else:
        return []

    # Front matter that is not a mapping cannot be merged with ``**``.
    if not isinstance(meta, dict):
        meta = {}

    # Defaults first so that front-matter values win.
    meta = {
        'vertical': vertical,
        'language': language,
        'source': source.name,
        **meta,
    }

    records = []
    for index, chunk in enumerate(chunk_text(body), start=1):
        metadata = {
            'id': f"{source.stem}-chunk-{index:04d}",
            'chunk_index': index,
        }
        # Front matter commonly carries its own ``id``; letting it replace
        # the generated keys would give every chunk of the file the same id.
        metadata.update(
            (key, value) for key, value in meta.items() if key not in metadata
        )
        records.append({
            'vector': model.encode(chunk).tolist(),
            'metadata': metadata,
        })
    return records
80
+
81
+ # Función para el botón
82
def _json_default(value):
    """Serialize date/time values (as produced by YAML front matter) to ISO strings."""
    import datetime  # local import: not imported at module level

    if isinstance(value, (datetime.date, datetime.time)):
        return value.isoformat()
    raise TypeError(
        f"Object of type {type(value).__name__} is not JSON serializable"
    )


def run_pipeline(files, vertical, language):
    """Process the uploaded files and write all chunk records to a JSONL file.

    Args:
        files: Uploaded files as delivered by the upload component: either
            path strings or objects exposing the path as ``.name``. ``None``
            (nothing uploaded) is treated as an empty upload.
        vertical: Forwarded to ``process_file`` for every file.
        language: Forwarded to ``process_file`` for every file.

    Returns:
        Path of the generated ``.jsonl`` file inside the system temporary
        directory. Each line is a JSON object with ``id``, ``vector`` and
        ``metadata`` keys; the file is empty when no records were produced.
    """
    import tempfile  # local import: not imported at module level

    all_records = []
    for file in files or []:
        # The upload already lives on disk, so it is processed in place.
        path = file if isinstance(file, str) else file.name
        all_records.extend(process_file(path, vertical, language))

    # Write one JSON object per line.
    out_file = os.path.join(tempfile.gettempdir(), f"{uuid.uuid4().hex}.jsonl")
    with open(out_file, 'w', encoding='utf-8') as f:
        for rec in all_records:
            line = json.dumps(
                {
                    'id': rec['metadata']['id'],
                    'vector': rec['vector'],
                    'metadata': rec['metadata'],
                },
                ensure_ascii=False,
                # YAML parses unquoted dates into date objects, which the
                # json module cannot encode without help.
                default=_json_default,
            )
            f.write(line + "\n")

    return out_file
103
+
104
+ # Interfaz Gradio
105
with gr.Blocks() as demo:
    gr.Markdown("## Ingesta para Amazon S3 Vector Features")
    with gr.Row():
        # ``type`` is deliberately left at the component default:
        # ``type="file"`` is only understood by Gradio 3.x, and newer
        # releases are expected to reject it when the component is built.
        # NOTE(review): confirm against the Gradio version pinned for this
        # Space.
        uploader = gr.File(label="Sube tus documentos", file_count="multiple")
        vertical = gr.Textbox(label="Vertical (p.ej. SEO, eCommerce)", value="general")
        language = gr.Textbox(label="Idioma", value="es")
    btn = gr.Button("Procesar y Generar JSONL")
    output = gr.File(label="Descarga el JSONL")

    # Run the ingestion pipeline on click and offer the resulting JSONL
    # file for download.
    btn.click(
        fn=run_pipeline,
        inputs=[uploader, vertical, language],
        outputs=output,
    )

# Only start the server when executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()