Update app.py
Browse files
app.py
CHANGED
@@ -39,12 +39,18 @@ def save_pdf(file, title):
|
|
39 |
# Extract text
|
40 |
reader = PdfReader(file.name)
|
41 |
full_text = "\n".join(p.extract_text() for p in reader.pages if p.extract_text())
|
|
|
42 |
|
43 |
# Chunk text
|
44 |
chunks = [full_text[i:i+500] for i in range(0, len(full_text), 500)]
|
45 |
|
46 |
# Embed and index
|
47 |
embeddings = embedder.encode(chunks)
|
|
|
|
|
|
|
|
|
|
|
48 |
index = faiss.IndexFlatL2(embeddings.shape[1])
|
49 |
index.add(embeddings)
|
50 |
|
@@ -59,7 +65,7 @@ def save_pdf(file, title):
|
|
59 |
upload_to_hub(index_path, f"data/{title}/index.faiss")
|
60 |
upload_to_hub(chunks_path, f"data/{title}/chunks.pkl")
|
61 |
|
62 |
-
return f"✅ Saved and indexed '{title}', and uploaded to Hub."
|
63 |
|
64 |
# Return all available PDF titles
|
65 |
def list_titles():
|
|
|
39 |
# Extract text
|
40 |
reader = PdfReader(file.name)
|
41 |
full_text = "\n".join(p.extract_text() for p in reader.pages if p.extract_text())
|
42 |
+
print(full_text)
|
43 |
|
44 |
# Chunk text
|
45 |
chunks = [full_text[i:i+500] for i in range(0, len(full_text), 500)]
|
46 |
|
47 |
# Embed and index
|
48 |
embeddings = embedder.encode(chunks)
|
49 |
+
|
50 |
+
print("Embeddings shape:", embeddings.shape)
|
51 |
+
if len(embeddings.shape) != 2:
|
52 |
+
raise ValueError(f"Expected 2D embeddings, got shape {embeddings.shape}")
|
53 |
+
|
54 |
index = faiss.IndexFlatL2(embeddings.shape[1])
|
55 |
index.add(embeddings)
|
56 |
|
|
|
65 |
upload_to_hub(index_path, f"data/{title}/index.faiss")
|
66 |
upload_to_hub(chunks_path, f"data/{title}/chunks.pkl")
|
67 |
|
68 |
+
return f"✅ Saved and indexed '{title}', and uploaded to Hub. Please reload (refresh) the page."
|
69 |
|
70 |
# Return all available PDF titles
|
71 |
def list_titles():
|