restapitrial_vectordb

Sleeping

App Files Community

Redmind committed on Feb 13

Commit

a244d5b

verified ·

1 Parent(s): bbe1084

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -26

app.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from fastapi import FastAPI
 import os
-import pymupdf
 from pptx import Presentation  # PowerPoint
 from sentence_transformers import SentenceTransformer  # Text embeddings
 import torch
@@ -11,33 +11,34 @@ import numpy as np
 from sklearn.decomposition import PCA
 app = FastAPI()
 client = chromadb.PersistentClient(path="/data/chroma_db")
-collection = client.get_or_create_collection(name="knowledge_base")
 pdf_file = "Sutures and Suturing techniques.pdf"
 pptx_file = "impalnt 1.pptx"
 # Initialize models
-text_model = SentenceTransformer('all-MiniLM-L6-v2')
-model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
-processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
 IMAGE_FOLDER = "/data/extracted_images"
 os.makedirs(IMAGE_FOLDER, exist_ok=True)
 # Extract text from PDF
 def extract_text_from_pdf(pdf_path):
-    text = "".join([page.get_text() for page in pymupdf.open(pdf_path)])
-    return text.strip()
 # Extract text from PowerPoint
 def extract_text_from_pptx(pptx_path):
-    return "".join([shape.text for slide in Presentation(pptx_path).slides for shape in slide.shapes if hasattr(shape, "text")]).strip()
 # Extract images from PDF
 def extract_images_from_pdf(pdf_path):
     images = []
-    doc = pymupdf.open(pdf_path)
     for i, page in enumerate(doc):
         for img_index, img in enumerate(page.get_images(full=True)):
             xref = img[0]
@@ -63,34 +64,32 @@ def extract_images_from_pptx(pptx_path):
 # Convert text to embeddings
 def get_text_embedding(text):
-    return text_model.encode(text).tolist()
 # Extract image embeddings
 def get_image_embedding(image_path):
     image = Image.open(image_path)
-    inputs = processor(images=image, return_tensors="pt")
     with torch.no_grad():
-        image_embedding = model.get_image_features(**inputs).numpy().flatten()
     return image_embedding.tolist()
 # Store Data in ChromaDB
 def store_data(texts, image_paths):
     for i, text in enumerate(texts):
         collection.add(ids=[f"text_{i}"], embeddings=[get_text_embedding(text)], documents=[text])
-    # Collect image embeddings first
-    all_embeddings = [get_image_embedding(img_path) for img_path in image_paths]
-    all_embeddings = np.array(all_embeddings)
-    # Apply PCA if enough images exist
-    if all_embeddings.shape[0] >= 384:
-        pca = PCA(n_components=384)
-        transformed_embeddings = pca.fit_transform(all_embeddings)
-    else:
-        transformed_embeddings = all_embeddings  # Use original embeddings
-    for j, img_path in enumerate(image_paths):
-        collection.add(ids=[f"image_{j}"], embeddings=[transformed_embeddings[j].tolist()], documents=[img_path])
     print("Data stored successfully!")
@@ -119,4 +118,4 @@ def greet_json():
 def search(query: str):
     query_embedding = get_text_embedding(query)
     results = collection.query(query_embeddings=[query_embedding], n_results=5)
-    return {"results": results["documents"]}

 from fastapi import FastAPI
 import os
+import fitz  # pymupdf
 from pptx import Presentation  # PowerPoint
 from sentence_transformers import SentenceTransformer  # Text embeddings
 import torch
 from sklearn.decomposition import PCA
+# Module-level setup: everything below runs once, at import time.
 app = FastAPI()
+# Initialize ChromaDB
+# The client persists its data under /data/chroma_db.
 client = chromadb.PersistentClient(path="/data/chroma_db")
+# Request cosine distance for the collection's HNSW index.
+# NOTE(review): get_or_create_collection returns an existing collection when
+# one with this name is already persisted; presumably that collection keeps
+# the metric it was created with — confirm before relying on cosine scores.
+collection = client.get_or_create_collection(name="knowledge_base", metadata={"hnsw:space": "cosine"})
+# Source documents, given as relative paths.
 pdf_file = "Sutures and Suturing techniques.pdf"
 pptx_file = "impalnt 1.pptx"
 # Initialize models
+# Text queries and documents are embedded with text_model; images with CLIP.
+text_model = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L3-v2')  # 384-dim text model
+clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+# Directory for extracted images; created here if it does not exist yet.
 IMAGE_FOLDER = "/data/extracted_images"
 os.makedirs(IMAGE_FOLDER, exist_ok=True)
 # Extract text from PDF
 def extract_text_from_pdf(pdf_path):
+    """Return the text of every page of the PDF at ``pdf_path``.
+
+    The per-page texts are joined with a single space and the result is
+    stripped of leading and trailing whitespace.
+    """
+    # Open the document as a context manager so it is closed as soon as the
+    # text has been read, rather than whenever the object is collected.
+    with fitz.open(pdf_path) as doc:
+        return " ".join(page.get_text() for page in doc).strip()
 # Extract text from PowerPoint
 def extract_text_from_pptx(pptx_path):
+    """Return the text of all text-bearing shapes in the deck at ``pptx_path``.
+
+    Shape texts are collected slide by slide, joined with a single space,
+    and the result is stripped of leading and trailing whitespace.
+    """
+    deck = Presentation(pptx_path)
+    fragments = []
+    for slide in deck.slides:
+        for shape in slide.shapes:
+            # Only shapes that expose a ``text`` attribute contribute.
+            if hasattr(shape, "text"):
+                fragments.append(shape.text)
+    return " ".join(fragments).strip()
 # Extract images from PDF
 def extract_images_from_pdf(pdf_path):
     images = []
+    doc = fitz.open(pdf_path)
     for i, page in enumerate(doc):
         for img_index, img in enumerate(page.get_images(full=True)):
             xref = img[0]
 # Convert text to embeddings
 def get_text_embedding(text):
+    """Encode ``text`` with the module-level ``text_model`` and return a list."""
+    vector = text_model.encode(text)
+    # ``tolist()`` converts the encoder output into plain Python values.
+    return vector.tolist()
 # Extract image embeddings
 def get_image_embedding(image_path):
+    """Return the CLIP image-feature vector for the image at ``image_path``.
+
+    The image is preprocessed with ``clip_processor`` and passed through
+    ``clip_model.get_image_features``; the result is flattened and returned
+    as a plain Python list.
+    """
+    # Image.open keeps the underlying file open until the image is closed, so
+    # scope it with a context manager; preprocessing happens inside the block
+    # while the pixel data is still readable.
+    with Image.open(image_path) as image:
+        inputs = clip_processor(images=image, return_tensors="pt")
+    # Inference only: no gradients are needed.
     with torch.no_grad():
+        features = clip_model.get_image_features(**inputs)
+    return features.numpy().flatten().tolist()
+# Reduce image embedding dimensionality (512 → 384)
+def reduce_embedding_dim(embeddings, target_dim=384):
+    """Reduce row-wise embeddings to at most ``target_dim`` columns.
+
+    Args:
+        embeddings: 2-D array-like with one embedding per row.
+        target_dim: Number of columns in the result. Defaults to 384, the
+            value this function previously hard-coded.
+
+    Returns:
+        A NumPy array with one row per input row. Inputs that already have
+        ``target_dim`` columns or fewer are returned unchanged.
+
+    Raises:
+        ValueError: If ``embeddings`` is not 2-D.
+
+    Note:
+        The PCA is fitted afresh on every call, so rows produced by separate
+        calls are not expressed in the same basis.
+    """
+    matrix = np.asarray(embeddings)
+    if matrix.ndim != 2:
+        raise ValueError(
+            f"expected a 2-D array of embeddings, got {matrix.ndim} dimension(s)"
+        )
+    n_samples, n_features = matrix.shape
+    if n_features <= target_dim:
+        # Already narrow enough; nothing to reduce.
+        return matrix
+    if n_samples < target_dim:
+        # PCA cannot fit more components than there are samples, so
+        # PCA(n_components=target_dim) would raise ValueError here. Fall back
+        # to keeping the leading ``target_dim`` columns so small batches can
+        # still be stored. This is a lossy stop-gap, not a learned projection.
+        return matrix[:, :target_dim]
+    pca = PCA(n_components=target_dim)
+    return pca.fit_transform(matrix)
 # Store Data in ChromaDB
 def store_data(texts, image_paths):
+    """Embed ``texts`` and ``image_paths`` and add them to ``collection``.
+
+    Each text is stored under the id ``text_<index>`` with itself as the
+    document. Each image is stored under ``image_<index>`` with its path as
+    the document; image embeddings wider than 384 columns are first passed
+    through ``reduce_embedding_dim``.
+
+    NOTE(review): ids are positional, so a second call reuses the same ids —
+    confirm how the collection treats ids that already exist.
+    """
+    for text_idx, passage in enumerate(texts):
+        collection.add(
+            ids=[f"text_{text_idx}"],
+            embeddings=[get_text_embedding(passage)],
+            documents=[passage],
+        )
+    if image_paths:
+        image_matrix = np.array([get_image_embedding(path) for path in image_paths])
+        # Only reduce when the image vectors are wider than 384 columns.
+        if image_matrix.shape[1] > 384:
+            image_matrix = reduce_embedding_dim(image_matrix)
+        for image_idx, (path, row) in enumerate(zip(image_paths, image_matrix)):
+            collection.add(
+                ids=[f"image_{image_idx}"],
+                embeddings=[row.tolist()],
+                documents=[path],
+            )
     print("Data stored successfully!")
 def search(query: str):
+    """Embed ``query`` as text and return the documents of the 5 nearest entries.
+
+    The response is ``{"results": [...]}``; the list is empty when the query
+    result carries no documents.
+    """
+    query_vector = get_text_embedding(query)
+    hits = collection.query(query_embeddings=[query_vector], n_results=5)
+    documents = hits["documents"]
+    if not documents:
+        return {"results": []}
+    # One query embedding was sent, so the first entry holds its matches.
+    return {"results": documents[0]}