Redmind committed · Commit c2710ab · verified · 1 Parent(s): 1371299

Update app.py

Files changed (1):
  1. app.py +32 -17
app.py CHANGED
@@ -8,24 +8,22 @@ from transformers import CLIPProcessor, CLIPModel
 from PIL import Image
 import chromadb
 import numpy as np
+from sklearn.decomposition import PCA
 
 app = FastAPI()
 
-# Initialize ChromaDB with 512 dimensions
+# Initialize ChromaDB
 client = chromadb.PersistentClient(path="/data/chroma_db")
-client.delete_collection(name="knowledge_base")
-collection = client.get_or_create_collection(name="knowledge_base", metadata={"dim": 512})
-
-#collection = client.get_or_create_collection(name="knowledge_base", metadata={"hnsw:space": "cosine"}, embedding_function=None)
+collection = client.get_or_create_collection(name="knowledge_base")
 
 # File Paths
 pdf_file = "Sutures and Suturing techniques.pdf"
 pptx_file = "impalnt 1.pptx"
 
 # Initialize Embedding Models
-text_model = SentenceTransformer('paraphrase-MiniLM-L12-v2')  # 512D text embeddings
-clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
-clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+text_model = SentenceTransformer('all-MiniLM-L6-v2')
+model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
 
 # Image Storage Folder
 IMAGE_FOLDER = "/data/extracted_images"
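Note on the ChromaDB change: the old startup code called client.delete_collection on every launch, wiping stored vectors; the new code reuses the persistent collection. One caveat: a persisted collection keeps the dimensionality of the first embeddings added to it, so 512-d vectors left over from an earlier build will reject the new 384-d adds. A minimal guard sketch, assuming chromadb's standard client API; EXPECTED_DIM and reset_if_dim_changed are names introduced here, not part of this commit:

    EXPECTED_DIM = 384  # all-MiniLM-L6-v2 output size

    def reset_if_dim_changed(client, name, expected_dim):
        # Recreate the collection only when stored vectors have a stale dimensionality.
        collection = client.get_or_create_collection(name=name)
        if collection.count() > 0:
            sample = collection.get(limit=1, include=["embeddings"])  # peek at one stored vector
            if len(sample["embeddings"][0]) != expected_dim:
                client.delete_collection(name=name)
                collection = client.get_or_create_collection(name=name)
        return collection

    collection = reset_if_dim_changed(client, "knowledge_base", EXPECTED_DIM)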
@@ -88,17 +86,23 @@ def extract_images_from_pptx(pptx_path):
         print(f"Error extracting images from PPTX: {e}")
         return []
 
-# Convert Text to Embeddings (512D)
+# Convert Text to Embeddings
 def get_text_embedding(text):
     return text_model.encode(text).tolist()
 
-# Extract Image Embeddings (512D)
+# Extract Image Embeddings and Reduce to 384 Dimensions
 def get_image_embedding(image_path):
     try:
         image = Image.open(image_path)
-        inputs = clip_processor(images=image, return_tensors="pt")
+        inputs = processor(images=image, return_tensors="pt")
         with torch.no_grad():
-            image_embedding = clip_model.get_image_features(**inputs).squeeze().numpy()
+            image_embedding = model.get_image_features(**inputs).numpy().flatten()
+
+        # Ensure embedding is 384-dimensional
+        if len(image_embedding) != 384:
+            pca = PCA(n_components=384)
+            image_embedding = pca.fit_transform(image_embedding.reshape(1, -1)).flatten()
+
         return image_embedding.tolist()
     except Exception as e:
         print(f"Error generating image embedding: {e}")
@@ -109,12 +113,21 @@ def store_data(texts, image_paths):
     for i, text in enumerate(texts):
         if text:
             text_embedding = get_text_embedding(text)
-            collection.add(ids=[f"text_{i}"], embeddings=[text_embedding], documents=[text])
+            if len(text_embedding) == 384:
+                collection.add(ids=[f"text_{i}"], embeddings=[text_embedding], documents=[text])
+
+    all_embeddings = [get_image_embedding(img_path) for img_path in image_paths if get_image_embedding(img_path) is not None]
 
-    for j, img_path in enumerate(image_paths):
-        img_embedding = get_image_embedding(img_path)
-        if img_embedding:
-            collection.add(ids=[f"image_{j}"], embeddings=[img_embedding], documents=[img_path])
+    if all_embeddings:
+        all_embeddings = np.array(all_embeddings)
+
+        # Apply PCA only if necessary
+        if all_embeddings.shape[1] != 384:
+            pca = PCA(n_components=384)
+            all_embeddings = pca.fit_transform(all_embeddings)
+
+        for j, img_path in enumerate(image_paths):
+            collection.add(ids=[f"image_{j}"], embeddings=[all_embeddings[j].tolist()], documents=[img_path])
 
     print("Data stored successfully!")
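Note on store_data: the list comprehension calls get_image_embedding twice per path, and dropping None results desynchronizes all_embeddings[j] from image_paths[j] in the final loop; the batch PCA also needs at least 384 rows to produce 384 components. A sketch of the image-storage loop with those issues addressed, reusing the hypothetical project_to_384 helper above; this is one possible cleanup, not the committed code:

    def store_images(image_paths):
        kept = []
        for img_path in image_paths:
            emb = get_image_embedding(img_path)  # embed each image exactly once
            if emb is not None:
                kept.append((img_path, emb))     # keep path and vector paired
        for j, (img_path, emb) in enumerate(kept):
            emb = np.asarray(emb)
            if emb.shape[0] != 384:
                emb = project_to_384(emb)        # per-vector, no sample-count requirement
            collection.add(ids=[f"image_{j}"], embeddings=[emb.tolist()], documents=[img_path])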
 
@@ -133,6 +146,8 @@ def process_and_store(pdf_path=None, pptx_path=None):
         images.extend(extract_images_from_pptx(pptx_path))
     store_data(texts, images)
 
+
+
 # FastAPI Endpoints
 @app.get("/")
 def greet_json():
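Once texts and images sit in the same 384-d collection, retrieval goes through ChromaDB's standard query call; a small usage sketch against the app's collection (the query string is illustrative). Note that MiniLM text vectors and projected CLIP image vectors occupy different embedding spaces, so cross-modal hits from a shared collection should be treated with caution:

    query_embedding = get_text_embedding("How do I tie a surgeon's knot?")
    results = collection.query(query_embeddings=[query_embedding], n_results=3)
    for doc_id, doc in zip(results["ids"][0], results["documents"][0]):
        print(doc_id, doc[:80])  # id and a preview of the matched document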
 