restapitrial_vectordb

Sleeping

App Files Files Community

Redmind committed on Feb 13

Commit

a928ae7

verified ·

1 Parent(s): 416df2e

Update app.py

Browse files

Files changed (1) hide show

app.py +74 -28

app.py CHANGED Viewed

@@ -1,10 +1,34 @@
 from fastapi import FastAPI
 app = FastAPI()
-client = chromadb.PersistentClient(path="./chroma_db")
 collection = client.get_collection(name="knowledge_base")
 @app.get("/")
 def greet_json():
     return {"Hello": "World!"}
 @app.get("/test")
@@ -20,17 +44,18 @@ def search(query: str):
     )
     return {"results": results["documents"]}
-import fitz
 def extract_text_from_pdf(pdf_path):
     text = ""
     doc = fitz.open(pdf_path)
     for page in doc:
         text += page.get_text() + "\n"
-    return text
-from pptx import Presentation
 def extract_text_from_pptx(pptx_path):
     text = ""
     prs = Presentation(pptx_path)
@@ -38,24 +63,29 @@ def extract_text_from_pptx(pptx_path):
         for shape in slide.shapes:
             if hasattr(shape, "text"):
                 text += shape.text + "\n"
-    return text
-import os
-def extract_images_from_pdf(pdf_path, output_folder):
     doc = fitz.open(pdf_path)
-    os.makedirs(output_folder, exist_ok=True)
     for i, page in enumerate(doc):
         for img_index, img in enumerate(page.get_images(full=True)):
             xref = img[0]
             image = doc.extract_image(xref)
             img_bytes = image["image"]
             img_ext = image["ext"]
-            with open(f"{output_folder}/image_{i}_{img_index}.{img_ext}", "wb") as f:
                 f.write(img_bytes)
-def extract_images_from_pptx(pptx_path, output_folder):
-    os.makedirs(output_folder, exist_ok=True)
     prs = Presentation(pptx_path)
     for i, slide in enumerate(prs.slides):
         for shape in slide.shapes:
@@ -63,38 +93,54 @@ def extract_images_from_pptx(pptx_path, output_folder):
                 image = shape.image
                 img_bytes = image.blob
                 img_ext = image.ext
-                with open(f"{output_folder}/image_{i}.{img_ext}", "wb") as f:
                     f.write(img_bytes)
-from sentence_transformers import SentenceTransformer
-model = SentenceTransformer('all-MiniLM-L6-v2')
 def get_text_embedding(text):
-    return model.encode(text).tolist()
-from PIL import Image
-import torch
-from transformers import CLIPProcessor, CLIPModel
-clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
-processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
 def get_image_embedding(image_path):
     image = Image.open(image_path)
-    inputs = processor(images=image, return_tensors="pt")
     with torch.no_grad():
         embedding = clip_model.get_image_features(**inputs)
     return embedding.squeeze().tolist()
-import chromadb
-client = chromadb.PersistentClient(path="./chroma_db")
-collection = client.get_or_create_collection(name="knowledge_base")
-def store_data(texts, images):
     for i, text in enumerate(texts):
         text_embedding = get_text_embedding(text)
         collection.add(ids=[f"text_{i}"], embeddings=[text_embedding], documents=[text])
-    for j, image in enumerate(images):
-        image_embedding = get_image_embedding(image)
-        collection.add(ids=[f"image_{j}"], embeddings=[image_embedding], documents=[image])

 from fastapi import FastAPI
+import os
+import fitz  # PyMuPDF for PDFs
+from pptx import Presentation  # python-pptx for PowerPoint
+from sentence_transformers import SentenceTransformer  # Text embeddings
+import torch
+from transformers import CLIPProcessor, CLIPModel  # Image embeddings
+from PIL import Image
+import chromadb
 app = FastAPI()
+client = chromadb.PersistentClient(path="/data/chroma_db")
 collection = client.get_collection(name="knowledge_base")  # NOTE(review): presumably fails if "knowledge_base" does not exist yet; a removed line in this same diff used get_or_create_collection - confirm
+pdf_file="Sutures and Suturing techniques.pdf"
+pptx_file="impalnt 1.pptx"
+process_and_store(pdf_path=pdf_file, pptx_path=pptx_file)  # NOTE(review): runs at import time, above the model assignments below and above the def of process_and_store shown later in this diff - confirm this does not raise NameError
+# Initialize models (module-level names read by get_text_embedding / get_image_embedding)
+text_model = SentenceTransformer('all-MiniLM-L6-v2')
+clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+# Folder for extracted images (created at import time if it does not exist)
+IMAGE_FOLDER = "/data/extracted_images"
+os.makedirs(IMAGE_FOLDER, exist_ok=True)
 @app.get("/")
 def greet_json():  # handler registered for GET "/"
     return {"Hello": "World!"}  # fixed greeting payload; takes no input
 @app.get("/test")
     )
     return {"results": results["documents"]}
+### Step 1: Extract Text from PDF ###
 def extract_text_from_pdf(pdf_path):  # returns the text of every page of the PDF at pdf_path as one string
     text = ""
     doc = fitz.open(pdf_path)  # opened with PyMuPDF (imported as fitz); not explicitly closed in this function
     for page in doc:
         text += page.get_text() + "\n"  # one newline is appended after each page's text
+    return text.strip()  # leading/trailing whitespace, including the final newline, is removed
+### Step 2: Extract Text from PowerPoint ###
 def extract_text_from_pptx(pptx_path):
     text = ""
     prs = Presentation(pptx_path)
         for shape in slide.shapes:
             if hasattr(shape, "text"):
                 text += shape.text + "\n"
+    return text.strip()
+### Step 3: Extract Images from PDF ###
+def extract_images_from_pdf(pdf_path):  # writes every image listed by page.get_images to IMAGE_FOLDER; returns the saved paths
+    images = []
     doc = fitz.open(pdf_path)
     for i, page in enumerate(doc):  # i is the zero-based page index
         for img_index, img in enumerate(page.get_images(full=True)):
             xref = img[0]  # first field of the image entry is what extract_image is called with
             image = doc.extract_image(xref)
             img_bytes = image["image"]
             img_ext = image["ext"]
+            img_path = f"{IMAGE_FOLDER}/pdf_image_{i}_{img_index}.{img_ext}"  # file name encodes page index and per-page image index
+            with open(img_path, "wb") as f:
                 f.write(img_bytes)
+            images.append(img_path)
+    return images
+### Step 4: Extract Images from PowerPoint ###
+def extract_images_from_pptx(pptx_path):
+    images = []
     prs = Presentation(pptx_path)
     for i, slide in enumerate(prs.slides):
         for shape in slide.shapes:
                 image = shape.image
                 img_bytes = image.blob
                 img_ext = image.ext
+                img_path = f"{IMAGE_FOLDER}/pptx_image_{i}.{img_ext}"
+                with open(img_path, "wb") as f:
                     f.write(img_bytes)
+                images.append(img_path)
+    return images
+### Step 5: Convert Text to Embeddings ###
 def get_text_embedding(text):
+    return text_model.encode(text).tolist()  # encodes with the module-level text_model; tolist() converts the result to a list
+### Step 6: Convert Images to Embeddings ###
 def get_image_embedding(image_path):  # returns clip_model's image features for the file at image_path as a list
     image = Image.open(image_path)
+    inputs = clip_processor(images=image, return_tensors="pt")  # "pt" asks the processor for PyTorch tensors
     with torch.no_grad():  # inference only; gradient tracking is disabled
         embedding = clip_model.get_image_features(**inputs)
     return embedding.squeeze().tolist()  # squeeze() drops size-1 dimensions before conversion to a list
+### Step 7: Store Data in ChromaDB ###
+def store_data(texts, image_paths):  # adds one entry to the module-level collection per text and per image path
+    # Store text embeddings (ids are text_0, text_1, ... and restart at 0 on every call)
     for i, text in enumerate(texts):
         text_embedding = get_text_embedding(text)
         collection.add(ids=[f"text_{i}"], embeddings=[text_embedding], documents=[text])
+    # Store image embeddings; the stored document is the image's file path, not its bytes. NOTE(review): text and image vectors come from different models but share one collection - confirm their dimensions are compatible
+    for j, image_path in enumerate(image_paths):
+        image_embedding = get_image_embedding(image_path)
+        collection.add(ids=[f"image_{j}"], embeddings=[image_embedding], documents=[image_path])
+    print("Data stored successfully!")
+### Step 8: Process and Store from Files ###
+def process_and_store(pdf_path=None, pptx_path=None):  # either path may be omitted; a falsy path is skipped
+    texts, images = [], []
+    if pdf_path:
+        print(f"Processing PDF: {pdf_path}")
+        texts.append(extract_text_from_pdf(pdf_path))  # the whole PDF becomes a single text entry
+        images.extend(extract_images_from_pdf(pdf_path))
+    if pptx_path:
+        print(f"Processing PPTX: {pptx_path}")
+        texts.append(extract_text_from_pptx(pptx_path))  # the whole presentation becomes a single text entry
+        images.extend(extract_images_from_pptx(pptx_path))
+    store_data(texts, images)  # called even when both lists are empty