"""FastAPI service that indexes a PDF and a PowerPoint deck into ChromaDB.

On import the module extracts text and images from the configured PDF/PPTX
files, embeds them, and stores the embeddings in a persistent ChromaDB
database. The ``/search/`` endpoint then runs a text similarity query against
the stored text documents.
"""

import os

import chromadb
import pymupdf
import torch
from fastapi import FastAPI
from PIL import Image
from pptx import Presentation  # python-pptx for PowerPoint
from pptx.enum.shapes import MSO_SHAPE_TYPE
from sentence_transformers import SentenceTransformer  # Text embeddings
from transformers import CLIPModel, CLIPProcessor  # Image embeddings

app = FastAPI()

client = chromadb.PersistentClient(path="/data/chroma_db")

# get_or_create so that a fresh database does not fail at import time.
collection = client.get_or_create_collection(name="knowledge_base")

# Text and image embeddings come from different models and differ in vector
# size; a Chroma collection only accepts one dimensionality, so image vectors
# are kept in their own collection.
image_collection = client.get_or_create_collection(name="knowledge_base_images")

pdf_file = "Sutures and Suturing techniques.pdf"
pptx_file = "impalnt 1.pptx"

# Initialize models
text_model = SentenceTransformer('all-MiniLM-L6-v2')
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Folder for extracted images
IMAGE_FOLDER = "/data/extracted_images"
os.makedirs(IMAGE_FOLDER, exist_ok=True)


@app.get("/")
def greet_json():
    """Return a static greeting for the root route."""
    return {"Hello": "World!"}


@app.get("/test")
def greet_test():
    """Return a static greeting for the ``/test`` route."""
    return {"Hello": "Redmind!"}


@app.get("/search/")
def search(query: str):
    """Return the documents of the 5 stored text entries nearest to *query*.

    The query is embedded with the text model and matched against the text
    collection only.
    """
    query_embedding = get_text_embedding(query)
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=5,
    )
    return {"results": results["documents"]}


### Step 1: Extract Text from PDF ###
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF, one page per line block.

    Leading and trailing whitespace of the combined text is stripped.
    """
    # The context manager closes the document once all pages are read.
    with pymupdf.open(pdf_path) as doc:
        return "\n".join(page.get_text() for page in doc).strip()


### Step 2: Extract Text from PowerPoint ###
def extract_text_from_pptx(pptx_path):
    """Return the text of every shape that exposes a ``text`` attribute.

    Shapes are visited slide by slide; their texts are joined with newlines
    and the result is stripped.
    """
    prs = Presentation(pptx_path)
    return "\n".join(
        shape.text
        for slide in prs.slides
        for shape in slide.shapes
        if hasattr(shape, "text")
    ).strip()


### Step 3: Extract Images from PDF ###
def extract_images_from_pdf(pdf_path):
    """Write every image embedded in the PDF to IMAGE_FOLDER.

    Returns:
        The list of written file paths, named by page and image index.
    """
    images = []
    # The module is imported as ``pymupdf``; ``fitz`` is not in scope here.
    with pymupdf.open(pdf_path) as doc:
        for i, page in enumerate(doc):
            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]  # first item of the image entry is its xref
                image = doc.extract_image(xref)
                img_bytes = image["image"]
                img_ext = image["ext"]
                img_path = f"{IMAGE_FOLDER}/pdf_image_{i}_{img_index}.{img_ext}"
                with open(img_path, "wb") as f:
                    f.write(img_bytes)
                images.append(img_path)
    return images


### Step 4: Extract Images from PowerPoint ###
def extract_images_from_pptx(pptx_path):
    """Write every picture shape of the presentation to IMAGE_FOLDER.

    Returns:
        The list of written file paths, named by slide and shape index.
    """
    images = []
    prs = Presentation(pptx_path)
    for i, slide in enumerate(prs.slides):
        for shape_index, shape in enumerate(slide.shapes):
            if shape.shape_type != MSO_SHAPE_TYPE.PICTURE:
                continue
            image = shape.image
            img_bytes = image.blob
            img_ext = image.ext
            # The shape index keeps several pictures on one slide from
            # overwriting each other's file.
            img_path = f"{IMAGE_FOLDER}/pptx_image_{i}_{shape_index}.{img_ext}"
            with open(img_path, "wb") as f:
                f.write(img_bytes)
            images.append(img_path)
    return images


### Step 5: Convert Text to Embeddings ###
def get_text_embedding(text):
    """Encode *text* with the sentence-transformer model and return a list."""
    return text_model.encode(text).tolist()


### Step 6: Convert Images to Embeddings ###
def get_image_embedding(image_path):
    """Compute the CLIP image features of the file and return them as a list."""
    # Open inside ``with`` so the file handle is released after preprocessing.
    with Image.open(image_path) as image:
        inputs = clip_processor(images=image, return_tensors="pt")
    with torch.no_grad():  # inference only, no gradients needed
        embedding = clip_model.get_image_features(**inputs)
    return embedding.squeeze().tolist()


### Step 7: Store Data in ChromaDB ###
def store_data(texts, image_paths):
    """Embed and store the given texts and images.

    Texts go to ``collection`` with ids ``text_<n>``; images go to
    ``image_collection`` with ids ``image_<n>`` and the file path as the
    document. ``upsert`` is used so that re-running the ingest with the same
    ids replaces the entries instead of colliding with them.
    """
    # Store text embeddings
    for i, text in enumerate(texts):
        text_embedding = get_text_embedding(text)
        collection.upsert(
            ids=[f"text_{i}"],
            embeddings=[text_embedding],
            documents=[text],
        )

    # Store image embeddings
    for j, image_path in enumerate(image_paths):
        image_embedding = get_image_embedding(image_path)
        image_collection.upsert(
            ids=[f"image_{j}"],
            embeddings=[image_embedding],
            documents=[image_path],
        )

    print("Data stored successfully!")


### Step 8: Process and Store from Files ###
def process_and_store(pdf_path=None, pptx_path=None):
    """Extract text and images from the given files and store them.

    Either argument may be omitted (or falsy) to skip that file type.
    """
    texts, images = [], []

    if pdf_path:
        print(f"Processing PDF: {pdf_path}")
        texts.append(extract_text_from_pdf(pdf_path))
        images.extend(extract_images_from_pdf(pdf_path))

    if pptx_path:
        print(f"Processing PPTX: {pptx_path}")
        texts.append(extract_text_from_pptx(pptx_path))
        images.extend(extract_images_from_pptx(pptx_path))

    store_data(texts, images)


# Run the ingest only after every function, model and folder above exists;
# calling it earlier raises NameError during import.
process_and_store(pdf_path=pdf_file, pptx_path=pptx_file)