from fastapi import FastAPI
import os
import pymupdf
from pptx import Presentation # python-pptx for PowerPoint
from sentence_transformers import SentenceTransformer # Text embeddings
import torch
from transformers import CLIPProcessor, CLIPModel # Image embeddings
from PIL import Image
import chromadb
app = FastAPI()
client = chromadb.PersistentClient(path="/data/chroma_db")
# Text embeddings (384-dim MiniLM) and image embeddings (512-dim CLIP) have
# different dimensionalities, so they are kept in separate collections.
collection = client.get_or_create_collection(name="knowledge_base")
image_collection = client.get_or_create_collection(name="knowledge_base_images")
pdf_file = "Sutures and Suturing techniques.pdf"
pptx_file = "impalnt 1.pptx"
# Initialize models
text_model = SentenceTransformer('all-MiniLM-L6-v2')
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
# Folder for extracted images
IMAGE_FOLDER = "/data/extracted_images"
os.makedirs(IMAGE_FOLDER, exist_ok=True)
### Step 1: Extract Text from PDF ###
def extract_text_from_pdf(pdf_path):
    text = ""
    doc = pymupdf.open(pdf_path)
    for page in doc:
        text += page.get_text() + "\n"
    return text.strip()
### Step 2: Extract Text from PowerPoint ###
def extract_text_from_pptx(pptx_path):
    text = ""
    prs = Presentation(pptx_path)
    for slide in prs.slides:
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                text += shape.text + "\n"
    return text.strip()
### Step 3: Extract Images from PDF ###
def extract_images_from_pdf(pdf_path):
    images = []
    doc = pymupdf.open(pdf_path)
    for i, page in enumerate(doc):
        for img_index, img in enumerate(page.get_images(full=True)):
            xref = img[0]
            image = doc.extract_image(xref)
            img_bytes = image["image"]
            img_ext = image["ext"]
            img_path = f"{IMAGE_FOLDER}/pdf_image_{i}_{img_index}.{img_ext}"
            with open(img_path, "wb") as f:
                f.write(img_bytes)
            images.append(img_path)
    return images
### Step 4: Extract Images from PowerPoint ###
def extract_images_from_pptx(pptx_path):
    images = []
    prs = Presentation(pptx_path)
    for i, slide in enumerate(prs.slides):
        for j, shape in enumerate(slide.shapes):
            if shape.shape_type == 13:  # MSO_SHAPE_TYPE.PICTURE
                image = shape.image
                img_bytes = image.blob
                img_ext = image.ext
                # Include the shape index so multiple pictures on one slide
                # do not overwrite each other.
                img_path = f"{IMAGE_FOLDER}/pptx_image_{i}_{j}.{img_ext}"
                with open(img_path, "wb") as f:
                    f.write(img_bytes)
                images.append(img_path)
    return images
### Step 5: Convert Text to Embeddings ###
def get_text_embedding(text):
    return text_model.encode(text).tolist()
### Step 6: Convert Images to Embeddings ###
def get_image_embedding(image_path):
    image = Image.open(image_path)
    inputs = clip_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        embedding = clip_model.get_image_features(**inputs)
    return embedding.squeeze().tolist()
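# Optional sketch (not part of the original flow): MiniLM and CLIP embed into
# different vector spaces, so the stored image vectors can only be queried with
# CLIP *text* embeddings, not with get_text_embedding(). A helper like the one
# below could be used to search image_collection with a text query.
def get_clip_text_embedding(text):
    inputs = clip_processor(text=[text], return_tensors="pt", padding=True)
    with torch.no_grad():
        embedding = clip_model.get_text_features(**inputs)
    return embedding.squeeze().tolist()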
### Step 7: Store Data in ChromaDB ###
def store_data(texts, image_paths):
    # Store text embeddings
    for i, text in enumerate(texts):
        text_embedding = get_text_embedding(text)
        collection.add(ids=[f"text_{i}"], embeddings=[text_embedding], documents=[text])
    # Store image embeddings in their own collection (CLIP vectors are a
    # different size than the MiniLM text vectors)
    for j, image_path in enumerate(image_paths):
        image_embedding = get_image_embedding(image_path)
        image_collection.add(ids=[f"image_{j}"], embeddings=[image_embedding], documents=[image_path])
    print("Data stored successfully!")
### Step 8: Process and Store from Files ###
def process_and_store(pdf_path=None, pptx_path=None):
    texts, images = [], []
    if pdf_path:
        print(f"Processing PDF: {pdf_path}")
        texts.append(extract_text_from_pdf(pdf_path))
        images.extend(extract_images_from_pdf(pdf_path))
    if pptx_path:
        print(f"Processing PPTX: {pptx_path}")
        texts.append(extract_text_from_pptx(pptx_path))
        images.extend(extract_images_from_pptx(pptx_path))
    store_data(texts, images)
# Runs once at import time, so the knowledge base is rebuilt on every startup.
process_and_store(pdf_path=pdf_file, pptx_path=pptx_file)
@app.get("/")
def greet_json():
return {"Hello": "World!"}
@app.get("/test")
def greet_json():
return {"Hello": "Redmind!"}
@app.get("/search/")
def search(query: str):
query_embedding = get_text_embedding(query)
results = collection.query(
query_embeddings=[query_embedding],
n_results=5
)
return {"results": results["documents"]}