Update app.py
app.py (CHANGED)
@@ -1,6 +1,6 @@
 from fastapi import FastAPI
 import os
-import
+import fitz  # PyMuPDF
 from pptx import Presentation
 from sentence_transformers import SentenceTransformer
 import torch
@@ -11,18 +11,18 @@ import numpy as np
 
 app = FastAPI()
 
-# Initialize ChromaDB
+# Initialize ChromaDB with 512 dimensions
 client = chromadb.PersistentClient(path="/data/chroma_db")
-collection = client.get_or_create_collection(name="knowledge_base")
+collection = client.get_or_create_collection(name="knowledge_base", metadata={"hnsw:space": "cosine"}, embedding_function=None)
 
 # File Paths
 pdf_file = "Sutures and Suturing techniques.pdf"
 pptx_file = "impalnt 1.pptx"
 
 # Initialize Embedding Models
-text_model = SentenceTransformer('paraphrase-MiniLM-L12-v2') # 512D embeddings
-
-
+text_model = SentenceTransformer('paraphrase-MiniLM-L12-v2') # 512D text embeddings
+clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
 
 # Image Storage Folder
 IMAGE_FOLDER = "/data/extracted_images"
@@ -31,7 +31,7 @@ os.makedirs(IMAGE_FOLDER, exist_ok=True)
 # Extract Text from PDF
 def extract_text_from_pdf(pdf_path):
     try:
-        doc =
+        doc = fitz.open(pdf_path)
         text = " ".join(page.get_text() for page in doc)
         return text.strip() if text else None
     except Exception as e:
@@ -53,7 +53,7 @@ def extract_text_from_pptx(pptx_path):
 # Extract Images from PDF
 def extract_images_from_pdf(pdf_path):
     try:
-        doc =
+        doc = fitz.open(pdf_path)
         images = []
         for i, page in enumerate(doc):
             for img_index, img in enumerate(page.get_images(full=True)):
@@ -93,9 +93,9 @@ def get_text_embedding(text):
 def get_image_embedding(image_path):
     try:
         image = Image.open(image_path)
-        inputs =
+        inputs = clip_processor(images=image, return_tensors="pt")
         with torch.no_grad():
-            image_embedding =
+            image_embedding = clip_model.get_image_features(**inputs).squeeze().numpy()
         return image_embedding.tolist()
     except Exception as e:
         print(f"Error generating image embedding: {e}")
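The added code refers to several names whose imports sit in the unchanged lines of app.py and are therefore not shown in the hunks; only import numpy as np is confirmed by the context in the second hunk header. A minimal sketch of what that hidden import block presumably contains, inferred from the calls visible in the diff:

    import chromadb                                     # PersistentClient / get_or_create_collection
    from transformers import CLIPModel, CLIPProcessor   # CLIP model and preprocessor for image embeddings
    from PIL import Image                                # Image.open() inside get_image_embedding()
    import numpy as np                                   # confirmed by the hunk header context

How the extracted content gets written into the collection also falls outside the visible hunks. A hypothetical usage sketch, assuming the standard chromadb collection.add API, that extract_images_from_pdf returns file paths under IMAGE_FOLDER, and that text_model.encode is used for text:

    # Hypothetical wiring, not part of the diff: embed extracted content and store it in Chroma.
    text = extract_text_from_pdf(pdf_file)
    if text:
        text_embedding = text_model.encode(text).tolist()
        collection.add(ids=["pdf-text-0"], embeddings=[text_embedding], documents=[text])

    for image_path in extract_images_from_pdf(pdf_file):
        image_embedding = get_image_embedding(image_path)
        if image_embedding:
            collection.add(ids=[image_path], embeddings=[image_embedding], metadatas=[{"source": image_path}])

One design point worth double-checking against the "512D" comments: paraphrase-MiniLM-L12-v2 produces 384-dimensional sentence embeddings, while CLIP ViT-B/32 image features are 512-dimensional, and a single Chroma collection expects vectors of one fixed size, so text and image embeddings may need separate collections or a projection step.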