Redmind committed on
Commit
77175c7
·
verified ·
1 Parent(s): 4e117fe

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -10
app.py CHANGED
@@ -1,6 +1,6 @@
1
  from fastapi import FastAPI
2
  import os
3
- import pymupdf # PyMuPDF
4
  from pptx import Presentation
5
  from sentence_transformers import SentenceTransformer
6
  import torch
@@ -11,18 +11,18 @@ import numpy as np
11
 
12
  app = FastAPI()
13
 
14
- # Initialize ChromaDB
15
  client = chromadb.PersistentClient(path="/data/chroma_db")
16
- collection = client.get_or_create_collection(name="knowledge_base")
17
 
18
  # File Paths
19
  pdf_file = "Sutures and Suturing techniques.pdf"
20
  pptx_file = "impalnt 1.pptx"
21
 
22
  # Initialize Embedding Models
23
- text_model = SentenceTransformer('paraphrase-MiniLM-L12-v2') # 384D text embeddings (not 512D)
24
- model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
25
- processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
26
 
27
  # Image Storage Folder
28
  IMAGE_FOLDER = "/data/extracted_images"
@@ -31,7 +31,7 @@ os.makedirs(IMAGE_FOLDER, exist_ok=True)
31
  # Extract Text from PDF
32
  def extract_text_from_pdf(pdf_path):
33
  try:
34
- doc = pymupdf.open(pdf_path)
35
  text = " ".join(page.get_text() for page in doc)
36
  return text.strip() if text else None
37
  except Exception as e:
@@ -53,7 +53,7 @@ def extract_text_from_pptx(pptx_path):
53
  # Extract Images from PDF
54
  def extract_images_from_pdf(pdf_path):
55
  try:
56
- doc = pymupdf.open(pdf_path)
57
  images = []
58
  for i, page in enumerate(doc):
59
  for img_index, img in enumerate(page.get_images(full=True)):
@@ -93,9 +93,9 @@ def get_text_embedding(text):
93
  def get_image_embedding(image_path):
94
  try:
95
  image = Image.open(image_path)
96
- inputs = processor(images=image, return_tensors="pt")
97
  with torch.no_grad():
98
- image_embedding = model.get_image_features(**inputs).numpy().flatten()
99
  return image_embedding.tolist()
100
  except Exception as e:
101
  print(f"Error generating image embedding: {e}")
 
1
  from fastapi import FastAPI
2
  import os
3
+ import fitz # PyMuPDF
4
  from pptx import Presentation
5
  from sentence_transformers import SentenceTransformer
6
  import torch
 
11
 
12
  app = FastAPI()
13
 
14
+ # Initialize ChromaDB collection (cosine distance, no built-in embedding function; the embedding dimension is not set here)
15
  client = chromadb.PersistentClient(path="/data/chroma_db")
16
+ collection = client.get_or_create_collection(name="knowledge_base", metadata={"hnsw:space": "cosine"}, embedding_function=None)
17
 
18
  # File Paths
19
  pdf_file = "Sutures and Suturing techniques.pdf"
20
  pptx_file = "impalnt 1.pptx"
21
 
22
  # Initialize Embedding Models
23
+ text_model = SentenceTransformer('paraphrase-MiniLM-L12-v2') # 384D text embeddings (CLIP image features are 512D)
24
+ clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
25
+ clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
26
 
27
  # Image Storage Folder
28
  IMAGE_FOLDER = "/data/extracted_images"
 
31
  # Extract Text from PDF
32
  def extract_text_from_pdf(pdf_path):
33
  try:
34
+ doc = fitz.open(pdf_path)
35
  text = " ".join(page.get_text() for page in doc)
36
  return text.strip() if text else None
37
  except Exception as e:
 
53
  # Extract Images from PDF
54
  def extract_images_from_pdf(pdf_path):
55
  try:
56
+ doc = fitz.open(pdf_path)
57
  images = []
58
  for i, page in enumerate(doc):
59
  for img_index, img in enumerate(page.get_images(full=True)):
 
93
  def get_image_embedding(image_path):
94
  try:
95
  image = Image.open(image_path)
96
+ inputs = clip_processor(images=image, return_tensors="pt")
97
  with torch.no_grad():
98
+ image_embedding = clip_model.get_image_features(**inputs).squeeze().numpy()
99
  return image_embedding.tolist()
100
  except Exception as e:
101
  print(f"Error generating image embedding: {e}")