Redmind committed
Commit bbe1084 · verified · 1 Parent(s): 5419df9

Update app.py

Files changed (1):
  1. app.py +41 -91
app.py CHANGED
@@ -1,55 +1,40 @@
 from fastapi import FastAPI
 import os
-
 import pymupdf
-from pptx import Presentation  # python-pptx for PowerPoint
+from pptx import Presentation  # PowerPoint
 from sentence_transformers import SentenceTransformer  # Text embeddings
 import torch
 from transformers import CLIPProcessor, CLIPModel  # Image embeddings
 from PIL import Image
 import chromadb
-
+import numpy as np
+from sklearn.decomposition import PCA
 
 app = FastAPI()
 client = chromadb.PersistentClient(path="/data/chroma_db")
 collection = client.get_or_create_collection(name="knowledge_base")
 
-print("Created collection with 512 dimensions!")
-
-pdf_file = "Sutures and Suturing techniques.pdf"
-pptx_file = "impalnt 1.pptx"
-
-
-collection = client.get_collection(name="knowledge_base")
-print("Collection Embedding Dimension:", collection.metadata)
+pdf_file = "Sutures and Suturing techniques.pdf"
+pptx_file = "impalnt 1.pptx"
 
 # Initialize models
 text_model = SentenceTransformer('all-MiniLM-L6-v2')
+model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
 
-# Folder for extracted images
 IMAGE_FOLDER = "/data/extracted_images"
 os.makedirs(IMAGE_FOLDER, exist_ok=True)
-### Step 1: Extract Text from PDF ###
+
+# Extract text from PDF
 def extract_text_from_pdf(pdf_path):
-    text = ""
-    doc = pymupdf.open(pdf_path)
-    for page in doc:
-        text += page.get_text() + "\n"
+    text = "\n".join([page.get_text() for page in pymupdf.open(pdf_path)])
     return text.strip()
 
-
-### Step 2: Extract Text from PowerPoint ###
+# Extract text from PowerPoint
 def extract_text_from_pptx(pptx_path):
-    text = ""
-    prs = Presentation(pptx_path)
-    for slide in prs.slides:
-        for shape in slide.shapes:
-            if hasattr(shape, "text"):
-                text += shape.text + "\n"
-    return text.strip()
-
+    return "\n".join([shape.text for slide in Presentation(pptx_path).slides for shape in slide.shapes if hasattr(shape, "text")]).strip()
 
-### Step 3: Extract Images from PDF ###
+# Extract images from PDF
 def extract_images_from_pdf(pdf_path):
     images = []
     doc = pymupdf.open(pdf_path)
@@ -57,102 +42,73 @@ def extract_images_from_pdf(pdf_path):
         for img_index, img in enumerate(page.get_images(full=True)):
             xref = img[0]
             image = doc.extract_image(xref)
-            img_bytes = image["image"]
-            img_ext = image["ext"]
-            img_path = f"{IMAGE_FOLDER}/pdf_image_{i}_{img_index}.{img_ext}"
+            img_path = f"{IMAGE_FOLDER}/pdf_image_{i}_{img_index}.{image['ext']}"
             with open(img_path, "wb") as f:
-                f.write(img_bytes)
+                f.write(image["image"])
             images.append(img_path)
     return images
 
-
-### Step 4: Extract Images from PowerPoint ###
+# Extract images from PowerPoint
 def extract_images_from_pptx(pptx_path):
     images = []
     prs = Presentation(pptx_path)
     for i, slide in enumerate(prs.slides):
         for shape in slide.shapes:
-            if shape.shape_type == 13:  # Picture shape type
-                image = shape.image
-                img_bytes = image.blob
-                img_ext = image.ext
-                img_path = f"{IMAGE_FOLDER}/pptx_image_{i}.{img_ext}"
+            if shape.shape_type == 13:  # Picture shape type
+                img_path = f"{IMAGE_FOLDER}/pptx_image_{i}.{shape.image.ext}"
                 with open(img_path, "wb") as f:
-                    f.write(img_bytes)
+                    f.write(shape.image.blob)
                 images.append(img_path)
     return images
 
-
-### Step 5: Convert Text to Embeddings ###
+# Convert text to embeddings
 def get_text_embedding(text):
     return text_model.encode(text).tolist()
 
-from transformers import CLIPProcessor, CLIPModel
-import torch
-import numpy as np
-from sklearn.decomposition import PCA
-
-# ✅ Load CLIP (512-dimensional output)
-model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
-processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
-
+# Extract image embeddings
def get_image_embedding(image_path):
-    """Extracts image embedding and reduces to 384 dimensions"""
-    from PIL import Image
-
     image = Image.open(image_path)
     inputs = processor(images=image, return_tensors="pt")
-
     with torch.no_grad():
-        image_embedding = model.get_image_features(**inputs)  # Shape: (1, 512)
-
-    image_embedding = image_embedding.numpy().flatten()  # Convert to NumPy (512,)
+        image_embedding = model.get_image_features(**inputs).numpy().flatten()
+    return image_embedding.tolist()
 
-    # Reduce to 384 dimensions using PCA
-    pca = PCA(n_components=384)
-    image_embedding_384 = pca.fit_transform(image_embedding.reshape(1, -1))
-
-    return image_embedding_384.flatten().tolist()
-
-
-
-### Step 7: Store Data in ChromaDB ###
+# Store Data in ChromaDB
 def store_data(texts, image_paths):
-    # Store text embeddings
     for i, text in enumerate(texts):
-        text_embedding = get_text_embedding(text)
-        print("Embedding Dimension:", len(text_embedding))
-        collection.add(ids=[f"text_{i}"], embeddings=[text_embedding], documents=[text])
+        collection.add(ids=[f"text_{i}"], embeddings=[get_text_embedding(text)], documents=[text])
 
-    # Store image embeddings
-    for j, image_path in enumerate(image_paths):
-        image_embedding = get_image_embedding(image_path)
-        collection.add(ids=[f"image_{j}"], embeddings=[image_embedding], documents=[image_path])
+    # Collect image embeddings first
+    all_embeddings = [get_image_embedding(img_path) for img_path in image_paths]
+    all_embeddings = np.array(all_embeddings)
+
+    # Apply PCA if enough images exist
+    if all_embeddings.shape[0] >= 384:
+        pca = PCA(n_components=384)
+        transformed_embeddings = pca.fit_transform(all_embeddings)
+    else:
+        transformed_embeddings = all_embeddings  # Use original embeddings
+
+    for j, img_path in enumerate(image_paths):
+        collection.add(ids=[f"image_{j}"], embeddings=[transformed_embeddings[j].tolist()], documents=[img_path])
 
     print("Data stored successfully!")
 
-### Step 8: Process and Store from Files ###
+# Process and store from files
 def process_and_store(pdf_path=None, pptx_path=None):
     texts, images = [], []
-
     if pdf_path:
-        print(f"Processing PDF: {pdf_path}")
         texts.append(extract_text_from_pdf(pdf_path))
         images.extend(extract_images_from_pdf(pdf_path))
-
     if pptx_path:
-        print(f"Processing PPTX: {pptx_path}")
         texts.append(extract_text_from_pptx(pptx_path))
         images.extend(extract_images_from_pptx(pptx_path))
-
     store_data(texts, images)
 
-
-
 process_and_store(pdf_path=pdf_file, pptx_path=pptx_file)
+
 @app.get("/")
 def greet_json():
-
     return {"Hello": "World!"}
 
 @app.get("/test")
@@ -162,11 +118,5 @@ def greet_json():
 @app.get("/search/")
 def search(query: str):
     query_embedding = get_text_embedding(query)
-    results = collection.query(
-        query_embeddings=[query_embedding],
-        n_results=5
-    )
+    results = collection.query(query_embeddings=[query_embedding], n_results=5)
     return {"results": results["documents"]}
-
-
-