Redmind committed on
Commit
a928ae7
·
verified ·
1 Parent(s): 416df2e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +74 -28
app.py CHANGED
@@ -1,10 +1,34 @@
1
  from fastapi import FastAPI
 
 
 
 
 
 
 
 
 
2
 
3
  app = FastAPI()
4
- client = chromadb.PersistentClient(path="./chroma_db")
5
  collection = client.get_collection(name="knowledge_base")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  @app.get("/")
7
  def greet_json():
 
8
  return {"Hello": "World!"}
9
 
10
  @app.get("/test")
@@ -20,17 +44,18 @@ def search(query: str):
20
  )
21
  return {"results": results["documents"]}
22
 
23
- import fitz
24
 
 
 
25
  def extract_text_from_pdf(pdf_path):
26
  text = ""
27
  doc = fitz.open(pdf_path)
28
  for page in doc:
29
  text += page.get_text() + "\n"
30
- return text
31
 
32
- from pptx import Presentation
33
 
 
34
  def extract_text_from_pptx(pptx_path):
35
  text = ""
36
  prs = Presentation(pptx_path)
@@ -38,24 +63,29 @@ def extract_text_from_pptx(pptx_path):
38
  for shape in slide.shapes:
39
  if hasattr(shape, "text"):
40
  text += shape.text + "\n"
41
- return text
42
 
43
- import os
44
 
45
- def extract_images_from_pdf(pdf_path, output_folder):
 
 
46
  doc = fitz.open(pdf_path)
47
- os.makedirs(output_folder, exist_ok=True)
48
  for i, page in enumerate(doc):
49
  for img_index, img in enumerate(page.get_images(full=True)):
50
  xref = img[0]
51
  image = doc.extract_image(xref)
52
  img_bytes = image["image"]
53
  img_ext = image["ext"]
54
- with open(f"{output_folder}/image_{i}_{img_index}.{img_ext}", "wb") as f:
 
55
  f.write(img_bytes)
 
 
 
56
 
57
- def extract_images_from_pptx(pptx_path, output_folder):
58
- os.makedirs(output_folder, exist_ok=True)
 
59
  prs = Presentation(pptx_path)
60
  for i, slide in enumerate(prs.slides):
61
  for shape in slide.shapes:
@@ -63,38 +93,54 @@ def extract_images_from_pptx(pptx_path, output_folder):
63
  image = shape.image
64
  img_bytes = image.blob
65
  img_ext = image.ext
66
- with open(f"{output_folder}/image_{i}.{img_ext}", "wb") as f:
 
67
  f.write(img_bytes)
68
- from sentence_transformers import SentenceTransformer
 
69
 
70
- model = SentenceTransformer('all-MiniLM-L6-v2')
71
 
 
72
  def get_text_embedding(text):
73
- return model.encode(text).tolist()
74
- from PIL import Image
75
- import torch
76
- from transformers import CLIPProcessor, CLIPModel
77
 
78
- clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
79
- processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
80
 
 
81
  def get_image_embedding(image_path):
82
  image = Image.open(image_path)
83
- inputs = processor(images=image, return_tensors="pt")
84
  with torch.no_grad():
85
  embedding = clip_model.get_image_features(**inputs)
86
  return embedding.squeeze().tolist()
87
- import chromadb
88
 
89
- client = chromadb.PersistentClient(path="./chroma_db")
90
- collection = client.get_or_create_collection(name="knowledge_base")
91
 
92
- def store_data(texts, images):
 
 
93
  for i, text in enumerate(texts):
94
  text_embedding = get_text_embedding(text)
95
  collection.add(ids=[f"text_{i}"], embeddings=[text_embedding], documents=[text])
96
 
97
- for j, image in enumerate(images):
98
- image_embedding = get_image_embedding(image)
99
- collection.add(ids=[f"image_{j}"], embeddings=[image_embedding], documents=[image])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
 
 
1
  from fastapi import FastAPI
2
+ import os
3
+ import fitz # PyMuPDF for PDFs
4
+ from pptx import Presentation # python-pptx for PowerPoint
5
+ from sentence_transformers import SentenceTransformer # Text embeddings
6
+ import torch
7
+ from transformers import CLIPProcessor, CLIPModel # Image embeddings
8
+ from PIL import Image
9
+ import chromadb
10
+
11
 
12
app = FastAPI()

# Use get_or_create_collection so a fresh deployment (no pre-existing
# collection) does not raise at import time; the prior revision of this
# file used get_or_create_collection for exactly this reason.
client = chromadb.PersistentClient(path="/data/chroma_db")
collection = client.get_or_create_collection(name="knowledge_base")

# Source documents ingested at startup.
pdf_file = "Sutures and Suturing techniques.pdf"
pptx_file = "impalnt 1.pptx"

# Initialize models
text_model = SentenceTransformer('all-MiniLM-L6-v2')
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Folder for extracted images
IMAGE_FOLDER = "/data/extracted_images"
os.makedirs(IMAGE_FOLDER, exist_ok=True)


@app.on_event("startup")
def _ingest_documents():
    """Ingest the bundled PDF/PPTX once the app has fully loaded.

    Calling process_and_store at import time raised NameError because the
    function is defined further down the module; deferring the call to the
    FastAPI startup event resolves the name at run time instead.
    """
    process_and_store(pdf_path=pdf_file, pptx_path=pptx_file)
29
@app.get("/")
def greet_json():
    """Health-check root endpoint; returns a static greeting."""
    return {"Hello": "World!"}
33
 
34
  @app.get("/test")
 
44
  )
45
  return {"results": results["documents"]}
46
 
 
47
 
48
+
49
### Step 1: Extract Text from PDF ###
def extract_text_from_pdf(pdf_path):
    """Return all page text of the PDF at *pdf_path* as one string.

    Page texts are concatenated with newlines; surrounding whitespace is
    stripped from the final result.
    """
    text = ""
    doc = fitz.open(pdf_path)
    try:
        for page in doc:
            text += page.get_text() + "\n"
    finally:
        # fitz documents hold an open file handle; release it even on error.
        doc.close()
    return text.strip()
56
 
 
57
 
58
### Step 2: Extract Text from PowerPoint ###
def extract_text_from_pptx(pptx_path):
    """Return the text of every text-bearing shape in the deck.

    Shape texts are concatenated with newlines and the final result is
    stripped of surrounding whitespace.
    """
    text = ""
    prs = Presentation(pptx_path)
    # NOTE(review): the slide loop was elided by the diff rendering;
    # reconstructed to match the prior revision's body — confirm.
    for slide in prs.slides:
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                text += shape.text + "\n"
    return text.strip()
67
 
 
68
 
69
### Step 3: Extract Images from PDF ###
def extract_images_from_pdf(pdf_path):
    """Extract every embedded image of the PDF into IMAGE_FOLDER.

    Files are named pdf_image_<page>_<index>.<ext>; returns the list of
    written file paths.
    """
    images = []
    doc = fitz.open(pdf_path)
    try:
        for i, page in enumerate(doc):
            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]  # first tuple element is the image xref
                image = doc.extract_image(xref)
                img_bytes = image["image"]
                img_ext = image["ext"]
                img_path = f"{IMAGE_FOLDER}/pdf_image_{i}_{img_index}.{img_ext}"
                with open(img_path, "wb") as f:
                    f.write(img_bytes)
                images.append(img_path)
    finally:
        # Close the document so the PDF file handle is released.
        doc.close()
    return images
84
+
85
 
86
### Step 4: Extract Images from PowerPoint ###
def extract_images_from_pptx(pptx_path):
    """Extract every picture shape of the deck into IMAGE_FOLDER.

    Files are named pptx_image_<slide>_<shape>.<ext>; returns the list of
    written file paths.
    """
    images = []
    prs = Presentation(pptx_path)
    for i, slide in enumerate(prs.slides):
        for j, shape in enumerate(slide.shapes):
            # NOTE(review): the guard line was elided by the diff rendering;
            # 13 == MSO_SHAPE_TYPE.PICTURE, the only shapes carrying .image —
            # confirm against the original.
            if shape.shape_type == 13:
                image = shape.image
                img_bytes = image.blob
                img_ext = image.ext
                # Include the shape index: naming by slide index alone
                # overwrote all but the last picture on a slide.
                img_path = f"{IMAGE_FOLDER}/pptx_image_{i}_{j}.{img_ext}"
                with open(img_path, "wb") as f:
                    f.write(img_bytes)
                images.append(img_path)
    return images
101
 
 
102
 
103
### Step 5: Convert Text to Embeddings ###
def get_text_embedding(text):
    """Embed *text* with the sentence-transformer model; return a plain list."""
    vector = text_model.encode(text)
    return vector.tolist()
 
 
 
106
 
 
 
107
 
108
### Step 6: Convert Images to Embeddings ###
def get_image_embedding(image_path):
    """Embed the image at *image_path* with CLIP; return a plain list."""
    # Context manager closes the underlying file handle, which Image.open
    # otherwise leaves open until GC.
    with Image.open(image_path) as image:
        inputs = clip_processor(images=image, return_tensors="pt")
    # Inference only: disable autograd to avoid building a graph.
    with torch.no_grad():
        embedding = clip_model.get_image_features(**inputs)
    return embedding.squeeze().tolist()
 
115
 
 
 
116
 
117
### Step 7: Store Data in ChromaDB ###
def store_data(texts, image_paths):
    """Embed and persist texts and image paths into the shared collection."""
    # Text documents get ids text_0, text_1, ...
    for idx, doc_text in enumerate(texts):
        collection.add(
            ids=[f"text_{idx}"],
            embeddings=[get_text_embedding(doc_text)],
            documents=[doc_text],
        )

    # Images get ids image_0, image_1, ...; the stored document is the path.
    for idx, path in enumerate(image_paths):
        collection.add(
            ids=[f"image_{idx}"],
            embeddings=[get_image_embedding(path)],
            documents=[path],
        )

    print("Data stored successfully!")
130
+
131
+
132
### Step 8: Process and Store from Files ###
def process_and_store(pdf_path=None, pptx_path=None):
    """Extract text and images from the given files and store their embeddings.

    Either argument may be None, in which case that source is skipped.
    """
    texts = []
    images = []

    if pdf_path:
        print(f"Processing PDF: {pdf_path}")
        texts.append(extract_text_from_pdf(pdf_path))
        images.extend(extract_images_from_pdf(pdf_path))

    if pptx_path:
        print(f"Processing PPTX: {pptx_path}")
        texts.append(extract_text_from_pptx(pptx_path))
        images.extend(extract_images_from_pptx(pptx_path))

    store_data(texts, images)