Redmind committed
Commit 1cb21d5 · verified · 1 parent: 275bb4e

Create app_copy.py

Files changed (1)
  app_copy.py +314 -0
app_copy.py ADDED
@@ -0,0 +1,314 @@
+ from fastapi import FastAPI
+ import os
+ from io import BytesIO, StringIO
+ import requests
+ import pymupdf  # PyMuPDF
+ from pptx import Presentation
+ from sentence_transformers import SentenceTransformer
+ import torch
+ from transformers import CLIPProcessor, CLIPModel
+ from PIL import Image
+ import chromadb
+ import numpy as np
+ import pandas as pd
+ from sklearn.decomposition import PCA
+
+ app = FastAPI()
+
+ # Initialize ChromaDB
+ client = chromadb.PersistentClient(path="/data/chroma_db")
+ collection = client.get_or_create_collection(name="knowledge_base")
+
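+ # Note: all vectors added to a Chroma collection must share one
+ # dimensionality (fixed by the first insert); everything below is kept
+ # at 384 dims, the MiniLM text size, so text and images share one index.
+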
+ # File Paths
+ pdf_file = "Sutures and Suturing techniques.pdf"
+ pptx_file = "impalnt 1.pptx"
+
+ # Initialize Embedding Models
+ text_model = SentenceTransformer('all-MiniLM-L6-v2')
+ model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
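+ # all-MiniLM-L6-v2 emits 384-dim sentence vectors, while CLIP ViT-B/32
+ # image features are 512-dim; get_image_embedding() below truncates the
+ # CLIP vectors to 384 so both modalities fit the same collection.
+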
+ # Image Storage Folder
+ IMAGE_FOLDER = "/data/extracted_images"
+ os.makedirs(IMAGE_FOLDER, exist_ok=True)
+
+ # Extract Text from PDF
+ def extract_text_from_pdf(pdf_path):
+     try:
+         doc = pymupdf.open(pdf_path)
+         text = " ".join(page.get_text() for page in doc)
+         return text.strip() if text else None
+     except Exception as e:
+         print(f"Error extracting text from PDF: {e}")
+         return None
+
+ # Extract Text from PPTX
+ def extract_text_from_pptx(pptx_path):
+     try:
+         prs = Presentation(pptx_path)
+         text = " ".join(
+             shape.text for slide in prs.slides for shape in slide.shapes if hasattr(shape, "text")
+         )
+         return text.strip() if text else None
+     except Exception as e:
+         print(f"Error extracting text from PPTX: {e}")
+         return None
+
+ # Extract Images from PDF
+ def extract_images_from_pdf(pdf_path):
+     try:
+         doc = pymupdf.open(pdf_path)
+         images = []
+         for i, page in enumerate(doc):
+             for img_index, img in enumerate(page.get_images(full=True)):
+                 xref = img[0]
+                 image = doc.extract_image(xref)
+                 img_path = f"{IMAGE_FOLDER}/pdf_image_{i}_{img_index}.{image['ext']}"
+                 with open(img_path, "wb") as f:
+                     f.write(image["image"])
+                 images.append(img_path)
+         return images
+     except Exception as e:
+         print(f"Error extracting images from PDF: {e}")
+         return []
+
+ # Extract Images from PPTX
+ def extract_images_from_pptx(pptx_path):
+     try:
+         images = []
+         prs = Presentation(pptx_path)
+         for i, slide in enumerate(prs.slides):
+             for j, shape in enumerate(slide.shapes):
+                 if shape.shape_type == 13:  # 13 == MSO_SHAPE_TYPE.PICTURE
+                     # Index by slide and shape so multiple pictures on one
+                     # slide do not overwrite each other
+                     img_path = f"{IMAGE_FOLDER}/pptx_image_{i}_{j}.{shape.image.ext}"
+                     with open(img_path, "wb") as f:
+                         f.write(shape.image.blob)
+                     images.append(img_path)
+         return images
+     except Exception as e:
+         print(f"Error extracting images from PPTX: {e}")
+         return []
+
+ # Convert Text to Embeddings
+ def get_text_embedding(text):
+     return text_model.encode(text).tolist()
+
+ def get_image_embedding(image_path):
+     try:
+         # Load the image (RGB conversion guards against CMYK/alpha inputs)
+         image = Image.open(image_path).convert("RGB")
+         inputs = processor(images=image, return_tensors="pt")
+
+         # Extract image embeddings
+         with torch.no_grad():
+             image_embedding = model.get_image_features(**inputs).numpy().flatten()
+
+         # Print the actual embedding dimension
+         print(f"Image embedding shape: {image_embedding.shape}")
+
+         # Earlier revisions kept 384-dim vectors as-is, reduced larger ones
+         # with PCA, and zero-padded smaller ones; here the 512-dim CLIP
+         # vector is simply truncated to its first 384 dimensions to match
+         # the text embedding size.
+         image_embedding = image_embedding[:384]
+
+         # Print the final embedding shape
+         print(f"Final image embedding shape: {image_embedding.shape}")
+
+         return image_embedding.tolist()
+
+     except Exception as e:
+         print(f"❌ Error generating image embedding: {e}")
+         return None
+
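+ # Caveat: truncation equalizes vector length, but CLIP image features and
+ # MiniLM text features come from different embedding spaces, so distances
+ # between a text query and an image entry are only a rough signal.
+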
+ # Store Data in ChromaDB
+ def store_data(texts, image_paths):
+     for i, text in enumerate(texts):
+         if text:
+             text_embedding = get_text_embedding(text)
+             if len(text_embedding) == 384:
+                 collection.add(ids=[f"text_{i}"], embeddings=[text_embedding], documents=[text])
+
+     # Embed each image once, keeping paths and embeddings aligned
+     valid_paths, all_embeddings = [], []
+     for img_path in image_paths:
+         embedding = get_image_embedding(img_path)
+         if embedding is not None:
+             valid_paths.append(img_path)
+             all_embeddings.append(embedding)
+
+     if all_embeddings:
+         all_embeddings = np.array(all_embeddings)
+
+         # Apply PCA only if necessary (defensive: vectors are already 384-dim)
+         if all_embeddings.shape[1] != 384:
+             pca = PCA(n_components=384)
+             all_embeddings = pca.fit_transform(all_embeddings)
+
+         for j, img_path in enumerate(valid_paths):
+             collection.add(ids=[f"image_{j}"], embeddings=[all_embeddings[j].tolist()], documents=[img_path])
+
+     print("Data stored successfully!")
+
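+ # Note: the PCA fallback above can only fit 384 components when there are
+ # at least 384 image embeddings; since get_image_embedding() already
+ # truncates to 384 dims, that branch is effectively never taken.
+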
+ # Process and Store from Files
+ def process_and_store(pdf_path=None, pptx_path=None):
+     texts, images = [], []
+     if pdf_path:
+         pdf_text = extract_text_from_pdf(pdf_path)
+         if pdf_text:
+             texts.append(pdf_text)
+         images.extend(extract_images_from_pdf(pdf_path))
+     if pptx_path:
+         pptx_text = extract_text_from_pptx(pptx_path)
+         if pptx_text:
+             texts.append(pptx_text)
+         images.extend(extract_images_from_pptx(pptx_path))
+     store_data(texts, images)
+
+ # FastAPI Endpoints
+ @app.get("/")
+ def greet_json():
+     # Run Data Processing
+     process_and_store(pdf_path=pdf_file, pptx_path=pptx_file)
+     return {"Document store": "created!"}
+
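+ # Note: every GET / re-runs ingestion with the same fixed IDs (text_0,
+ # image_0, ...), so repeat calls attempt to re-insert existing records.
+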
+ @app.get("/retrieval")
+ def retrieval(query: str):
+     try:
+         query_embedding = get_text_embedding(query)
+         results = collection.query(query_embeddings=[query_embedding], n_results=5)
+
+         # ChromaDB returns distances, where smaller means more similar, so
+         # results are kept only when they fall under this cutoff (adjust as needed)
+         MAX_DISTANCE = 0.7
+
+         # Extract documents and distances (first list = first query)
+         documents = results.get("documents", [[]])[0]
+         distances = results.get("distances", [[]])[0]
+
+         # Keep results whose distance is within the cutoff
+         filtered_results = [
+             doc for doc, score in zip(documents, distances) if score <= MAX_DISTANCE
+         ]
+
+         # Return filtered results or indicate no match found
+         if filtered_results:
+             return {"results": filtered_results}
+         else:
+             return {"results": "No relevant match found in ChromaDB."}
+     except Exception as e:
+         return {"error": str(e)}
+
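+ # Example: GET /retrieval?query=suturing%20techniques returns up to five
+ # stored chunks whose embedding distance to the query is at most 0.7.
+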
+ @app.get("/save_file_dify")
+ def save_file_dify(csv_data: str):
+     # Split into lines, dropping blank ones
+     lines = [line for line in csv_data.split("\n") if line.strip()]
+
+     # Find the max number of columns
+     max_cols = max(line.count(",") + 1 for line in lines)
+
+     # Normalize all rows to the same number of columns
+     fixed_lines = [line + "," * (max_cols - line.count(",") - 1) for line in lines]
+
+     # Reconstruct CSV string
+     fixed_csv_data = "\n".join(fixed_lines)
+
+     # Convert CSV string to DataFrame
+     df = pd.read_csv(StringIO(fixed_csv_data))
+
+     # Save in the Dify dataset and return a download link
+     download_link = get_download_link_dify(df)
+     return download_link
+
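+ # Example: GET /save_file_dify?csv_data=name,qty%0Abolt,4%0Anut
+ # pads the ragged last row to two columns before converting and uploading.
+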
+ def get_download_link_dify(df):
+     # Save the file through the Dify dataset API and return a download link
+
+     # API Configuration
+     BASE_URL = "http://redmindgpt.redmindtechnologies.com:81/v1"
+     DATASET_ID = "084ae979-d101-414b-8854-9bbf5d3a442e"
+     API_KEY = "dataset-feqz5KrqHkFRdWbh2DInt58L"
+
+     # Endpoint URL
+     url = f"{BASE_URL}/datasets/{DATASET_ID}/document/create-by-file"
+     print(url)
+
+     # Headers
+     headers = {
+         "Authorization": f"Bearer {API_KEY}"
+     }
+
+     # Data payload (form data as a plain text string)
+     data_payload = {
+         "data": """
+         {
+             "indexing_technique": "high_quality",
+             "process_rule": {
+                 "rules": {
+                     "pre_processing_rules": [
+                         {"id": "remove_extra_spaces", "enabled": true},
+                         {"id": "remove_urls_emails", "enabled": true}
+                     ],
+                     "segmentation": {
+                         "separator": "###",
+                         "max_tokens": 500
+                     }
+                 },
+                 "mode": "custom"
+             }
+         }
+         """
+     }
+
+     # Convert DataFrame to binary (in-memory)
+     file_buffer = dataframe_to_binary(df)
+     files = {
+         "file": ("output.xlsx", file_buffer, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
+     }
+
+     # Send the POST request and fail loudly on HTTP errors
+     response = requests.post(url, headers=headers, data=data_payload, files=files)
+     print(response)
+     response.raise_for_status()
+     data = response.json()
+     document_id = data['document']['id']
+
+     # Fetch the uploaded file's metadata to get its download URL
+     url = f"{BASE_URL}/datasets/{DATASET_ID}/documents/{document_id}/upload-file"
+     response = requests.get(url, headers=headers)
+     print(response)
+     response.raise_for_status()
+
+     download_url = response.json().get("download_url")
+     download_url = download_url.replace("download/", "")
+     return download_url
+
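+ # The upload is a two-step Dify dataset flow: create-by-file registers the
+ # document, then the upload-file endpoint returns metadata whose
+ # download_url is rewritten here by dropping its "download/" segment.
+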
+ def dataframe_to_binary(df):
+     # Write the DataFrame into an in-memory buffer as an Excel file
+     output = BytesIO()
+     df.to_excel(output, index=False, engine="openpyxl")
+
+     # Move the cursor to the beginning of the stream
+     output.seek(0)
+     return output
+
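+ # To serve this app locally (assumes uvicorn; 7860 is the usual HF Spaces port):
+ #   uvicorn app_copy:app --host 0.0.0.0 --port 7860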