Redmind committed
Commit b2cfabe · verified · 1 Parent(s): dae15f6

Update app.py

Files changed (1)
  1. app.py +100 -2
app.py CHANGED
@@ -1,11 +1,109 @@
+import os
+
+import chromadb
+import fitz  # PyMuPDF
+import torch
 from fastapi import FastAPI
+from PIL import Image
+from pptx import Presentation
+from sentence_transformers import SentenceTransformer
+from transformers import CLIPProcessor, CLIPModel
 
 app = FastAPI()
-
+
+# Persistent vector store shared by the ingestion helpers and the search endpoint.
+client = chromadb.PersistentClient(path="./chroma_db")
+collection = client.get_or_create_collection(name="knowledge_base")
+
 @app.get("/")
 def greet_json():
     return {"Hello": "World!"}
 
 @app.get("/test")
 def greet_json():
-    return {"Hello": "Redmind!"}
+    return {"Hello": "Redmind!"}
+
+@app.get("/search/")
+def search(query: str):
+    # Embed the query with the same MiniLM model used at ingestion time,
+    # then ask Chroma for the five nearest stored documents.
+    query_embedding = get_text_embedding(query)
+    results = collection.query(
+        query_embeddings=[query_embedding],
+        n_results=5
+    )
+    return {"results": results["documents"]}
+
+def extract_text_from_pdf(pdf_path):
+    # Concatenate the text of every page in the PDF (PyMuPDF).
+    text = ""
+    doc = fitz.open(pdf_path)
+    for page in doc:
+        text += page.get_text() + "\n"
+    return text
+
+def extract_text_from_pptx(pptx_path):
+    # Collect the text of every shape on every slide.
+    text = ""
+    prs = Presentation(pptx_path)
+    for slide in prs.slides:
+        for shape in slide.shapes:
+            if hasattr(shape, "text"):
+                text += shape.text + "\n"
+    return text
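Both extractors return one flat string per file. A short usage sketch (the file paths are hypothetical), including the paragraph-level chunking that retrieval setups like this one typically apply before embedding:

pdf_text = extract_text_from_pdf("docs/report.pdf")
pptx_text = extract_text_from_pptx("docs/overview.pptx")

# Embedding a whole document in one vector dilutes retrieval quality,
# so one common option is to split on blank lines into paragraph-sized chunks.
chunks = [p.strip() for p in pdf_text.split("\n\n") if p.strip()]
print(f"{len(chunks)} chunks from {len(pdf_text)} characters")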
+
+def extract_images_from_pdf(pdf_path, output_folder):
+    # Write every embedded image in the PDF to output_folder.
+    doc = fitz.open(pdf_path)
+    os.makedirs(output_folder, exist_ok=True)
+    for i, page in enumerate(doc):
+        for img_index, img in enumerate(page.get_images(full=True)):
+            xref = img[0]
+            image = doc.extract_image(xref)
+            img_bytes = image["image"]
+            img_ext = image["ext"]
+            with open(f"{output_folder}/image_{i}_{img_index}.{img_ext}", "wb") as f:
+                f.write(img_bytes)
+
+def extract_images_from_pptx(pptx_path, output_folder):
+    # Write every picture shape in the deck to output_folder.
+    os.makedirs(output_folder, exist_ok=True)
+    prs = Presentation(pptx_path)
+    for i, slide in enumerate(prs.slides):
+        for img_index, shape in enumerate(slide.shapes):
+            if shape.shape_type == 13:  # 13 = picture shape type
+                image = shape.image
+                img_bytes = image.blob
+                img_ext = image.ext
+                with open(f"{output_folder}/image_{i}_{img_index}.{img_ext}", "wb") as f:
+                    f.write(img_bytes)
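Both image extractors write files rather than returning data, so ingestion needs an output folder per source document; python-pptx also exposes the magic number 13 as the named constant MSO_SHAPE_TYPE.PICTURE in pptx.enum.shapes. A usage sketch with hypothetical paths:

extract_images_from_pdf("docs/report.pdf", "extracted/report_images")
extract_images_from_pptx("docs/overview.pptx", "extracted/overview_images")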
+
+model = SentenceTransformer('all-MiniLM-L6-v2')
+
+def get_text_embedding(text):
+    # 384-dimensional sentence embedding, converted to a plain list for Chroma.
+    return model.encode(text).tolist()
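To make the vector semantics concrete: all-MiniLM-L6-v2 maps any string to a 384-dimensional vector, and nearby vectors mean related sentences. A self-contained sketch using the cos_sim helper that ships with sentence_transformers (the example sentences are made up):

from sentence_transformers import SentenceTransformer, util

m = SentenceTransformer('all-MiniLM-L6-v2')
a = m.encode("How do I reset my password?")
b = m.encode("Steps for recovering account access")
c = m.encode("Grilled cheese sandwich recipe")
print(util.cos_sim(a, b))  # noticeably higher than...
print(util.cos_sim(a, c))  # ...this unrelated pair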
+
+clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+def get_image_embedding(image_path):
+    # 512-dimensional CLIP image embedding, converted to a plain list for Chroma.
+    image = Image.open(image_path)
+    inputs = processor(images=image, return_tensors="pt")
+    with torch.no_grad():
+        embedding = clip_model.get_image_features(**inputs)
+    return embedding.squeeze().tolist()
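CLIP also has a text encoder in the same embedding space, which is what would let a plain-text query retrieve stored images. The commit does not include such a helper; a sketch of what it could look like, reusing the clip_model and processor loaded above:

def get_clip_text_embedding(text):
    # Hypothetical helper, not part of this commit: embeds a text query
    # into CLIP's joint text/image space (512-d for this checkpoint).
    inputs = processor(text=[text], return_tensors="pt", padding=True)
    with torch.no_grad():
        embedding = clip_model.get_text_features(**inputs)
    return embedding.squeeze().tolist()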
+
+def store_data(texts, images):
+    # Index extracted texts and image paths in the shared collection.
+    # NOTE: MiniLM text vectors are 384-d and CLIP image vectors are 512-d;
+    # a Chroma collection enforces a single dimensionality, so adding both
+    # modalities to this one collection will fail on the second modality.
+    for i, text in enumerate(texts):
+        text_embedding = get_text_embedding(text)
+        collection.add(ids=[f"text_{i}"], embeddings=[text_embedding], documents=[text])
+
+    for j, image in enumerate(images):
+        image_embedding = get_image_embedding(image)
+        collection.add(ids=[f"image_{j}"], embeddings=[image_embedding], documents=[image])
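Given the dimensionality note in store_data, one workable variant keeps image vectors in their own collection and wires the extractors, embedders, and store together. Everything below is an illustrative sketch built on the commit's helpers; the second collection name and the ID scheme are assumptions:

import glob

# Separate collection so 512-d CLIP vectors never mix with 384-d text vectors.
image_collection = client.get_or_create_collection(name="knowledge_base_images")

def ingest_pdf(pdf_path, image_folder):
    # Text side: extract, chunk, embed, index.
    text = extract_text_from_pdf(pdf_path)
    chunks = [p.strip() for p in text.split("\n\n") if p.strip()]
    for i, chunk in enumerate(chunks):
        collection.add(ids=[f"{pdf_path}:text:{i}"],
                       embeddings=[get_text_embedding(chunk)],
                       documents=[chunk])

    # Image side: dump to disk, embed each file, index the path as the document.
    extract_images_from_pdf(pdf_path, image_folder)
    for j, path in enumerate(sorted(glob.glob(f"{image_folder}/*"))):
        image_collection.add(ids=[f"{pdf_path}:image:{j}"],
                             embeddings=[get_image_embedding(path)],
                             documents=[path])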