import os

import chromadb
import fitz  # PyMuPDF
import torch
from fastapi import FastAPI
from PIL import Image
from pptx import Presentation
from sentence_transformers import SentenceTransformer
from transformers import CLIPModel, CLIPProcessor

app = FastAPI()

# get_or_create_collection avoids a crash on a fresh start, before anything
# has been ingested. Text and image vectors get separate collections because
# Chroma fixes a collection's embedding dimension on first insert, and the
# 384-dim text vectors cannot share one with the 512-dim CLIP vectors.
client = chromadb.PersistentClient(path="./chroma_db")
collection = client.get_or_create_collection(name="knowledge_base")
image_collection = client.get_or_create_collection(name="image_knowledge_base")

@app.get("/")
def greet_json():
    return {"Hello": "World!"}

@app.get("/test")
def greet_test():  # renamed; a second greet_json would shadow the first
    return {"Hello": "Redmind!"}

@app.get("/search/")
def search(query: str):
    query_embedding = get_text_embedding(query)
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=5
    )
    return {"results": results["documents"]}
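
# Example request (a sketch): GET /search/?query=reset%20procedure returns the
# five stored text chunks nearest to the query embedding. Only the text
# collection is searched here; images are stored separately below.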

# Extract the plain text of every page in a PDF (PyMuPDF's fitz API).
def extract_text_from_pdf(pdf_path):
    text = ""
    doc = fitz.open(pdf_path)
    for page in doc:
        text += page.get_text() + "\n"
    return text

# Extract text from every shape on every slide of a PowerPoint deck.
def extract_text_from_pptx(pptx_path):
    text = ""
    prs = Presentation(pptx_path)
    for slide in prs.slides:
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                text += shape.text + "\n"
    return text

# Save every image embedded in a PDF into output_folder.
def extract_images_from_pdf(pdf_path, output_folder):
    doc = fitz.open(pdf_path)
    os.makedirs(output_folder, exist_ok=True)
    for i, page in enumerate(doc):
        for img_index, img in enumerate(page.get_images(full=True)):
            xref = img[0]
            image = doc.extract_image(xref)
            img_bytes = image["image"]
            img_ext = image["ext"]
            with open(f"{output_folder}/image_{i}_{img_index}.{img_ext}", "wb") as f:
                f.write(img_bytes)

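# Save every picture shape in a PPTX into output_folder.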
def extract_images_from_pptx(pptx_path, output_folder):
    os.makedirs(output_folder, exist_ok=True)
    prs = Presentation(pptx_path)
    for i, slide in enumerate(prs.slides):
        for shape in slide.shapes:
            if shape.shape_type == 13:  # Picture shape type
                image = shape.image
                img_bytes = image.blob
                img_ext = image.ext
                with open(f"{output_folder}/image_{i}.{img_ext}", "wb") as f:
                    f.write(img_bytes)

# Text embedding model: all-MiniLM-L6-v2 produces 384-dim sentence vectors.
model = SentenceTransformer('all-MiniLM-L6-v2')

def get_text_embedding(text):
    return model.encode(text).tolist()

# CLIP model for image embeddings (512-dim vectors).
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def get_image_embedding(image_path):
    image = Image.open(image_path)
    inputs = processor(images=image, return_tensors="pt")
    with torch.no_grad():
        embedding = clip_model.get_image_features(**inputs)
    return embedding.squeeze().tolist()

def store_data(texts, images):
    # Text chunks go into the 384-dim text collection.
    for i, text in enumerate(texts):
        text_embedding = get_text_embedding(text)
        collection.add(ids=[f"text_{i}"], embeddings=[text_embedding], documents=[text])

    # Image paths go into their own collection: CLIP vectors are 512-dim and
    # would be rejected by a collection already holding 384-dim text vectors.
    for j, image in enumerate(images):
        image_embedding = get_image_embedding(image)
        image_collection.add(ids=[f"image_{j}"], embeddings=[image_embedding], documents=[image])
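
# Example one-time ingestion (a sketch: the ./docs paths below are
# hypothetical, and this assumes the file is saved as app.py; afterwards the
# API is served with `uvicorn app:app`).
if __name__ == "__main__":
    texts = [
        extract_text_from_pdf("docs/manual.pdf"),
        extract_text_from_pptx("docs/overview.pptx"),
    ]
    extract_images_from_pdf("docs/manual.pdf", "extracted_images")
    extract_images_from_pptx("docs/overview.pptx", "extracted_images")
    image_paths = [
        os.path.join("extracted_images", name)
        for name in sorted(os.listdir("extracted_images"))
    ]
    store_data(texts, image_paths)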