Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -18,21 +18,6 @@ pptx_file="impalnt 1.pptx"
|
|
18 |
|
19 |
|
20 |
collection = client.get_collection(name="knowledge_base")
|
21 |
-
### Step 8: Process and Store from Files ###
|
22 |
-
def process_and_store(pdf_path=None, pptx_path=None):
|
23 |
-
texts, images = [], []
|
24 |
-
|
25 |
-
if pdf_path:
|
26 |
-
print(f"Processing PDF: {pdf_path}")
|
27 |
-
texts.append(extract_text_from_pdf(pdf_path))
|
28 |
-
images.extend(extract_images_from_pdf(pdf_path))
|
29 |
-
|
30 |
-
if pptx_path:
|
31 |
-
print(f"Processing PPTX: {pptx_path}")
|
32 |
-
texts.append(extract_text_from_pptx(pptx_path))
|
33 |
-
images.extend(extract_images_from_pptx(pptx_path))
|
34 |
-
|
35 |
-
store_data(texts, images)
|
36 |
|
37 |
# Initialize models
|
38 |
text_model = SentenceTransformer('all-MiniLM-L6-v2')
|
@@ -42,28 +27,6 @@ clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
|
|
42 |
# Folder for extracted images
|
43 |
IMAGE_FOLDER = "/data/extracted_images"
|
44 |
os.makedirs(IMAGE_FOLDER, exist_ok=True)
|
45 |
-
|
46 |
-
process_and_store(pdf_path=pdf_file, pptx_path=pptx_file)
|
47 |
-
@app.get("/")
|
48 |
-
def greet_json():
|
49 |
-
|
50 |
-
return {"Hello": "World!"}
|
51 |
-
|
52 |
-
@app.get("/test")
|
53 |
-
def greet_json():
|
54 |
-
return {"Hello": "Redmind!"}
|
55 |
-
|
56 |
-
@app.get("/search/")
|
57 |
-
def search(query: str):
|
58 |
-
query_embedding = get_text_embedding(query)
|
59 |
-
results = collection.query(
|
60 |
-
query_embeddings=[query_embedding],
|
61 |
-
n_results=5
|
62 |
-
)
|
63 |
-
return {"results": results["documents"]}
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
### Step 1: Extract Text from PDF ###
|
68 |
def extract_text_from_pdf(pdf_path):
|
69 |
text = ""
|
@@ -146,4 +109,42 @@ def store_data(texts, image_paths):
|
|
146 |
|
147 |
print("Data stored successfully!")
|
148 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
149 |
|
|
|
18 |
|
19 |
|
20 |
collection = client.get_collection(name="knowledge_base")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
|
22 |
# Initialize models
|
23 |
text_model = SentenceTransformer('all-MiniLM-L6-v2')
|
|
|
27 |
# Folder for extracted images
|
28 |
IMAGE_FOLDER = "/data/extracted_images"
|
29 |
os.makedirs(IMAGE_FOLDER, exist_ok=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
### Step 1: Extract Text from PDF ###
|
31 |
def extract_text_from_pdf(pdf_path):
|
32 |
text = ""
|
|
|
109 |
|
110 |
print("Data stored successfully!")
|
111 |
|
112 |
+
### Step 8: Process and Store from Files ###
|
113 |
+
def process_and_store(pdf_path=None, pptx_path=None):
|
114 |
+
texts, images = [], []
|
115 |
+
|
116 |
+
if pdf_path:
|
117 |
+
print(f"Processing PDF: {pdf_path}")
|
118 |
+
texts.append(extract_text_from_pdf(pdf_path))
|
119 |
+
images.extend(extract_images_from_pdf(pdf_path))
|
120 |
+
|
121 |
+
if pptx_path:
|
122 |
+
print(f"Processing PPTX: {pptx_path}")
|
123 |
+
texts.append(extract_text_from_pptx(pptx_path))
|
124 |
+
images.extend(extract_images_from_pptx(pptx_path))
|
125 |
+
|
126 |
+
store_data(texts, images)
|
127 |
+
|
128 |
+
|
129 |
+
|
130 |
+
process_and_store(pdf_path=pdf_file, pptx_path=pptx_file)
|
131 |
+
@app.get("/")
|
132 |
+
def greet_json():
|
133 |
+
|
134 |
+
return {"Hello": "World!"}
|
135 |
+
|
136 |
+
@app.get("/test")
|
137 |
+
def greet_json():
|
138 |
+
return {"Hello": "Redmind!"}
|
139 |
+
|
140 |
+
@app.get("/search/")
|
141 |
+
def search(query: str):
|
142 |
+
query_embedding = get_text_embedding(query)
|
143 |
+
results = collection.query(
|
144 |
+
query_embeddings=[query_embedding],
|
145 |
+
n_results=5
|
146 |
+
)
|
147 |
+
return {"results": results["documents"]}
|
148 |
+
|
149 |
+
|
150 |
|