Redmind committed on
Commit
4314dbc
·
verified ·
1 Parent(s): 6597a2e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -37
app.py CHANGED
@@ -18,21 +18,6 @@ pptx_file="impalnt 1.pptx"
18
 
19
 
20
  collection = client.get_collection(name="knowledge_base")
21
- ### Step 8: Process and Store from Files ###
22
- def process_and_store(pdf_path=None, pptx_path=None):
23
- texts, images = [], []
24
-
25
- if pdf_path:
26
- print(f"Processing PDF: {pdf_path}")
27
- texts.append(extract_text_from_pdf(pdf_path))
28
- images.extend(extract_images_from_pdf(pdf_path))
29
-
30
- if pptx_path:
31
- print(f"Processing PPTX: {pptx_path}")
32
- texts.append(extract_text_from_pptx(pptx_path))
33
- images.extend(extract_images_from_pptx(pptx_path))
34
-
35
- store_data(texts, images)
36
 
37
  # Initialize models
38
  text_model = SentenceTransformer('all-MiniLM-L6-v2')
@@ -42,28 +27,6 @@ clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
42
  # Folder for extracted images
43
  IMAGE_FOLDER = "/data/extracted_images"
44
  os.makedirs(IMAGE_FOLDER, exist_ok=True)
45
-
46
- process_and_store(pdf_path=pdf_file, pptx_path=pptx_file)
47
- @app.get("/")
48
- def greet_json():
49
-
50
- return {"Hello": "World!"}
51
-
52
- @app.get("/test")
53
- def greet_json():
54
- return {"Hello": "Redmind!"}
55
-
56
- @app.get("/search/")
57
- def search(query: str):
58
- query_embedding = get_text_embedding(query)
59
- results = collection.query(
60
- query_embeddings=[query_embedding],
61
- n_results=5
62
- )
63
- return {"results": results["documents"]}
64
-
65
-
66
-
67
  ### Step 1: Extract Text from PDF ###
68
  def extract_text_from_pdf(pdf_path):
69
  text = ""
@@ -146,4 +109,42 @@ def store_data(texts, image_paths):
146
 
147
  print("Data stored successfully!")
148
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
 
 
18
 
19
 
20
  collection = client.get_collection(name="knowledge_base")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
  # Initialize models
23
  text_model = SentenceTransformer('all-MiniLM-L6-v2')
 
27
  # Folder for extracted images
28
  IMAGE_FOLDER = "/data/extracted_images"
29
  os.makedirs(IMAGE_FOLDER, exist_ok=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  ### Step 1: Extract Text from PDF ###
31
  def extract_text_from_pdf(pdf_path):
32
  text = ""
 
109
 
110
  print("Data stored successfully!")
111
 
112
+ ### Step 8: Process and Store from Files ###
113
+ def process_and_store(pdf_path=None, pptx_path=None):
114
+ texts, images = [], []
115
+
116
+ if pdf_path:
117
+ print(f"Processing PDF: {pdf_path}")
118
+ texts.append(extract_text_from_pdf(pdf_path))
119
+ images.extend(extract_images_from_pdf(pdf_path))
120
+
121
+ if pptx_path:
122
+ print(f"Processing PPTX: {pptx_path}")
123
+ texts.append(extract_text_from_pptx(pptx_path))
124
+ images.extend(extract_images_from_pptx(pptx_path))
125
+
126
+ store_data(texts, images)
127
+
128
+
129
+
130
+ process_and_store(pdf_path=pdf_file, pptx_path=pptx_file)
131
+ @app.get("/")
132
+ def greet_json():
133
+
134
+ return {"Hello": "World!"}
135
+
136
+ @app.get("/test")
137
+ def greet_json():
138
+ return {"Hello": "Redmind!"}
139
+
140
+ @app.get("/search/")
141
+ def search(query: str):
142
+ query_embedding = get_text_embedding(query)
143
+ results = collection.query(
144
+ query_embeddings=[query_embedding],
145
+ n_results=5
146
+ )
147
+ return {"results": results["documents"]}
148
+
149
+
150