Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -9,6 +9,7 @@ import shutil
|
|
9 |
import os
|
10 |
from sklearn.neighbors import NearestNeighbors
|
11 |
from tempfile import NamedTemporaryFile
|
|
|
12 |
|
13 |
# API key read from the 'OpenAPI' environment variable (presumably the OpenAI
# key, going by the variable name -- confirm). Subscripting os.environ raises
# KeyError at import time when the variable is not set.
openAI_key = os.environ['OpenAPI']
|
14 |
|
@@ -47,6 +48,19 @@ class SemanticSearch:
|
|
47 |
embeddings = np.vstack(embeddings)
|
48 |
return embeddings
|
49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
def unique_filename(basename):
|
51 |
# Append a unique ID to the end of the filename, before the extension
|
52 |
base, ext = os.path.splitext(basename)
|
|
|
9 |
import os
|
10 |
from sklearn.neighbors import NearestNeighbors
|
11 |
from tempfile import NamedTemporaryFile
|
12 |
+
from PyPDF2 import PdfFileReader
|
13 |
|
14 |
# API key read from the 'OpenAPI' environment variable (presumably the OpenAI
# key, going by the variable name -- confirm). Subscripting os.environ raises
# KeyError at import time when the variable is not set.
openAI_key = os.environ['OpenAPI']
|
15 |
|
|
|
48 |
embeddings = np.vstack(embeddings)
|
49 |
return embeddings
|
50 |
|
51 |
+
def pdf_to_text(pdf_path, start_page=1):
    """Extract the text of a PDF from ``start_page`` through the last page.

    Args:
        pdf_path: Path of the PDF file to read.
        start_page: 1-based number of the first page to extract. Values
            below 1 are treated as 1.

    Returns:
        The extracted text of the selected pages, concatenated in page
        order. An empty string when ``start_page`` is past the last page.
    """
    # PyPDF2 page indices are 0-based, so shift the 1-based page number.
    # Without this shift the default start_page=1 silently skipped page one.
    first_index = max(start_page - 1, 0)
    # The context manager closes the file handle even if parsing raises.
    with open(pdf_path, "rb") as pdf_file:
        pdf = PdfFileReader(pdf_file)
        # Extract while the file is still open: pages are read lazily.
        return "".join(
            pdf.getPage(page_num).extractText()
            for page_num in range(first_index, pdf.getNumPages())
        )
|
57 |
+
|
58 |
+
def text_to_chunks(text, start_page=1, chunk_size=512):
    """Split ``text`` into consecutive pieces of at most ``chunk_size`` characters.

    Args:
        text: The string to split.
        start_page: Accepted for interface compatibility; not used here.
        chunk_size: Maximum number of characters per piece.

    Returns:
        A list of substrings in original order. Every piece has exactly
        ``chunk_size`` characters except possibly the last, which holds the
        remainder. Empty input yields an empty list.
    """
    pieces = []
    for offset in range(0, len(text), chunk_size):
        pieces.append(text[offset:offset + chunk_size])
    return pieces
|
61 |
+
|
62 |
+
|
63 |
+
|
64 |
def unique_filename(basename):
|
65 |
# Append a unique ID to the end of the filename, before the extension
|
66 |
base, ext = os.path.splitext(basename)
|