cogcorp committed on
Commit
3aae288
·
1 Parent(s): 9df8c97

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -0
app.py CHANGED
@@ -9,6 +9,7 @@ import shutil
9
  import os
10
  from sklearn.neighbors import NearestNeighbors
11
  from tempfile import NamedTemporaryFile
 
12
 
13
  openAI_key = os.environ['OpenAPI']
14
 
@@ -47,6 +48,19 @@ class SemanticSearch:
47
  embeddings = np.vstack(embeddings)
48
  return embeddings
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  def unique_filename(basename):
51
  # Append a unique ID to the end of the filename, before the extension
52
  base, ext = os.path.splitext(basename)
 
9
  import os
10
  from sklearn.neighbors import NearestNeighbors
11
  from tempfile import NamedTemporaryFile
12
+ from PyPDF2 import PdfFileReader
13
 
14
  openAI_key = os.environ['OpenAPI']
15
 
 
48
  embeddings = np.vstack(embeddings)
49
  return embeddings
50
 
51
def pdf_to_text(pdf_path, start_page=1):
    """Extract the text of a PDF file, beginning at a given page.

    Args:
        pdf_path: Path of the PDF file to read.
        start_page: 1-based number of the first page to include. The default
            of 1 reads the whole document; values below 1 are treated as 1.

    Returns:
        The extracted text of every page from ``start_page`` through the last
        page, concatenated in page order with no separator. An empty string
        is returned when ``start_page`` lies past the end of the document.
    """
    # PyPDF2 addresses pages by 0-based index, so page N lives at index N - 1.
    # Passing start_page straight through would silently drop the first page.
    first_index = max(start_page - 1, 0)

    # The context manager closes the file even if text extraction raises.
    with open(pdf_path, "rb") as pdf_file:
        pdf = PdfFileReader(pdf_file)
        return "".join(
            pdf.getPage(page_index).extractText()
            for page_index in range(first_index, pdf.getNumPages())
        )
57
+
58
def text_to_chunks(text, start_page=1, chunk_size=512):
    """Split ``text`` into consecutive, non-overlapping pieces.

    Args:
        text: The string to split.
        start_page: Accepted but not used by this function.
        chunk_size: Maximum number of characters per piece.

    Returns:
        A list of substrings of ``text`` in their original order. Every piece
        has ``chunk_size`` characters except possibly the last, which holds
        whatever remains. An empty ``text`` yields an empty list.
    """
    pieces = []
    for offset in range(0, len(text), chunk_size):
        # Slicing past the end of the string is safe: the final piece is
        # simply shorter than chunk_size.
        pieces.append(text[offset:offset + chunk_size])
    return pieces
61
+
62
+
63
+
64
  def unique_filename(basename):
65
  # Append a unique ID to the end of the filename, before the extension
66
  base, ext = os.path.splitext(basename)