Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -7,6 +7,7 @@ load_dotenv()
|
|
7 |
|
8 |
from flask import Flask, jsonify, render_template, request
|
9 |
import requests, json
|
|
|
10 |
|
11 |
# import nltk
|
12 |
# nltk.download("punkt")
|
@@ -100,6 +101,27 @@ def clearKBUploadDirectory(uploads_dir):
|
|
100 |
except Exception as e:
|
101 |
print('Failed to delete %s. Reason: %s' % (file_path, e))
|
102 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
103 |
|
104 |
def loadKB(fileprovided, urlProvided, uploads_dir, request):
|
105 |
documents = []
|
|
|
7 |
|
8 |
from flask import Flask, jsonify, render_template, request
|
9 |
import requests, json
|
10 |
+
import PyPDF2
|
11 |
|
12 |
# import nltk
|
13 |
# nltk.download("punkt")
|
|
|
101 |
except Exception as e:
|
102 |
print('Failed to delete %s. Reason: %s' % (file_path, e))
|
103 |
|
104 |
+
def PDFChunkerWithSeparator(filepath, separator):
    """Split the text of a PDF into ``Document`` chunks on a separator.

    The text of every page is extracted with PyPDF2 and concatenated in page
    order with nothing inserted between pages, then split on ``separator``.
    Each resulting piece becomes one ``Document`` whose metadata records
    ``filepath`` under the ``"source"`` key.

    Args:
        filepath: Location of the PDF; passed to ``PyPDF2.PdfReader`` and
            stored in every chunk's metadata.
        separator: String marking chunk boundaries; handed to ``str.split``.

    Returns:
        A list of ``Document`` objects, one per piece, in document order.

    Raises:
        ValueError: If ``separator`` is an empty string (raised by
            ``str.split``).
    """
    reader = PyPDF2.PdfReader(filepath)

    # Debug aid: report how many pages the PDF contains.
    print(len(reader.pages))

    # Build the full text with a single join rather than growing a string
    # page by page; a page that yields no text contributes an empty string
    # instead of breaking the concatenation.
    content = "".join(page.extract_text() or "" for page in reader.pages)

    doclist = [
        Document(page_content=chunk, metadata={"source": filepath})
        for chunk in content.split(separator)
    ]

    # Debug aid: show the third chunk from the end as a sample.
    # NOTE(review): the listing this was recovered from had lost its
    # indentation; this check is assumed to run once after all chunks are
    # built rather than on every iteration - confirm.
    if len(doclist) > 3:
        print(doclist[-3])
    return doclist
|
124 |
+
|
125 |
|
126 |
def loadKB(fileprovided, urlProvided, uploads_dir, request):
|
127 |
documents = []
|