demoPOC committed on
Commit
02a16b9
·
1 Parent(s): b4c9fc0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -0
app.py CHANGED
@@ -7,6 +7,7 @@ load_dotenv()
7
 
8
  from flask import Flask, jsonify, render_template, request
9
  import requests, json
 
10
 
11
  # import nltk
12
  # nltk.download("punkt")
@@ -100,6 +101,27 @@ def clearKBUploadDirectory(uploads_dir):
100
  except Exception as e:
101
  print('Failed to delete %s. Reason: %s' % (file_path, e))
102
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
 
104
  def loadKB(fileprovided, urlProvided, uploads_dir, request):
105
  documents = []
 
7
 
8
  from flask import Flask, jsonify, render_template, request
9
  import requests, json
10
+ import PyPDF2
11
 
12
  # import nltk
13
  # nltk.download("punkt")
 
101
  except Exception as e:
102
  print('Failed to delete %s. Reason: %s' % (file_path, e))
103
 
104
def PDFChunkerWithSeparator(filepath, separator):
    """Split the extracted text of a PDF into ``Document`` chunks.

    The text of every page is extracted with PyPDF2, concatenated in page
    order, and split on ``separator``. Each resulting piece is wrapped in a
    ``Document`` whose ``page_content`` is the piece and whose ``metadata``
    records ``filepath`` under the ``"source"`` key.

    Args:
        filepath: The PDF to read; passed unchanged to ``PyPDF2.PdfReader``
            and stored as the ``"source"`` metadata of every chunk.
        separator: String on which the concatenated text is split. An empty
            string yields a single chunk holding the whole text, because
            ``str.split`` rejects an empty separator with ``ValueError``.

    Returns:
        A list of ``Document`` objects, one per piece, in document order.
        Empty pieces (e.g. from adjacent separators) are kept.
    """
    reader = PyPDF2.PdfReader(filepath)
    pages = reader.pages

    # Diagnostic output: number of pages in the PDF file.
    print(len(pages))

    # Join once instead of repeated ``+=``; ``or ""`` guards against a page
    # whose extraction yields no text object (NOTE(review): confirm whether
    # the installed PyPDF2 version can return None here).
    content = "".join(page.extract_text() or "" for page in pages)

    if separator == "":
        # str.split("") raises ValueError, so treat "no separator" as
        # "do not split".
        pieces = [content]
    else:
        pieces = content.split(separator)

    doclist = [
        Document(page_content=piece, metadata={"source": filepath})
        for piece in pieces
    ]

    # Diagnostic output: show a sample chunk (third from the end) when
    # there are enough chunks for one to exist.
    if len(doclist) > 3:
        print(doclist[-3])
    return doclist
124
+
125
 
126
  def loadKB(fileprovided, urlProvided, uploads_dir, request):
127
  documents = []