Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -24,10 +24,11 @@ index = faiss.IndexFlatL2(vector_dim) # FAISS index
|
|
24 |
|
25 |
documents = [] # Store extracted text
|
26 |
|
27 |
-
def extract_text_from_pdf(
|
28 |
-
"""Extracts text from PDF"""
|
29 |
-
doc = fitz.open(
|
30 |
text_chunks = [page.get_text("text") for page in doc]
|
|
|
31 |
return text_chunks
|
32 |
|
33 |
def create_vector_db(text_chunks):
|
@@ -96,26 +97,30 @@ def index():
|
|
96 |
"""Serve the HTML page for the user interface"""
|
97 |
return render_template('index.html')
|
98 |
|
99 |
-
UPLOAD_FOLDER = "/tmp/uploaded_files"
|
100 |
-
os.makedirs(UPLOAD_FOLDER, exist_ok=True) # Ensure the folder exists
|
101 |
-
|
102 |
@app.route('/upload_pdf', methods=['POST'])
|
103 |
def upload_pdf():
|
104 |
"""Handle PDF upload"""
|
105 |
if 'pdf' not in request.files:
|
106 |
-
return jsonify({"error": "No file part"}), 400
|
107 |
|
108 |
file = request.files['pdf']
|
109 |
if file.filename == "":
|
110 |
-
return jsonify({"error": "No selected file"}), 400
|
111 |
-
|
112 |
-
pdf_path = os.path.join(UPLOAD_FOLDER, file.filename)
|
113 |
|
114 |
try:
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
119 |
create_vector_db(text_chunks)
|
120 |
|
121 |
return jsonify({"message": "PDF uploaded and indexed successfully!"}), 200
|
|
|
24 |
|
25 |
documents = [] # Store extracted text
|
26 |
|
27 |
+
def extract_text_from_pdf(pdf_stream):
    """Extract the plain text of every page from an in-memory PDF.

    Args:
        pdf_stream: In-memory PDF data, passed unchanged to
            ``fitz.open(stream=..., filetype="pdf")``.

    Returns:
        A list with one string per page, in page order, as produced by
        ``page.get_text("text")``.
    """
    # The context manager closes the document even when text extraction
    # raises part-way through; a trailing doc.close() would be skipped on error.
    with fitz.open(stream=pdf_stream, filetype="pdf") as doc:
        return [page.get_text("text") for page in doc]
|
33 |
|
34 |
def create_vector_db(text_chunks):
|
|
|
97 |
"""Serve the HTML page for the user interface"""
|
98 |
return render_template('index.html')
|
99 |
|
|
|
|
|
|
|
100 |
@app.route('/upload_pdf', methods=['POST'])
|
101 |
def upload_pdf():
|
102 |
"""Handle PDF upload"""
|
103 |
if 'pdf' not in request.files:
|
104 |
+
return jsonify({"error": "No file part"}), 400
|
105 |
|
106 |
file = request.files['pdf']
|
107 |
if file.filename == "":
|
108 |
+
return jsonify({"error": "No selected file"}), 400
|
|
|
|
|
109 |
|
110 |
try:
|
111 |
+
# Read the file directly into memory instead of saving to disk
|
112 |
+
pdf_stream = file.read()
|
113 |
+
|
114 |
+
# Create a BytesIO object to work with the PDF in memory
|
115 |
+
from io import BytesIO
|
116 |
+
pdf_stream = BytesIO(pdf_stream)
|
117 |
+
|
118 |
+
# Use fitz to open the PDF from memory
|
119 |
+
doc = fitz.open(stream=pdf_stream, filetype="pdf")
|
120 |
+
text_chunks = [page.get_text("text") for page in doc]
|
121 |
+
doc.close()
|
122 |
+
|
123 |
+
# Create vector database
|
124 |
create_vector_db(text_chunks)
|
125 |
|
126 |
return jsonify({"message": "PDF uploaded and indexed successfully!"}), 200
|