Pamudu13 committed on
Commit f1a04ea · verified · 1 Parent(s): 61467ea

Update app.py

Files changed (1)
  1. app.py +273 -76
app.py CHANGED
@@ -1,95 +1,292 @@
- from flask import Flask, jsonify, request, Response, stream_with_context
- from flask_cors import CORS
- import requests
- from bs4 import BeautifulSoup
+ from flask import Flask, request, jsonify
+ from langchain_community.vectorstores import FAISS
+ from langchain_community.document_loaders import PyPDFLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain_community.llms import HuggingFaceEndpoint
+ from langchain.chains import ConversationalRetrievalChain
+ from langchain.memory import ConversationBufferMemory
  import os
- import re
- import urllib.parse
- import time
- import random
+ from dotenv import load_dotenv
+ from flask_cors import CORS
  import base64
- from io import BytesIO
- from googlesearch import search
- import logging
- import queue
- from huggingface_hub import HfApi
+ import tempfile
+ import io
+ from pathlib import Path

+ # Load environment variables
+ load_dotenv()

  app = Flask(__name__)
- # Enable CORS with specific settings
- CORS(app, resources={
-     r"/*": {
-         "origins": "*",
-         "methods": ["GET", "POST", "OPTIONS"],
-         "allow_headers": ["Content-Type", "Authorization"]
-     }
- })
-
-
- HF_TOKEN = os.getenv("HF_TOKEN") # Make sure you set the HF_TOKEN in your environment
-
- @app.route('/restart_space', methods=['POST'])
- def api_restart_space():
-     """API route to restart a Hugging Face Space."""
-     space_id = 'Pamudu13/web-scraper'
-     factory_reboot = request.json.get('factory_reboot', False) # Optional: Set to True if you want a factory reboot
-
-     if not space_id:
-         return jsonify({'error': 'space_id parameter is required'}), 400
-
+ CORS(app)
+
+ # Increase maximum content length to 32MB
+ app.config['MAX_CONTENT_LENGTH'] = 32 * 1024 * 1024
+
+ # Global variables
+ qa_chain = None
+ vector_db = None
+ api_token = os.getenv("HF_TOKEN")
+ pdf_chunks = {}
+ app.config['UPLOAD_FOLDER'] = 'temp_uploads'
+
+ # Create upload folder if it doesn't exist
+ Path(app.config['UPLOAD_FOLDER']).mkdir(parents=True, exist_ok=True)
+
+ # Available LLM models
+ LLM_MODELS = {
+     "llama": "meta-llama/Meta-Llama-3-8B-Instruct",
+     "mistral": "mistralai/Mistral-7B-Instruct-v0.2"
+ }
+
+ # Add these global variables
+ current_upload = {
+     'filename': None,
+     'chunks': [],
+     'filesize': 0
+ }
+
+ def load_doc(file_paths):
+     """Load and split multiple PDF documents"""
+     loaders = [PyPDFLoader(path) for path in file_paths]
+     pages = []
+     for loader in loaders:
+         pages.extend(loader.load())
+
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=1024,
+         chunk_overlap=64
+     )
+     doc_splits = text_splitter.split_documents(pages)
+     return doc_splits
+
+ def create_db(splits):
+     """Create vector database from document splits"""
+     embeddings = HuggingFaceEmbeddings()
+     vectordb = FAISS.from_documents(splits, embeddings)
+     return vectordb
+
+ def initialize_llmchain(llm_model, temperature, max_tokens, top_k, vector_db):
+     """Initialize the LLM chain"""
+     llm = HuggingFaceEndpoint(
+         repo_id=llm_model,
+         huggingfacehub_api_token=api_token,
+         temperature=temperature,
+         max_new_tokens=max_tokens,
+         top_k=top_k,
+     )
+
+     memory = ConversationBufferMemory(
+         memory_key="chat_history",
+         output_key='answer',
+         return_messages=True
+     )
+
+     retriever = vector_db.as_retriever()
+     qa_chain = ConversationalRetrievalChain.from_llm(
+         llm,
+         retriever=retriever,
+         chain_type="stuff",
+         memory=memory,
+         return_source_documents=True,
+         verbose=False,
+     )
+     return qa_chain
+
+ def format_chat_history(message, chat_history):
+     """Format chat history for the LLM"""
+     formatted_chat_history = []
+     for user_message, bot_message in chat_history:
+         formatted_chat_history.append(f"User: {user_message}")
+         formatted_chat_history.append(f"Assistant: {bot_message}")
+     return formatted_chat_history
+
+ @app.route('/upload', methods=['POST'])
+ def upload_pdf():
+     """Handle PDF upload and database initialization"""
+     global vector_db
+
+     if 'pdf_base64' not in request.json:
+         return jsonify({'error': 'No PDF data provided'}), 400
+
      try:
-         hfapi = HfApi()
-
-         # Call the restart_space method
-         res = hfapi.restart_space(
-             space_id,
-             token=HF_TOKEN,
-             factory_reboot=factory_reboot
+         # Get base64 PDF and filename
+         pdf_base64 = request.json['pdf_base64']
+         filename = request.json.get('filename', 'uploaded.pdf')
+
+         # Create temp directory if it doesn't exist
+         os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
+         temp_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
+
+         try:
+             # Decode and save PDF
+             pdf_data = base64.b64decode(pdf_base64)
+             with open(temp_path, 'wb') as f:
+                 f.write(pdf_data)
+
+             # Process document
+             doc_splits = load_doc([temp_path])
+             vector_db = create_db(doc_splits)
+
+             return jsonify({'message': 'PDF processed successfully'}), 200
+         finally:
+             # Clean up
+             if os.path.exists(temp_path):
+                 os.remove(temp_path)
+
+     except Exception as e:
+         return jsonify({'error': str(e)}), 500
+
+ @app.route('/initialize-llm', methods=['POST'])
+ def init_llm():
+     """Initialize the LLM with parameters"""
+     global qa_chain, vector_db
+
+     if vector_db is None:
+         return jsonify({'error': 'Please upload PDFs first'}), 400
+
+     data = request.json
+     model_name = data.get('model', 'llama') # default to llama
+     temperature = data.get('temperature', 0.5)
+     max_tokens = data.get('max_tokens', 4096)
+     top_k = data.get('top_k', 3)
+
+     if model_name not in LLM_MODELS:
+         return jsonify({'error': 'Invalid model name'}), 400
+
+     try:
+         qa_chain = initialize_llmchain(
+             LLM_MODELS[model_name],
+             temperature,
+             max_tokens,
+             top_k,
+             vector_db
          )
-
-         return jsonify({
-             'success': True,
-             'message': f"Successfully restarted Space: {space_id}",
-             'response': res
-         }), 200
-
+         return jsonify({'message': 'LLM initialized successfully'}), 200
+     except Exception as e:
+         return jsonify({'error': str(e)}), 500
+
+ @app.route('/chat', methods=['POST'])
+ def chat():
+     """Handle chat interactions"""
+     global qa_chain
+
+     if qa_chain is None:
+         return jsonify({'error': 'LLM not initialized'}), 400
+
+     data = request.json
+     question = data.get('question')
+     chat_history = data.get('chat_history', [])
+
+     if not question:
+         return jsonify({'error': 'No question provided'}), 400
+
+     try:
+         formatted_history = format_chat_history(question, chat_history)
+         result = qa_chain({"question": question, "chat_history": formatted_history})
+
+         # Process the response
+         answer = result['answer']
+         if "Helpful Answer:" in answer:
+             answer = answer.split("Helpful Answer:")[-1]
+
+         # Extract sources
+         sources = []
+         for doc in result['source_documents'][:3]:
+             sources.append({
+                 'content': doc.page_content.strip(),
+                 'page': doc.metadata.get('page', 0) + 1 # Convert to 1-based page numbers
+             })
+
+         response = {
+             'answer': answer,
+             'sources': sources
+         }
+
+         return jsonify(response), 200
+     except Exception as e:
+         return jsonify({'error': str(e)}), 500
+
+ @app.route('/upload-local', methods=['POST'])
+ def upload_local():
+     """Handle PDF upload from local file system"""
+     global vector_db
+
+     data = request.json
+     file_path = data.get('file_path')
+
+     if not file_path or not os.path.exists(file_path):
+         return jsonify({'error': 'File not found'}), 400
+
+     try:
+         # Process document
+         doc_splits = load_doc([file_path])
+         vector_db = create_db(doc_splits)
+
+         return jsonify({'message': 'PDF processed successfully'}), 200
      except Exception as e:
-         return jsonify({
-             'success': False,
-             'message': f"Error: {str(e)}"
-         }), 500
-
- @app.route('/get_live_space_status', methods=['GET'])
- def get_live_space_status():
-     """API route to stream live status of a Hugging Face Space."""
-     space_id = request.args.get('space_id', 'Pamudu13/web-scraper') # Default to 'Pamudu13/web-scraper' if not provided
-
-     def generate():
-         while True:
-             try:
-                 # Fetch the current runtime status of the Space
-                 hf_api = HfApi()
-                 space_runtime = hf_api.get_space_runtime(repo_id=space_id)
-
-                 # Extract relevant details
-                 status = space_runtime.stage # e.g., 'BUILDING', 'RUNNING', etc.
-                 hardware = space_runtime.hardware # e.g., 'cpu-basic', 't4-medium', etc.
-
-                 # Send the status as a Server-Sent Event
-                 yield f"data: {status}\n\n"
-                 yield f"data: {hardware}\n\n"
-
-                 # Delay before checking the status again
-                 time.sleep(5) # Adjust polling interval as needed
-
-             except Exception as e:
-                 # Handle errors and send an error message
-                 yield f"data: Error: {str(e)}\n\n"
-                 break # Stop the stream in case of an error
-
-     return Response(stream_with_context(generate()), mimetype='text/event-stream')
-
+         return jsonify({'error': str(e)}), 500
+
+ @app.route('/start-upload', methods=['POST'])
+ def start_upload():
+     """Initialize a new file upload"""
+     global current_upload
+
+     data = request.json
+     current_upload = {
+         'filename': data['filename'],
+         'chunks': [],
+         'filesize': data['filesize']
+     }
+     return jsonify({'message': 'Upload started'}), 200
+
+ @app.route('/upload-chunk', methods=['POST'])
+ def upload_chunk():
+     """Handle a chunk of the file"""
+     global current_upload
+
+     if not current_upload['filename']:
+         return jsonify({'error': 'No upload in progress'}), 400
+
+     try:
+         chunk = base64.b64decode(request.json['chunk'])
+         current_upload['chunks'].append(chunk)
+         return jsonify({'message': 'Chunk received'}), 200
+     except Exception as e:
+         return jsonify({'error': str(e)}), 500
+
+ @app.route('/finish-upload', methods=['POST'])
+ def finish_upload():
+     """Process the complete file"""
+     global current_upload, vector_db
+
+     if not current_upload['filename']:
+         return jsonify({'error': 'No upload in progress'}), 400
+
+     try:
+         # Create temp directory if it doesn't exist
+         os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
+         temp_path = os.path.join(app.config['UPLOAD_FOLDER'], current_upload['filename'])
+
+         # Combine chunks and save file
+         with open(temp_path, 'wb') as f:
+             for chunk in current_upload['chunks']:
+                 f.write(chunk)
+
+         # Process the PDF
+         doc_splits = load_doc([temp_path])
+         vector_db = create_db(doc_splits)
+
+         # Cleanup
+         os.remove(temp_path)
+         current_upload['chunks'] = []
+         current_upload['filename'] = None
+
+         return jsonify({'message': 'PDF processed successfully'}), 200
+     except Exception as e:
+         if os.path.exists(temp_path):
+             os.remove(temp_path)
+         return jsonify({'error': str(e)}), 500

  if __name__ == '__main__':
-     logger.info("Starting Flask API server...")
-     app.run(host='0.0.0.0', port=5001, debug=True)
+     app.run(debug=True)
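For orientation, here is a minimal client-side sketch of the upload-and-chat flow this commit introduces. It is illustrative only, not part of the commit: the base URL, chunk size, and file name are assumptions (app.run(debug=True) serves on Flask's default http://127.0.0.1:5000), and the server needs HF_TOKEN set for the HuggingFaceEndpoint calls to succeed.

import base64
import requests

SERVER = "http://127.0.0.1:5000" # assumed base URL; Flask's default dev port
CHUNK_SIZE = 512 * 1024 # illustrative; keeps each request well under the 32MB cap

# 1. Send a PDF through the chunked-upload protocol
#    (/start-upload -> /upload-chunk -> /finish-upload).
with open("doc.pdf", "rb") as f: # "doc.pdf" is a placeholder path
    data = f.read()
requests.post(f"{SERVER}/start-upload",
              json={"filename": "doc.pdf", "filesize": len(data)}).raise_for_status()
for i in range(0, len(data), CHUNK_SIZE):
    chunk = base64.b64encode(data[i:i + CHUNK_SIZE]).decode("ascii")
    requests.post(f"{SERVER}/upload-chunk", json={"chunk": chunk}).raise_for_status()
requests.post(f"{SERVER}/finish-upload", json={}).raise_for_status()

# 2. Initialize the ConversationalRetrievalChain over the freshly built FAISS index.
requests.post(f"{SERVER}/initialize-llm",
              json={"model": "llama", "temperature": 0.5,
                    "max_tokens": 4096, "top_k": 3}).raise_for_status()

# 3. Ask a question; chat_history is a list of (user, assistant) pairs.
r = requests.post(f"{SERVER}/chat",
                  json={"question": "What is this document about?", "chat_history": []})
r.raise_for_status()
print(r.json()) # {'answer': ..., 'sources': [{'content': ..., 'page': ...}, ...]}

Smaller files can skip the chunked protocol entirely: a single POST to /upload with {'pdf_base64': ..., 'filename': ...} (or to /upload-local with a server-side file path) builds the same vector store in one request.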