NanobotzAI committed (verified)
Commit 21753a3 · Parent: 880c901

Update app.py

Files changed (1): app.py (+70 -51)
app.py CHANGED
@@ -1,10 +1,12 @@
- from flask import Flask, request, jsonify, send_from_directory
+ from flask import Flask, request, jsonify, render_template
  import fitz  # PyMuPDF for PDF text extraction
  import faiss  # FAISS for vector search
  import numpy as np
  from sentence_transformers import SentenceTransformer
  from huggingface_hub import InferenceClient
- import os
+ from typing import List, Tuple
+
+ app = Flask(__name__)

  # Default settings
  class ChatConfig:
@@ -14,74 +16,91 @@ class ChatConfig:
      DEFAULT_TEMP = 0.3
      DEFAULT_TOP_P = 0.95

- HF_TOKEN = os.getenv("HF_TOKEN")  # Fetch from environment variables
-
- client = InferenceClient(ChatConfig.MODEL, token=HF_TOKEN)
+ client = InferenceClient(ChatConfig.MODEL)
  embed_model = SentenceTransformer("all-MiniLM-L6-v2")  # Lightweight embedding model
  vector_dim = 384  # Embedding size
  index = faiss.IndexFlatL2(vector_dim)  # FAISS index

  documents = []  # Store extracted text

- app = Flask(__name__)
-
- @app.route("/")
- def serve_homepage():
-     """Serves the HTML interface."""
-     return send_from_directory(os.getcwd(), 'index.html')
-
- @app.route("/upload_pdf/", methods=["POST"])
- def upload_pdf():
-     """Handles PDF file processing."""
-     global documents
-     file = request.files['file']
-
-     # Save the uploaded file temporarily
-     file_path = os.path.join(os.getcwd(), file.filename)
-     file.save(file_path)
-
-     # Extract text from PDF
-     doc = fitz.open(file_path)
+ def extract_text_from_pdf(pdf_path):
+     """Extracts text from PDF"""
+     doc = fitz.open(pdf_path)
      text_chunks = [page.get_text("text") for page in doc]
-
-     # Create vector database
+     return text_chunks
+
+ def create_vector_db(text_chunks):
+     """Embeds text chunks and adds them to FAISS index"""
+     global documents, index
      documents = text_chunks
      embeddings = embed_model.encode(text_chunks)
      index.add(np.array(embeddings, dtype=np.float32))

-     return jsonify({"message": "PDF uploaded and indexed successfully!"})
-
- @app.route("/chat/", methods=["GET"])
- def chat_with_pdf():
-     """Handles user queries and returns AI-generated responses."""
-     msg = request.args.get("msg")
-
+ def search_relevant_text(query):
+     """Finds the most relevant text chunks for the given query"""
+     query_embedding = embed_model.encode([query])
+     _, closest_idx = index.search(np.array(query_embedding, dtype=np.float32), k=3)
+     return "\n".join([documents[i] for i in closest_idx[0]])
+
+ def generate_response(
+     message: str,
+     history: List[Tuple[str, str]],
+     system_message: str = ChatConfig.DEFAULT_SYSTEM_MSG,
+     max_tokens: int = ChatConfig.DEFAULT_MAX_TOKENS,
+     temperature: float = ChatConfig.DEFAULT_TEMP,
+     top_p: float = ChatConfig.DEFAULT_TOP_P
+ ) -> str:
      if not documents:
-         return jsonify({"response": "Please upload a PDF first."})
+         return "Please upload a PDF first."

-     # Retrieve relevant context
-     query_embedding = embed_model.encode([msg])
-     _, closest_idx = index.search(np.array(query_embedding, dtype=np.float32), k=3)
-     context = "\n".join([documents[i] for i in closest_idx[0]])
+     context = search_relevant_text(message)  # Get relevant content from PDF
+
+     messages = [{"role": "system", "content": system_message}]
+     for user_msg, bot_msg in history:
+         if user_msg:
+             messages.append({"role": "user", "content": user_msg})
+         if bot_msg:
+             messages.append({"role": "assistant", "content": bot_msg})

-     # Generate AI response
-     messages = [
-         {"role": "system", "content": ChatConfig.DEFAULT_SYSTEM_MSG},
-         {"role": "user", "content": f"Context: {context}\nQuestion: {msg}"}
-     ]
+     messages.append({"role": "user", "content": f"Context: {context}\nQuestion: {message}"})

-     response_text = ""
+     response = ""
      for chunk in client.chat_completion(
          messages,
-         max_tokens=ChatConfig.DEFAULT_MAX_TOKENS,
+         max_tokens=max_tokens,
          stream=True,
-         temperature=ChatConfig.DEFAULT_TEMP,
-         top_p=ChatConfig.DEFAULT_TOP_P,
+         temperature=temperature,
+         top_p=top_p,
      ):
          token = chunk.choices[0].delta.content or ""
-         response_text += token
+         response += token
+     return response
+
+ @app.route('/')
+ def home():  # a view named "index" would shadow the FAISS index above
+     """Serve the HTML page for the user interface"""
+     return render_template('index.html')
+
+ @app.route('/upload_pdf', methods=['POST'])
+ def upload_pdf():
+     """Handle PDF upload"""
+     file = request.files['pdf']
+     pdf_path = f"uploaded_files/{file.filename}"
+     file.save(pdf_path)
+
+     # Extract text and create vector database
+     text_chunks = extract_text_from_pdf(pdf_path)
+     create_vector_db(text_chunks)
+
+     return jsonify({"message": "PDF uploaded and indexed successfully!"})

-     return jsonify({"response": response_text})
+ @app.route('/ask_question', methods=['POST'])
+ def ask_question():
+     """Handle user question"""
+     message = request.json.get('message')
+     history = request.json.get('history', [])
+     response = generate_response(message, history)
+     return jsonify({"response": response})

- if __name__ == "__main__":
-     app.run(host="0.0.0.0", port=8000)
+ if __name__ == '__main__':
+     app.run(debug=True)
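
Two behavioural notes on this version: `InferenceClient(ChatConfig.MODEL)` now runs without the `HF_TOKEN`, so calls go out unauthenticated and may be rate-limited or rejected for gated models, and `app.run(debug=True)` uses Flask's defaults (127.0.0.1:5000) instead of the old 0.0.0.0:8000. With that in mind, a minimal client sketch for exercising the two new routes (assumes a local server on port 5000 and a `sample.pdf` in the working directory; both are assumptions, not part of the commit):

```python
# Sketch: driving the new endpoints with the requests library.
# Assumes the app is running locally on Flask's default port 5000.
import requests

BASE = "http://127.0.0.1:5000"

# Upload and index a PDF; the multipart field must be named "pdf"
# to match request.files['pdf'] in upload_pdf().
with open("sample.pdf", "rb") as f:
    print(requests.post(f"{BASE}/upload_pdf", files={"pdf": f}).json())

# Ask a question; history is a list of [user, assistant] pairs,
# matching the List[Tuple[str, str]] hint on generate_response().
payload = {"message": "What is this document about?", "history": []}
print(requests.post(f"{BASE}/ask_question", json=payload).json()["response"])
```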
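
Note on the upload path: `file.save(f"uploaded_files/{file.filename}")` raises `FileNotFoundError` unless an `uploaded_files/` directory already exists, and saving an unsanitized client filename invites path traversal. A possible hardening of that block (a sketch; `secure_filename` comes from Werkzeug, which Flask already depends on):

```python
# Sketch: safer file handling inside upload_pdf().
import os
from werkzeug.utils import secure_filename

os.makedirs("uploaded_files", exist_ok=True)  # create the folder on first use
pdf_path = os.path.join("uploaded_files", secure_filename(file.filename))
file.save(pdf_path)
```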
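
Note on re-uploads: `create_vector_db` only ever appends, so a second upload replaces `documents` while the first document's vectors stay in the FAISS index; `search_relevant_text` can then return stale hits or indices past the end of the new `documents` list. One way to make each upload replace the index (a sketch using FAISS's `Index.reset()`):

```python
# Sketch: clear previous vectors so each upload starts a fresh index.
def create_vector_db(text_chunks):
    """Embeds text chunks and adds them to FAISS index"""
    global documents
    index.reset()  # drops vectors from any earlier upload; reset() mutates
    documents = text_chunks  # in place, so no 'global index' is needed
    embeddings = embed_model.encode(text_chunks)
    index.add(np.array(embeddings, dtype=np.float32))
```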