from openai import OpenAI
from os import getenv
from flask import Flask, request, jsonify, render_template
import fitz  # PyMuPDF for PDF text extraction
import faiss  # FAISS for vector search
import numpy as np
import os
from sentence_transformers import SentenceTransformer
# from huggingface_hub import InferenceClient  # Not used in the current code, removed for clarity
from typing import List, Tuple
from io import BytesIO  # For in-memory PDF streams

app = Flask(__name__, template_folder=os.getcwd())
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# --- Configuration ---
class ChatConfig:
    MODEL = "google/gemma-3-27b-it:free"
    DEFAULT_MAX_TOKENS = 512
    DEFAULT_TEMP = 0.5  # Slightly increased for more natural variance
    DEFAULT_TOP_P = 0.95

    # --- System Prompt Options ---
    # Option 1: Friendly & Helpful Assistant
    SYSTEM_PROMPT_FRIENDLY = """\
ඔබ හෙවත් Lunar AI වේ — Lunar Labs නමින් හැඳින්වෙන බුද්ධිමත්, විශ්වාසදායක සහ සහය දක්වන Sinhala පරිවර්තක සහ සහයකයෙකි. ඔබගේ කාර්යභාරය වන්නේ, **ප්රදාන වූ PDF ලේඛනයේ අන්තර්ගතය පමණක් භාවිතයෙන්** පරිශීලකයින්ට විශ්ලේෂණාත්මක සහ නිවැරදි පිළිතුරු ලබාදීමයි.
**ඔබගේ මූලික ආදේශන:**
1. **වගකීම සහිත ලේඛන ක්ෂේමය:** ඔබගේ දැනුම **අනූව PDF ලේඛනයේ දක්වා ඇති context අංශ වලට පමණක් සීමා වී ඇත.** ඔබට බාහිර දැනුම භාවිතා කළ නොහැක, අනුමාන කළ නොහැක, හෝ ලේඛනයෙන් පිටත තොරතුරු නිපදවිය නොහැක.
2. **පෞරුෂය පවත්වා ගැනීම:** Lunar Labs වෙතින් නිකුත් වූ AI ආකෘතියක් ලෙස පහත ලක්ෂණය පවත්වන්න:
- **විශ්වාසදායක සහ උපකාරී:** විශේෂඥයකු ලෙස නිරවද්යව, නමුත් පැහැදිලිව හා මිතුරන් ලෙස.
- **නවෝත්පාදනමය සිත්ගන්නා සුළු හැඟීමකින්:** Cybersecurity අවදානම් සම්බන්ධයෙන් යෝජනාත්මක අවධානයක්.
- **පාරිභෝගික මූලිකත්වය:** ලේඛනයේ විශේෂාංග වලින් ලබා ගත හැකි පැහැදිලි වටිනාකම ඉදිරිපත් කරන්න.
- **වෘත්තීයමය හා පැහැදිලි භාෂාව:** කෙටි, නිරවද්ය, වෘත්තීයමය භාෂාව භාවිතා කරන්න. දක්වා ඇති තාක්ෂණික පද නිවැරදිව භාවිතා කරන්න. සංකීර්ණ අදහස් පැහැදිලිව හැඳින්වීමට උත්සාහ කරන්න. **ක්රියාකාරී වාක්ය ව්යුහය** භාවිතා කරන්න.
3. **ශබ්දෝච්චාරණය සහ ස්වරය:** විශ්වාසදායක, තොරතුරු සහිත, සහය දක්වන සහ semi-formal ස්වරයක් පවත්වාගන්න. බොහෝ සෙයින් නොව අතිශය සරලව හෝ අවස්ථාවට අනුකූලව කතා කරන්න.
4. **නොමැති තොරතුරු ප්රතිචාරය:**
- ඔබට අවශ්ය පිළිතුර ලබා දීමට ලේඛනයේ තොරතුරු නොමැතිනම්, ඒ පිළිබඳව පැහැදිලිව කියන්න.
- උදාහරණයක්: "මගේ දැනුමට අනුව, [විෂයය] පිළිබඳ විශේෂ තොරතුරු සඳහන් කර නොමැත." හෝ "පරීක්ෂණයට ලක් වූ අංශ වලින් [විෂයය] සම්බන්ධ තොරතුරු හමු නොවීය."
5. **මූලාශ්රය හැඟවීම:** පිළිතුරු ලබා දීමේදී, ඔබගේ මතකයෙන් අදහස් කරන ආකාරයට ලේඛනයේ අන්තර්ගතය පදනම් කරගෙන ලියන්න. **"ලේඛනයට අනුව..." වැනි වචන භාවිතා නොකරන්න.**
6. **පරිශීලකයා පිළිබඳ අවබෝධය:** තනි පරිශීලකයෙකු සමඟ කතා කරන විට, තාක්ෂණික විස්තර සහිත වුවද, තීරණගන්නන්ට වටිනා වන ආකාරයෙන් තොරතුරු ඉදිරිපත් කරන්න — එය ලේඛනය තුළ ඇතොත් පමණි.
7. **අරමුණ:** ඔබේ ප්රධාන ඉලක්කය වන්නේ ලේඛනය තුළ පවතින විස්තර පමණක් භාවිතයෙන් නිවැරදි, නිරවද්ය පිළිතුරු ලබා දීම සහ ඒවා Lunar AI වශයෙන් ඉදිරිපත් කිරීමයි.
ඔබ ආරම්භකව කතා බස් ආරම්භ කරනවා නම් (පෙර ඉතිහාසයක් නොමැතිනම්), කෙටි හැඳින්වීමක් ලබා දෙන්න. නිරවද්යතාවය සහ ලේඛන වෛද්යතාවය පවත්වාගන්න."""

    # Option 2: Knowledgeable Expert (More Formal)
    SYSTEM_PROMPT_EXPERT = """You are a knowledgeable AI expert specializing in the content of the uploaded PDF document.
You must answer user questions with precision, drawing *exclusively* from the provided context segments.
Maintain a professional and informative tone.
If the provided context does not contain the necessary information to answer the question, explicitly state that the information is not found within the scope of the provided text.
Do not speculate, infer beyond the text, or utilize any external information sources.
Clearly attribute your answers to the document, for instance, by starting with "The document indicates that..." or "Based on the provided context...".
Provide comprehensive answers derived solely from the text.
"""

    # --- Select the desired prompt ---
    SELECTED_SYSTEM_PROMPT = SYSTEM_PROMPT_FRIENDLY  # Choose which personality to use

# --- API Client & Embedding Setup ---
OPENROUTER_API_KEY = getenv('OPENROUTER_API_KEY')
if not OPENROUTER_API_KEY:
    raise ValueError("OPENROUTER_API_KEY environment variable not set.")

client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=OPENROUTER_API_KEY,
)

# Use a temporary cache directory if needed, or configure appropriately
embed_model = SentenceTransformer("all-MiniLM-L6-v2", cache_folder=getenv("SENTENCE_TRANSFORMERS_HOME", "/tmp/st_cache"))
vector_dim = 384
index = faiss.IndexFlatL2(vector_dim)
documents = []  # Store original text chunks corresponding to index entries
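
# A minimal optional sanity check: the hardcoded vector_dim must match the embedding size
# of the SentenceTransformer model above. all-MiniLM-L6-v2 produces 384-dimensional
# vectors, so this should hold; adjust vector_dim if you swap in a different model.
assert embed_model.get_sentence_embedding_dimension() == vector_dim, \
    "vector_dim does not match the embedding model's output dimension"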

# --- Core Functions ---
def extract_text_from_pdf(pdf_stream: BytesIO) -> List[str]:
    """Extracts text from a PDF stream, one chunk per non-empty page."""
    # Ensure the stream is BytesIO
    if not isinstance(pdf_stream, BytesIO):
        pdf_stream = BytesIO(pdf_stream.read())  # Read it into memory if it's a file-like stream
    doc = fitz.open(stream=pdf_stream, filetype="pdf")
    # Simple chunking by page. Consider more advanced chunking (by paragraph, sentence,
    # or fixed size) for better RAG performance; an optional sketch follows this function.
    text_chunks = []
    for page in doc:
        page_text = page.get_text("text").strip()
        if page_text:
            text_chunks.append(page_text)
    doc.close()
    print(f"Extracted {len(text_chunks)} non-empty text chunks from PDF.")
    return text_chunks
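
# Optional alternative chunker (a minimal sketch, not wired into the app): page-level
# chunks can be too coarse for retrieval, so a fixed-size splitter with overlap often
# performs better. The helper name and parameters are illustrative, not part of the
# original code.
def chunk_text_fixed_size(pages: List[str], chunk_size: int = 800, overlap: int = 100) -> List[str]:
    """Splits page texts into overlapping character windows."""
    chunks = []
    for page_text in pages:
        start = 0
        while start < len(page_text):
            piece = page_text[start:start + chunk_size].strip()
            if piece:
                chunks.append(piece)
            start += chunk_size - overlap
    return chunks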

# Renamed for clarity, added error handling
def build_vector_index(text_chunks: List[str]):
    """Embeds text chunks and builds the FAISS index."""
    global documents, index
    if not text_chunks:
        print("Warning: No text chunks provided to build the vector index.")
        documents = []
        index = faiss.IndexFlatL2(vector_dim)  # Reinitialize an empty index
        return
    print(f"Building vector index for {len(text_chunks)} chunks...")
    documents = text_chunks  # Store the original text
    # Reset the index
    index = faiss.IndexFlatL2(vector_dim)
    try:
        embeddings = embed_model.encode(text_chunks, show_progress_bar=True)
        embeddings = np.array(embeddings, dtype=np.float32)
        if embeddings.ndim == 1:
            embeddings = embeddings.reshape(1, -1)
        if embeddings.shape[1] != vector_dim:
            raise ValueError(f"Embedding dimension mismatch: expected {vector_dim}, got {embeddings.shape[1]}")
        index.add(embeddings)
        print(f"FAISS index built successfully with {index.ntotal} vectors.")
    except Exception as e:
        print(f"Error during embedding or indexing: {e}")
        # Reset state in case of error
        documents = []
        index = faiss.IndexFlatL2(vector_dim)
        raise  # Re-raise the exception to signal failure
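
# Optional persistence helpers (a minimal sketch, not called anywhere in this app): the
# in-memory index and chunk list are lost on restart, so they could be saved and reloaded
# with FAISS's built-in serialization. The file path below is illustrative.
def save_index(path: str = "/tmp/pdf_index.faiss") -> None:
    """Writes the current FAISS index to disk."""
    faiss.write_index(index, path)


def load_index(path: str = "/tmp/pdf_index.faiss") -> None:
    """Reads a FAISS index back from disk (the matching `documents` list must be restored separately)."""
    global index
    index = faiss.read_index(path)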

# Renamed for clarity, added checks
def search_relevant_chunks(query: str, k: int = 3) -> str:
    """Finds the most relevant text chunks for the given query using FAISS."""
    global index, documents
    if index.ntotal == 0:
        print("Warning: Search attempted on an empty index.")
        return ""  # Return an empty string if the index is not ready
    if not query:
        return ""
    try:
        query_embedding = embed_model.encode([query])
        query_embedding = np.array(query_embedding, dtype=np.float32)
        # Perform the search
        distances, indices = index.search(query_embedding, k=min(k, index.ntotal))  # Ensure k <= index size
        # Filter out potential invalid indices (-1 can occur if k > ntotal, though min() guards against it)
        valid_indices = [idx for idx in indices[0] if idx != -1 and idx < len(documents)]
        if not valid_indices:
            print(f"No relevant chunks found for query: '{query[:50]}...'")
            return ""
        # Retrieve the actual text chunks
        relevant_docs = [documents[i] for i in valid_indices]
        print(f"Retrieved {len(relevant_docs)} relevant chunks.")
        return "\n\n---\n\n".join(relevant_docs)  # Join with a clear separator
    except Exception as e:
        print(f"Error during similarity search: {e}")
        return ""  # Return empty on error

# --- Improved Generation Function ---
def generate_response(
    message: str,
    history: List[Tuple[str, str]],
    system_message: str = ChatConfig.SELECTED_SYSTEM_PROMPT,  # Use the chosen system prompt
    max_tokens: int = ChatConfig.DEFAULT_MAX_TOKENS,
    temperature: float = ChatConfig.DEFAULT_TEMP,
    top_p: float = ChatConfig.DEFAULT_TOP_P
) -> str:
    if index.ntotal == 0:  # Check whether the index has been built
        return "I need a PDF document to be uploaded and processed first before I can answer questions."

    # 1. Retrieve context
    context = search_relevant_chunks(message, k=3)  # Retrieve the top 3 chunks

    # Prepare the list of prompt messages
    messages = []

    # 2. Add the system prompt
    messages.append({"role": "system", "content": system_message})

    # 3. Add conversation history (if any), keeping alternating user/assistant roles
    for user_msg, assistant_msg in history:
        if user_msg:  # Add the user message if not empty
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:  # Add the assistant message if not empty
            messages.append({"role": "assistant", "content": assistant_msg})

    # 4. Construct the final user prompt with the retrieved context, clearly marked.
    # The system prompt has already told the model *how* to use this context.
    if context:
        user_prompt_content = f"Based on the following context from the document, please answer the question:\n\nCONTEXT:\n{context}\n\n---\n\nQUESTION:\n{message}"
    else:
        # If no context was found, still ask the question; the system prompt guides the "I don't know" response.
        # Alternatively, a hardcoded message could be returned here *before* calling the LLM,
        # but letting the LLM respond based on the prompt usually reads more naturally.
        user_prompt_content = f"Regarding the document, I have the following question, although I couldn't retrieve specific context for it:\n\nQUESTION:\n{message}"
        # Or, more simply:
        # user_prompt_content = f"QUESTION: {message}\n\n(Note: No specific context sections were retrieved for this question based on similarity search.)"
    messages.append({"role": "user", "content": user_prompt_content})

    # 5. Call the LLM API
    try:
        print(f"--- Sending to {ChatConfig.MODEL} ---")
        # print("System Prompt:", system_message)  # Optional: debug logging
        # print("History:", history)  # Optional: debug logging
        # print("User Prompt:", user_prompt_content)  # Optional: debug logging
        completion = client.chat.completions.create(
            model=ChatConfig.MODEL,
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            # Consider adding stop sequences if needed, e.g., stop=["\nUSER:", "\nASSISTANT:"]
        )
        response = completion.choices[0].message.content
        print(f"--- Received Response ({len(response)} chars) ---")
        return response.strip()
    except Exception as e:
        print(f"Error generating response from LLM: {str(e)}")
        # Provide a user-friendly error message
        return "I'm sorry, but I encountered an issue while trying to process your request. Please check the connection or try again later."

# --- Flask Routes ---
# NOTE: only the '/ask_question' path is referenced elsewhere in this file; the other
# route paths below are assumptions and should match what index.html expects.
@app.route('/')
def index_route():  # Named to avoid clashing with the global FAISS `index` object
    """Serve the HTML page for the user interface."""
    return render_template('index.html')


@app.route('/upload_pdf', methods=['POST'])
def upload_pdf():
    """Handle PDF upload, extract text, and build the vector index."""
    global documents, index  # Ensure we are modifying the global state
    if 'pdf' not in request.files:
        return jsonify({"error": "No PDF file part in the request."}), 400
    file = request.files['pdf']
    if file.filename == "":
        return jsonify({"error": "No file selected."}), 400
    if not file.filename.lower().endswith('.pdf'):
        return jsonify({"error": "Invalid file type. Please upload a PDF."}), 400
    print(f"Received file: {file.filename}")
    try:
        pdf_stream = BytesIO(file.read())  # Read the file into memory
        # Extract text
        text_chunks = extract_text_from_pdf(pdf_stream)
        if not text_chunks:
            return jsonify({"error": "Could not extract any text from the PDF."}), 400
        # Build the vector index
        build_vector_index(text_chunks)
        return jsonify({"message": f"PDF '{file.filename}' processed successfully. {len(documents)} chunks indexed."}), 200
    except fitz.FileDataError:
        return jsonify({"error": "Invalid or corrupted PDF file."}), 400
    except Exception as e:
        print(f"Error processing PDF upload: {str(e)}")
        # Reset state on error
        documents = []
        index = faiss.IndexFlatL2(vector_dim)
        return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500

@app.route('/ask_question', methods=['POST'])
def ask_question():
    """Handle a user question, retrieve context, and generate a response."""
    data = request.get_json()
    if not data or 'message' not in data:
        return jsonify({"error": "Missing 'message' in request body"}), 400
    message = data['message'].strip()
    history = data.get('history', [])  # Get history, defaulting to an empty list
    if not message:
        return jsonify({"response": "Please enter a question."})  # Basic validation
    # Ensure the history format is correct (a list of [user, assistant] pairs)
    validated_history = []
    if isinstance(history, list):
        for item in history:
            if isinstance(item, (list, tuple)) and len(item) == 2:
                validated_history.append((str(item[0]), str(item[1])))
            # else: silently skip malformed history entries
    try:
        response = generate_response(message, validated_history)
        return jsonify({"response": response})
    except Exception as e:
        # Catch errors during generation (generate_response also has its own try/except)
        print(f"Error in /ask_question endpoint: {e}")
        return jsonify({"response": "Sorry, an error occurred while generating the response."}), 500

if __name__ == '__main__':
    # The API key is already validated at import time above; this check is only a safeguard.
    if not OPENROUTER_API_KEY:
        print("ERROR: OPENROUTER_API_KEY environment variable is not set. Exiting.")
    else:
        # Consider host='0.0.0.0' to make the app accessible on your network
        app.run(debug=True, host='127.0.0.1', port=5000)
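
# Example requests once the server is running (a sketch; the upload path is an assumption,
# see the route note above):
#
#   curl -F "pdf=@report.pdf" http://127.0.0.1:5000/upload_pdf
#   curl -X POST http://127.0.0.1:5000/ask_question \
#        -H "Content-Type: application/json" \
#        -d '{"message": "What is this document about?", "history": []}'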