# pdf.ai / app.py
# sdas2485's picture
# Create app.py
# 90cf652 verified
import os
import fitz # PyMuPDF for PDFs
import pytesseract
from PIL import Image
import io
from flask import Flask, request, jsonify
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage
# Initialize the Flask application.
app = Flask(__name__)

# Mistral API key: keep a key that is already exported in the environment and
# only fall back to the placeholder when none is set. Assigning the
# placeholder unconditionally would clobber a real key supplied at deploy time.
os.environ.setdefault("MISTRAL_API_KEY", "your_api_key_here")
client = MistralClient(api_key=os.getenv("MISTRAL_API_KEY"))

# Set Tesseract path for Windows (if needed)
# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF, using OCR for pages that yield no text.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        A single string holding the text of every page in page order, with a
        newline appended after each page.
    """
    page_texts = []
    # The context manager closes the document even if a page fails to
    # process; the original left the file handle open.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            extracted_text = page.get_text("text")
            # If no text, apply OCR (for scanned PDFs)
            if not extracted_text.strip():
                pix = page.get_pixmap()
                img = Image.open(io.BytesIO(pix.tobytes()))
                extracted_text = pytesseract.image_to_string(img)
            page_texts.append(extracted_text)
    # One join instead of repeated string concatenation in the loop.
    return "".join(f"{page_text}\n" for page_text in page_texts)
def query_mistral(pdf_text, user_query, model="mistral-7b", max_chars=3000):
    """Send extracted text and a user query to Mistral AI.

    Args:
        pdf_text: Text extracted from the PDF.
        user_query: The question to ask about the document.
        model: Model identifier passed to ``client.chat``.
        max_chars: Maximum number of characters of ``pdf_text`` included in
            the prompt.

    Returns:
        The content of the first choice in the chat response.
    """
    # NOTE(review): "mistral-7b" is kept as the default for compatibility;
    # confirm it is a model id the Mistral API actually accepts.
    excerpt = pdf_text[:max_chars]
    # Only flag the excerpt as truncated when text was really cut off, so the
    # model is not told a complete document is partial.
    if len(pdf_text) > max_chars:
        excerpt += "... (truncated)"
    messages = [
        ChatMessage(role="system", content="You are an AI that answers questions based on PDFs."),
        ChatMessage(role="user", content=f"Document content: {excerpt}"),
        ChatMessage(role="user", content=f"User question: {user_query}"),
    ]
    response = client.chat(model=model, messages=messages)
    return response.choices[0].message.content
@app.route("/upload", methods=["POST"])
def upload_pdf():
    """Store the uploaded PDF and return a preview of its extracted text.

    Expects a multipart form upload with the PDF in the ``file`` field.

    Returns:
        A JSON response with a confirmation message and the first 500
        characters of the extracted text, or a JSON error with status 400
        when no file was supplied.
    """
    uploaded = request.files.get("file")
    # A form submitted without choosing a file still sends a "file" part, but
    # with an empty filename; treat that the same as a missing part.
    if uploaded is None or not uploaded.filename:
        return jsonify({"error": "No file uploaded"}), 400

    pdf_path = "uploaded.pdf"
    uploaded.save(pdf_path)

    # Extract text
    pdf_text = extract_text_from_pdf(pdf_path)
    return jsonify({"message": "PDF uploaded and processed", "text": pdf_text[:500]})  # Preview
@app.route("/chat", methods=["POST"])
def chat():
    """Answer a question about the previously uploaded PDF.

    Expects a JSON object body with a non-empty string under ``query``.

    Returns:
        A JSON response with the model's answer under ``response``, or a JSON
        error with status 400 when the body is not a JSON object, the query is
        missing or blank, or no PDF has been uploaded yet.
    """
    # silent=True yields None instead of raising on a missing or malformed
    # JSON body, so the error can be reported in this API's own JSON format.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        return jsonify({"error": "Request body must be a JSON object"}), 400

    user_query = payload.get("query", "")
    if not isinstance(user_query, str) or not user_query.strip():
        return jsonify({"error": "No query provided"}), 400

    pdf_path = "uploaded.pdf"
    # Without this check a chat request made before any upload fails while
    # opening the file and surfaces as an unhandled server error.
    if not os.path.exists(pdf_path):
        return jsonify({"error": "No PDF uploaded"}), 400

    pdf_text = extract_text_from_pdf(pdf_path)
    response = query_mistral(pdf_text, user_query)
    return jsonify({"response": response})
if __name__ == "__main__":
    # Debug mode enables Werkzeug's interactive debugger, which lets anyone
    # who can reach the server execute arbitrary code. Keep it off unless it
    # is explicitly requested with FLASK_DEBUG=1.
    app.run(debug=os.getenv("FLASK_DEBUG", "0") == "1")