# Hugging Face Spaces page status at capture time: Sleeping
import os | |
from flask import Flask, request, jsonify | |
from werkzeug.utils import secure_filename | |
from transformers import pipeline | |
from pdf2image import convert_from_path | |
from PIL import Image | |
# --- Application setup -------------------------------------------------------
# The Flask application object that serves the OCR endpoint.
app = Flask(__name__)

# Directory (relative to the working directory) where uploads are written
# before OCR; it is created at import time if it does not exist yet.
UPLOAD_FOLDER = 'uploads'
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
os.makedirs(UPLOAD_FOLDER, exist_ok=True)

# Lower-case file extensions accepted by the upload handler.
ALLOWED_EXTENSIONS = {'png', 'jpg', 'jpeg', 'pdf'}

# TrOCR image-to-text pipeline, loaded once at import time and shared by
# every request.
ocr_pipeline = pipeline(task="image-to-text", model="microsoft/trocr-small-printed")
def allowed_file(filename):
    """Return True when *filename* ends in one of ALLOWED_EXTENSIONS.

    The extension is whatever follows the last dot, compared
    case-insensitively. A name without any dot is rejected.
    """
    _, dot, extension = filename.rpartition('.')
    if not dot:
        # No dot at all, so there is no extension to check.
        return False
    return extension.lower() in ALLOWED_EXTENSIONS
def extract_text_from_image(image_path):
    """Extract text from a single image using TrOCR.

    Args:
        image_path: Path (or file object) of an image that Pillow can open,
            or an already-loaded ``PIL.Image.Image`` such as a rendered
            PDF page.

    Returns:
        The ``generated_text`` of the first OCR result, or an empty string
        when the pipeline returns no result.
    """
    if isinstance(image_path, Image.Image):
        # Already decoded: Image.open() only accepts paths / file objects,
        # so an in-memory image must be used directly.
        image = image_path.convert("RGB")
    else:
        # The context manager closes the underlying file as soon as the
        # pixels have been copied into the converted image.
        with Image.open(image_path) as source:
            image = source.convert("RGB")
    results = ocr_pipeline(image)
    return results[0]['generated_text'] if results else ""
def extract_text_from_pdf(pdf_path):
    """Convert PDF to images and extract text from each page.

    Args:
        pdf_path: Path of the PDF file to rasterize with pdf2image.

    Returns:
        The OCR text of every page joined by newlines, with leading and
        trailing whitespace stripped.
    """
    page_texts = []
    for page in convert_from_path(pdf_path):
        # Pages arrive as in-memory images, so they go straight to the OCR
        # pipeline; a helper that calls Image.open() needs a path instead.
        results = ocr_pipeline(page.convert("RGB"))
        page_texts.append(results[0]['generated_text'] if results else "")
    return "\n".join(page_texts).strip()
# NOTE(review): the view was never registered with the app, so the service
# exposed no endpoint. The '/upload' URL rule is an assumption - confirm it
# against whatever client calls this service.
@app.route('/upload', methods=['POST'])
def upload_file():
    """Handle file upload and text extraction.

    Reads the uploaded file from the ``file`` field of the request, checks
    its extension, stores it temporarily in the upload folder, runs OCR on
    it and removes the temporary copy again.

    Returns:
        A JSON response ``{"extracted_text": ...}`` on success, or
        ``{"error": ...}`` with status 400 when the upload is missing,
        unnamed, or has a disallowed extension.
    """
    import uuid

    if 'file' not in request.files:
        return jsonify({"error": "No file uploaded"}), 400
    file = request.files['file']
    # Covers both an empty string and a missing (None) filename.
    if not file.filename:
        return jsonify({"error": "No file selected"}), 400
    if not allowed_file(file.filename):
        return jsonify({"error": "Invalid file type. Allowed: PNG, JPG, JPEG, PDF."}), 400

    # Take the extension from the name that was just validated. Sanitizing
    # the name first can drop the dot or return an empty string, which made
    # the PDF/image decision disagree with the validation above.
    extension = file.filename.rsplit('.', 1)[1].lower()
    # A generated name never contains client-controlled path components and
    # keeps same-named concurrent uploads from overwriting each other.
    stored_name = f"{uuid.uuid4().hex}.{extension}"
    file_path = os.path.join(app.config['UPLOAD_FOLDER'], stored_name)
    file.save(file_path)

    try:
        # Process image or PDF
        if extension == 'pdf':
            extracted_text = extract_text_from_pdf(file_path)
        else:
            extracted_text = extract_text_from_image(file_path)
    finally:
        # The stored copy is only an intermediate for OCR; remove it so the
        # upload folder does not grow without bound.
        try:
            os.remove(file_path)
        except OSError:
            app.logger.warning("Could not remove temporary upload %s", file_path)
    return jsonify({"extracted_text": extracted_text})
# Run Flask App
if __name__ == '__main__':
    # The Werkzeug debugger lets anyone who can reach the server execute
    # arbitrary Python, so it must not be on by default while listening on
    # all interfaces. Opt in explicitly with FLASK_DEBUG=1 for local work.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").lower() in {"1", "true"}
    app.run(host='0.0.0.0', port=5000, debug=debug_enabled)