ma / app.py
madji05's picture
Upload app.py
700e582 verified
raw
history blame
4.73 kB
import os
import pytesseract
from pdf2image import convert_from_path
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
import torch
from flask import Flask, request, jsonify, render_template
from threading import Thread
import gradio as gr
app = Flask(__name__)

# Path setup: one sub-folder per document class under `input`,
# trained model artifacts (weights + tokenizer files) under `model`.
input_folder = 'input'
model_folder = 'model'
# Convert a PDF file to text using Tesseract OCR.
def pdf_to_text(file_path):
    """Extract text from every page of the PDF at *file_path*.

    Each page is rendered to an image and OCR'd with the Arabic
    language pack ('ara'); the per-page texts are concatenated
    and returned as a single string.
    """
    images = convert_from_path(file_path)
    # ''.join avoids quadratic string concatenation over many pages.
    return ''.join(
        pytesseract.image_to_string(image, lang='ara')  # Arabic OCR
        for image in images
    )
# Build the training dataset from the input folder layout.
def prepare_data():
    """Scan ``input_folder`` and return a ``(Dataset, labels)`` pair.

    Expected layout: one sub-folder per class, each containing PDF
    files. Labels are sorted so the label -> index mapping is
    deterministic across runs (os.listdir order is platform-dependent);
    non-directory entries in ``input_folder`` are ignored so a stray
    file cannot crash the scan.
    """
    data = {'text': [], 'label': []}
    labels = sorted(
        entry for entry in os.listdir(input_folder)
        if os.path.isdir(os.path.join(input_folder, entry))
    )
    for label in labels:
        label_folder = os.path.join(input_folder, label)
        for file_name in os.listdir(label_folder):
            file_path = os.path.join(label_folder, file_name)
            data['text'].append(pdf_to_text(file_path))
            data['label'].append(label)
    return Dataset.from_dict(data), labels
# Load the fine-tuned model and tokenizer from disk.
def load_model():
    """Return ``(tokenizer, model)`` restored from ``model_folder``.

    Assumes ``train_model`` has previously saved both there.
    (The original body also created an unused ``model_name`` local,
    which has been removed.)
    """
    tokenizer = AutoTokenizer.from_pretrained(model_folder)
    model = AutoModelForSequenceClassification.from_pretrained(model_folder)
    return tokenizer, model
# Fine-tune mBERT on the prepared dataset and persist it.
def train_model():
    """Train a document classifier and save it to ``model_folder``.

    Updates the module-level ``tokenizer``, ``model`` and ``labels``
    so the Flask and Gradio handlers can use the freshly trained
    model immediately. Returns a short status message.
    """
    global tokenizer, model, labels
    dataset, labels = prepare_data()
    # Trainer requires integer class ids; the raw dataset carries the
    # label *strings*, which would fail at tensor conversion time.
    label_to_id = {label: i for i, label in enumerate(labels)}
    dataset = dataset.map(lambda x: {'label': label_to_id[x['label']]})
    split = dataset.train_test_split(test_size=0.2)
    tokenized = split.map(
        lambda x: tokenizer(x['text'], padding="max_length", truncation=True),
        batched=True,
    )
    training_args = TrainingArguments(
        output_dir=model_folder,
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-multilingual-cased", num_labels=len(labels)
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized['train'],
        eval_dataset=tokenized['test'],
    )
    trainer.train()
    # Persist weights and tokenizer files so load_model() can restore them.
    model.save_pretrained(model_folder)
    tokenizer.save_pretrained(model_folder)
    return "Model trained and saved!"
# Classify a single PDF with the in-memory model.
def classify_document(file_path):
    """OCR the PDF at *file_path* and return ``(predicted_label, text)``.

    Relies on the module-level ``tokenizer``, ``model`` and ``labels``
    having been initialised (by ``run_flask`` or ``train_model``).
    """
    text = pdf_to_text(file_path)
    inputs = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True)
    # Inference only: no_grad skips building the autograd graph,
    # saving memory and compute for each request.
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1)
    return labels[predictions.item()], text
# Web UI routes.
@app.route('/')
def home():
    """Serve the main upload/training page (templates/index.html)."""
    return render_template('index.html')
@app.route('/train', methods=['POST'])
def train():
    """Kick off model training and report the outcome as JSON."""
    return jsonify({'message': train_model()})
@app.route('/classify', methods=['POST'])
def classify():
    """Accept an uploaded PDF, classify it, and return label + OCR text.

    Returns HTTP 400 with an error payload when no file is attached
    or the filename is empty.
    """
    if 'file' not in request.files:
        return jsonify({'error': 'No file provided'}), 400
    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No file selected'}), 400
    # makedirs avoids a crash when the uploads folder does not exist
    # yet; basename strips any client-supplied directory components
    # (path-traversal hardening on an untrusted filename).
    os.makedirs('uploads', exist_ok=True)
    file_path = os.path.join('uploads', os.path.basename(file.filename))
    file.save(file_path)
    label, text = classify_document(file_path)
    return jsonify({'label': label, 'text': text})
def run_flask():
    """Initialise the global model state and start the Flask server.

    If a trained model exists on disk it is loaded; otherwise only the
    base tokenizer is prepared and ``/classify`` will fail until
    ``/train`` has been called.
    """
    # Declared once at function top: `global` applies to the whole
    # function scope, so both branches assign the module-level names.
    global tokenizer, model, labels
    if os.path.exists(model_folder):
        tokenizer, model = load_model()
        # NOTE(review): this list must match the label order used at
        # training time. sorted() + directory filtering makes it
        # deterministic — a bare os.listdir() is platform-dependent
        # and could scramble the label -> index mapping.
        labels = sorted(
            entry for entry in os.listdir(input_folder)
            if os.path.isdir(os.path.join(input_folder, entry))
        )
    else:
        tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
        model = None
        labels = []
    app.run(port=5000)
# Launch the Gradio demo.
def run_gradio():
    """Start a minimal Gradio interface for classifying raw text.

    Uses the module-level ``tokenizer``, ``model`` and ``labels``.
    """
    def predict(raw_text):
        # Tokenize, run the model, and look up the winning label.
        encoded = tokenizer(raw_text, return_tensors="pt",
                            padding="max_length", truncation=True)
        logits = model(**encoded).logits
        index = torch.argmax(logits, dim=-1).item()
        return labels[index]

    gr.Interface(fn=predict, inputs="text", outputs="text").launch(
        server_name="0.0.0.0", server_port=7860)
if __name__ == '__main__':
    # Run the Flask API and the Gradio UI side by side, each on its
    # own thread (ports 5000 and 7860 respectively).
    flask_thread = Thread(target=run_flask)
    gradio_thread = Thread(target=run_gradio)
    flask_thread.start()
    gradio_thread.start()