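# Arabic PDF document classifier: OCR PDFs with Tesseract, fine-tune
# multilingual BERT on the extracted text, and serve /train and /classify
# endpoints through a small Flask app.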
import os
import pytesseract
from pdf2image import convert_from_path
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
import torch
from flask import Flask, request, jsonify, render_template
from werkzeug.utils import secure_filename
from threading import Thread
app = Flask(__name__)
# Path configuration
input_folder = 'input'
model_folder = 'model'
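# Expected layout: one subfolder per class label under input/, each holding
# that class's PDF files, e.g. input/<label_name>/<document>.pdf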
# Convert a PDF to plain text by OCR-ing each rendered page with Tesseract
def pdf_to_text(file_path):
    images = convert_from_path(file_path)
    text = ''
    for image in images:
        text += pytesseract.image_to_string(image, lang='ara')  # Arabic OCR
    return text
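# Note: pdf2image requires the poppler utilities on the system, and
# pytesseract requires the Tesseract binary with the Arabic ('ara')
# language data installed.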
# Build a dataset from the input folder: each subfolder name is a class label
def prepare_data():
    data = {'text': [], 'label': []}
    # Sort the labels so the label-to-index mapping is stable across runs
    labels = sorted(
        d for d in os.listdir(input_folder)
        if os.path.isdir(os.path.join(input_folder, d))
    )
    for label in labels:
        label_folder = os.path.join(input_folder, label)
        for file_name in os.listdir(label_folder):
            file_path = os.path.join(label_folder, file_name)
            text = pdf_to_text(file_path)
            data['text'].append(text)
            data['label'].append(label)
    return Dataset.from_dict(data), labels
# Load the fine-tuned model and tokenizer from disk
def load_model():
    tokenizer = AutoTokenizer.from_pretrained(model_folder)
    model = AutoModelForSequenceClassification.from_pretrained(model_folder)
    return tokenizer, model
# Fine-tune the classifier on the prepared dataset
def train_model():
    global tokenizer, model, labels  # shared with the /classify route
    dataset, labels = prepare_data()
    # Trainer expects integer class ids in a 'labels' column, so map the
    # string labels (folder names) to their indices first
    label2id = {name: i for i, name in enumerate(labels)}
    dataset = dataset.map(lambda x: {'labels': label2id[x['label']]})
    splits = dataset.train_test_split(test_size=0.2)
    tokenized_datasets = splits.map(
        lambda x: tokenizer(x['text'], padding="max_length", truncation=True),
        batched=True,
    )
    training_args = TrainingArguments(
        output_dir=model_folder,
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-multilingual-cased", num_labels=len(labels)
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['test'],
    )
    trainer.train()
    # Save the model and tokenizer so they can be reloaded on restart
    model.save_pretrained(model_folder)
    tokenizer.save_pretrained(model_folder)
    return "Model trained and saved!"
# Classify a single PDF with the fine-tuned model
def classify_document(file_path):
    text = pdf_to_text(file_path)
    inputs = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True)
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1)
    label = labels[predictions.item()]
    return label, text
# Web interface
@app.route('/')
def home():
    return render_template('index.html')

@app.route('/train', methods=['POST'])
def train():
    message = train_model()
    return jsonify({'message': message})
@app.route('/classify', methods=['POST'])
def classify():
    if model is None:
        return jsonify({'error': 'No trained model yet; call /train first'}), 400
    if 'file' not in request.files:
        return jsonify({'error': 'No file provided'}), 400
    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No file selected'}), 400
    os.makedirs('uploads', exist_ok=True)  # make sure the upload folder exists
    # Sanitize the client-supplied filename before using it in a path
    file_path = os.path.join('uploads', secure_filename(file.filename))
    file.save(file_path)
    label, text = classify_document(file_path)
    return jsonify({'label': label, 'text': text})
def run_flask():
    global tokenizer, model, labels
    if os.path.exists(model_folder):
        # Reload the previously fine-tuned model and its label set
        tokenizer, model = load_model()
        labels = sorted(
            d for d in os.listdir(input_folder)
            if os.path.isdir(os.path.join(input_folder, d))
        )
    else:
        # No trained model yet: fall back to the base tokenizer until /train is called
        tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
        model = None
        labels = []
    app.run()

if __name__ == '__main__':
    Thread(target=run_flask).start()
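# Example requests once the server is running (Flask defaults to port 5000);
# the PDF path below is a placeholder:
#   curl -X POST http://127.0.0.1:5000/train
#   curl -X POST -F 'file=@/path/to/document.pdf' http://127.0.0.1:5000/classify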