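"""Arabic PDF document classifier.

Converts PDFs to text with Tesseract OCR, fine-tunes
bert-base-multilingual-cased on the result, and serves predictions
through a Flask API (port 5000) and a Gradio text UI (port 7860).
"""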
import os
import pytesseract
from pdf2image import convert_from_path
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
import torch
from flask import Flask, request, jsonify, render_template
from threading import Thread
import gradio as gr
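# External binaries required in addition to the pip packages: Poppler
# (used by pdf2image's convert_from_path) and Tesseract with the Arabic
# ('ara') language data (used by pytesseract).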
app = Flask(__name__)
# Path setup
input_folder = 'input'
model_folder = 'model'
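# Expected layout: input/<label-name>/*.pdf, one subfolder per class.
# The fine-tuned model and tokenizer are written to model/.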
# Convert a PDF to text using Tesseract OCR
def pdf_to_text(file_path):
    images = convert_from_path(file_path)
    text = ''
    for image in images:
        text += pytesseract.image_to_string(image, lang='ara')  # assumes Arabic-language documents
    return text
# Build the training dataset from the input folder
def prepare_data():
    data = {'text': [], 'label': []}
    labels = sorted(os.listdir(input_folder))  # sort for a stable label order across runs
    for label_id, label in enumerate(labels):
        label_folder = os.path.join(input_folder, label)
        for file_name in os.listdir(label_folder):
            file_path = os.path.join(label_folder, file_name)
            data['text'].append(pdf_to_text(file_path))
            data['label'].append(label_id)  # Trainer expects integer class ids, not strings
    return Dataset.from_dict(data), labels
# Load a previously trained model and tokenizer from disk
def load_model():
    tokenizer = AutoTokenizer.from_pretrained(model_folder)
    model = AutoModelForSequenceClassification.from_pretrained(model_folder)
    return tokenizer, model
# Fine-tune the model on the prepared dataset
def train_model():
    global tokenizer, model, labels  # shared with the Flask and Gradio handlers
    dataset, labels = prepare_data()
    split = dataset.train_test_split(test_size=0.2)
    tokenized_datasets = split.map(
        lambda x: tokenizer(x['text'], padding="max_length", truncation=True),
        batched=True,
    )
    training_args = TrainingArguments(
        output_dir=model_folder,
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-multilingual-cased", num_labels=len(labels)
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['test'],
    )
    trainer.train()
    # Save the model and tokenizer so they can be reloaded on the next start
    model.save_pretrained(model_folder)
    tokenizer.save_pretrained(model_folder)
    return "Model trained and saved!"
# Classify a single PDF document
def classify_document(file_path):
    text = pdf_to_text(file_path)
    inputs = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True)
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1)
    label = labels[predictions.item()]
    return label, text
# Web interface
@app.route('/')
def home():
    return render_template('index.html')
@app.route('/train', methods=['POST'])
def train():
    message = train_model()
    return jsonify({'message': message})
@app.route('/classify', methods=['POST'])
def classify():
    if model is None:
        return jsonify({'error': 'Model not trained yet'}), 400
    if 'file' not in request.files:
        return jsonify({'error': 'No file provided'}), 400
    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No file selected'}), 400
    os.makedirs('uploads', exist_ok=True)  # the uploads folder may not exist on first run
    file_path = os.path.join('uploads', file.filename)
    file.save(file_path)
    label, text = classify_document(file_path)
    return jsonify({'label': label, 'text': text})
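# Example request (hypothetical file name):
#   curl -F "file=@sample.pdf" http://localhost:5000/classify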
def run_flask():
    global tokenizer, model, labels
    if os.path.exists(model_folder):
        # Reuse the saved model; labels must be sorted the same way as in prepare_data()
        tokenizer, model = load_model()
        labels = sorted(os.listdir(input_folder))
    else:
        tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
        model = None
        labels = []
    app.run(port=5000)
# Launch the Gradio text interface
def run_gradio():
    def classify(text):
        # Classifies raw text directly, bypassing the OCR step
        if model is None:
            return "Model not trained yet"
        inputs = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=-1)
        return labels[predictions.item()]
    gr.Interface(fn=classify, inputs="text", outputs="text").launch(server_name="0.0.0.0", server_port=7860)
if __name__ == '__main__':
    Thread(target=run_flask).start()
    Thread(target=run_gradio).start()
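# Usage (assuming this file is saved as app.py): run `python app.py`,
# POST to /train to fine-tune on the PDFs under input/, then POST PDFs
# to /classify or open the Gradio UI on port 7860.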