File size: 4,726 Bytes
b4b32ed
 
 
 
 
 
 
 
700e582
b4b32ed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
700e582
 
 
 
 
 
 
 
 
 
 
 
b4b32ed
 
 
700e582
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import os
import pytesseract
from pdf2image import convert_from_path
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
import torch
from flask import Flask, request, jsonify, render_template
from threading import Thread
import gradio as gr

app = Flask(__name__)

# Path configuration
input_folder = 'input'  # one subdirectory per class label, each holding PDF files
model_folder = 'model'  # where the fine-tuned model and tokenizer are saved/loaded

# Convert a PDF to text using Tesseract OCR
def pdf_to_text(file_path):
    """Extract text from a PDF by rasterising each page and running OCR.

    Args:
        file_path: path to the PDF file.

    Returns:
        str: concatenated OCR output of all pages.
    """
    pages = convert_from_path(file_path)
    # lang='ara': the OCR model is configured for Arabic documents.
    return ''.join(pytesseract.image_to_string(page, lang='ara') for page in pages)

# Prepare the training data
def prepare_data():
    """Build a text-classification dataset from PDFs under ``input_folder``.

    Each immediate subdirectory of ``input_folder`` is one class label; every
    file inside it is OCR'd via ``pdf_to_text`` and added as an example.

    Returns:
        tuple: (``datasets.Dataset`` with 'text'/'label' columns, list of
        label names). Labels are sorted so the label->index mapping is
        deterministic across runs.
    """
    data = {'text': [], 'label': []}
    # sorted(): os.listdir order is filesystem-dependent; an unstable order
    # would silently scramble the label->index mapping between runs.
    # isdir(): skip stray files such as .DS_Store that would crash the scan.
    labels = sorted(
        entry for entry in os.listdir(input_folder)
        if os.path.isdir(os.path.join(input_folder, entry))
    )
    for label in labels:
        label_folder = os.path.join(input_folder, label)
        for file_name in os.listdir(label_folder):
            file_path = os.path.join(label_folder, file_name)
            data['text'].append(pdf_to_text(file_path))
            data['label'].append(label)
    return Dataset.from_dict(data), labels

# Load the saved model and tokenizer
def load_model():
    """Load the fine-tuned tokenizer and model from ``model_folder``.

    Returns:
        tuple: (tokenizer, model) as previously saved by ``train_model``.
    """
    # The unused local `model_name` from the original has been removed; both
    # artifacts are loaded from the saved-model directory.
    tokenizer = AutoTokenizer.from_pretrained(model_folder)
    model = AutoModelForSequenceClassification.from_pretrained(model_folder)
    return tokenizer, model

# Train the model
def train_model():
    """Fine-tune multilingual BERT as a document classifier.

    Builds a dataset from the PDFs under ``input_folder`` (one subfolder per
    label), trains for 3 epochs, and saves the resulting model and tokenizer
    into ``model_folder``.

    Returns:
        str: a human-readable success message.

    Side effects:
        Rebinds the module-level ``model`` and ``labels``. Relies on the
        module-level ``tokenizer`` already being initialised (see
        ``run_flask``).
    """
    global tokenizer, model, labels  # make these variables available throughout the program

    dataset, labels = prepare_data()
    train_test_split = dataset.train_test_split(test_size=0.2)
    # NOTE(review): uses the module-level tokenizer set up in run_flask();
    # if no saved model existed yet, this is the base mBERT tokenizer.
    tokenized_datasets = train_test_split.map(lambda x: tokenizer(x['text'], padding="max_length", truncation=True), batched=True)

    training_args = TrainingArguments(
        output_dir=model_folder,
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
    )

    # Fresh classification head sized to the number of discovered labels.
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=len(labels))

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['test'],
    )

    trainer.train()
    # Save the model and all required files
    model.save_pretrained(model_folder)
    tokenizer.save_pretrained(model_folder)
    return "Model trained and saved!"

# Classify documents
def classify_document(file_path):
    """OCR a PDF and predict its class with the fine-tuned model.

    Relies on the module-level ``tokenizer``, ``model`` and ``labels`` being
    initialised (see ``run_flask`` / ``train_model``).

    Args:
        file_path: path to the PDF file to classify.

    Returns:
        tuple: (predicted label name, extracted text).
    """
    text = pdf_to_text(file_path)
    inputs = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True)
    # Inference only: disable autograd to avoid building a gradient graph
    # (saves memory and time on every request).
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1)
    label = labels[predictions.item()]
    return label, text

# Web interface
@app.route('/')
def home():
    """Serve the main page of the web interface."""
    return render_template('index.html')

@app.route('/train', methods=['POST'])
def train():
    """Trigger (re)training and report the outcome as JSON."""
    return jsonify({'message': train_model()})

@app.route('/classify', methods=['POST'])
def classify():
    """Accept an uploaded PDF and return its predicted label plus OCR text.

    Returns:
        JSON ``{'label': ..., 'text': ...}`` on success, or
        ``{'error': ...}`` with HTTP 400 when no file was provided.
    """
    if 'file' not in request.files:
        return jsonify({'error': 'No file provided'}), 400

    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No file selected'}), 400

    # basename() strips any directory components a malicious client could
    # smuggle into the filename (path traversal, e.g. "../../etc/passwd").
    safe_name = os.path.basename(file.filename)
    # Ensure the upload directory exists — the first request would otherwise
    # fail with FileNotFoundError.
    os.makedirs('uploads', exist_ok=True)
    file_path = os.path.join('uploads', safe_name)
    file.save(file_path)

    label, text = classify_document(file_path)

    return jsonify({'label': label, 'text': text})

def run_flask():
    """Initialise the global model state and start the Flask server.

    If a saved model exists in ``model_folder`` it is loaded together with
    the label list; otherwise only a base tokenizer is prepared and the
    ``/train`` endpoint must be called before ``/classify`` works.
    """
    global tokenizer, model, labels
    if os.path.exists(model_folder):
        tokenizer, model = load_model()
        # sorted() gives a deterministic label->index mapping; guard against
        # the input folder not existing yet on a fresh deployment.
        labels = sorted(os.listdir(input_folder)) if os.path.exists(input_folder) else []
    else:
        tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
        model = None
        labels = []
    app.run(port=5000)

# Run Gradio
def run_gradio():
    """Expose a minimal Gradio text-classification UI on port 7860.

    Uses the module-level ``tokenizer``, ``model`` and ``labels`` shared with
    the Flask app.
    """
    def classify(text):
        """Classify raw text (no OCR step) and return the label name."""
        inputs = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True)
        # Inference only: disable autograd to avoid building a gradient graph.
        with torch.no_grad():
            outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=-1)
        return labels[predictions.item()]

    gr.Interface(fn=classify, inputs="text", outputs="text").launch(server_name="0.0.0.0", server_port=7860)

if __name__ == '__main__':
    # Run the Flask API and the Gradio UI side by side in separate threads.
    flask_thread = Thread(target=run_flask)
    gradio_thread = Thread(target=run_gradio)
    flask_thread.start()
    gradio_thread.start()