madji05 committed on
Commit
b4b32ed
1 Parent(s): 88699f3

Upload 7 files

Browse files
app.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pytesseract
3
+ from pdf2image import convert_from_path
4
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
5
+ from datasets import Dataset
6
+ import torch
7
+ from flask import Flask, request, jsonify, render_template
8
+ from threading import Thread
9
+
10
+ app = Flask(__name__)
11
+
12
+ # Path setup
13
+ input_folder = 'input'  # listed by prepare_data(); each entry is treated as a label folder of PDFs
14
+ model_folder = 'model'  # Trainer output_dir; the fine-tuned model and tokenizer are saved to / loaded from here
15
+
16
+ # وظيفة لتحويل PDF إلى نص باستخدام Tesseract
17
def pdf_to_text(file_path):
    """Extract the text of a PDF by running Tesseract OCR on its pages.

    The PDF at ``file_path`` is converted to images with
    ``convert_from_path`` and each image is passed to
    ``pytesseract.image_to_string`` with ``lang='ara'``.

    Returns:
        str: the OCR output of all images, concatenated in the order they
        were returned and without any separator.
    """
    pages = convert_from_path(file_path)
    # assuming Arabic language
    page_texts = [pytesseract.image_to_string(page, lang='ara') for page in pages]
    return ''.join(page_texts)
23
+
24
+ # وظيفة لتحضير البيانات
25
def prepare_data():
    """Build the training dataset from the PDFs stored under ``input_folder``.

    Every sub-directory of ``input_folder`` is one class: the directory name
    is the label and every ``.pdf`` file inside it is one example whose text
    is obtained with ``pdf_to_text``.

    Returns:
        tuple: ``(dataset, labels)``. ``dataset`` is a ``Dataset`` with a
        ``text`` column (OCR output) and an integer ``label`` column.
        ``labels`` is the list of class names, in the order returned by
        ``os.listdir``, such that ``labels[i]`` names class id ``i``.
    """
    # Only directories are classes. A stray file placed directly in
    # input_folder would otherwise make the inner os.listdir() raise
    # NotADirectoryError.
    labels = [
        entry for entry in os.listdir(input_folder)
        if os.path.isdir(os.path.join(input_folder, entry))
    ]

    data = {'text': [], 'label': []}
    for label_id, label in enumerate(labels):
        label_folder = os.path.join(input_folder, label)
        for file_name in os.listdir(label_folder):
            # pdf_to_text() can only convert PDF documents.
            if not file_name.lower().endswith('.pdf'):
                continue
            file_path = os.path.join(label_folder, file_name)
            data['text'].append(pdf_to_text(file_path))
            # Store the class index rather than its name: the sequence
            # classification head is trained against integer class ids.
            data['label'].append(label_id)
    return Dataset.from_dict(data), labels
36
+
37
+ # دالة لتحميل النموذج والمحول
38
def load_model():
    """Load the saved tokenizer and classifier from ``model_folder``.

    Returns:
        tuple: ``(tokenizer, model)`` as produced by
        ``AutoTokenizer.from_pretrained(model_folder)`` and
        ``AutoModelForSequenceClassification.from_pretrained(model_folder)``.
    """
    # Both objects come from model_folder; the base checkpoint name is not
    # needed here (the previously unused ``model_name`` local was removed).
    tokenizer = AutoTokenizer.from_pretrained(model_folder)
    model = AutoModelForSequenceClassification.from_pretrained(model_folder)
    return tokenizer, model
43
+
44
+ # دالة لتدريب النموذج
45
def train_model():
    """Fine-tune ``bert-base-multilingual-cased`` on the labelled PDFs.

    The dataset returned by ``prepare_data()`` is split 80/20 into train and
    test parts, tokenized, and used to train a sequence classifier for three
    epochs. The trained model and its tokenizer are saved to
    ``model_folder``. Only after everything succeeded are the module-level
    ``tokenizer``, ``model`` and ``labels`` replaced, so a failed training
    run does not leave the application with a half-initialised model.

    Returns:
        str: the confirmation message ``"Model trained and saved!"``.
    """
    global tokenizer, model, labels

    base_model_name = "bert-base-multilingual-cased"

    dataset, new_labels = prepare_data()
    label2id = {name: idx for idx, name in enumerate(new_labels)}
    id2label = {idx: name for name, idx in label2id.items()}

    # Load the tokenizer that belongs to the base checkpoint instead of
    # relying on a global that is only set when run_flask() was called.
    new_tokenizer = AutoTokenizer.from_pretrained(base_model_name)

    def _encode(batch):
        """Tokenize a batch and make sure its labels are integer class ids."""
        encoded = dict(new_tokenizer(batch['text'], padding="max_length", truncation=True))
        # Class names are mapped to their id; values that are already ids
        # are not keys of label2id and therefore pass through unchanged.
        encoded['label'] = [label2id.get(value, value) for value in batch['label']]
        return encoded

    splits = dataset.train_test_split(test_size=0.2)
    tokenized_datasets = splits.map(_encode, batched=True)

    training_args = TrainingArguments(
        output_dir=model_folder,
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
    )

    # Storing the label names in the config keeps the id <-> name mapping
    # together with the saved weights.
    new_model = AutoModelForSequenceClassification.from_pretrained(
        base_model_name,
        num_labels=len(new_labels),
        id2label=id2label,
        label2id=label2id,
    )

    trainer = Trainer(
        model=new_model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['test'],
    )

    trainer.train()

    # Save the model and all files needed to reload it
    new_model.save_pretrained(model_folder)
    new_tokenizer.save_pretrained(model_folder)

    # Publish the new state only now that training and saving succeeded.
    tokenizer, model, labels = new_tokenizer, new_model, new_labels
    return "Model trained and saved!"
76
+
77
+ # دالة لتصنيف الوثائق
78
def classify_document(file_path):
    """Classify one PDF with the currently loaded model.

    The document is converted to text with ``pdf_to_text``, tokenized with
    the module-level ``tokenizer`` and passed through the module-level
    ``model``; the class with the highest logit is looked up in ``labels``.

    Args:
        file_path: path of the PDF to classify.

    Returns:
        tuple: ``(label, text)`` - the predicted class name and the OCR text.

    Raises:
        RuntimeError: if no trained model (or no label list) is available.
    """
    if model is None or not labels:
        # Without this guard the call below fails with an opaque
        # "'NoneType' object is not callable".
        raise RuntimeError("No trained model is available; train the model first.")

    text = pdf_to_text(file_path)
    inputs = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True)
    # Training may have moved the model to another device (e.g. a GPU);
    # the inputs have to live on the same device as the weights.
    inputs = inputs.to(model.device)

    model.eval()  # disable dropout for inference
    with torch.no_grad():  # no gradients are needed for a prediction
        outputs = model(**inputs)

    predicted_id = torch.argmax(outputs.logits, dim=-1).item()
    return labels[predicted_id], text
85
+
86
+ # واجهة ويب
87
@app.route('/')
def home():
    """Serve the web UI by rendering the ``index.html`` template."""
    page = render_template('index.html')
    return page
90
+
91
@app.route('/train', methods=['POST'])
def train():
    """Run ``train_model()`` and return its message as ``{"message": ...}`` JSON.

    The call is synchronous: the response is sent once ``train_model()``
    has returned.
    """
    payload = {'message': train_model()}
    return jsonify(payload)
95
+
96
@app.route('/classify', methods=['POST'])
def classify():
    """Classify the PDF uploaded in the ``file`` form field.

    The upload is stored in the ``uploads`` directory and passed to
    ``classify_document``.

    Returns:
        A JSON response ``{"label": ..., "text": ...}`` on success, or
        ``{"error": ...}`` with status 400 when no usable file was sent.
    """
    upload = request.files.get('file')
    if upload is None:
        return jsonify({'error': 'No file provided'}), 400

    if not upload.filename:
        return jsonify({'error': 'No file selected'}), 400

    # The filename is supplied by the client. Keep only its last path
    # component (for both "/" and "\" separators) so that a name such as
    # "../../app.py" cannot be written outside the uploads directory.
    safe_name = os.path.basename(upload.filename.replace('\\', '/'))
    if safe_name in ('', '.', '..'):
        return jsonify({'error': 'Invalid file name'}), 400

    # Saving fails if the target directory is missing.
    os.makedirs('uploads', exist_ok=True)
    file_path = os.path.join('uploads', safe_name)
    upload.save(file_path)

    label, text = classify_document(file_path)

    return jsonify({'label': label, 'text': text})
111
+
112
def run_flask():
    """Initialise the global model state and start the Flask server.

    If ``model_folder`` contains a loadable saved model, the module-level
    ``tokenizer`` and ``model`` are loaded from it and ``labels`` is set to
    the sub-directories of ``input_folder``. Otherwise the base tokenizer is
    loaded, ``model`` is ``None`` and ``labels`` is empty, so the server
    still starts and a model can be trained through the ``/train`` route.
    """
    # Declared once at function level: a ``global`` statement applies to the
    # whole function, wherever it is written.
    global tokenizer, model, labels

    loaded = None
    if os.path.exists(model_folder):
        try:
            loaded = load_model()
        except (OSError, ValueError):
            # The folder exists but holds no usable saved model (for example
            # only a placeholder file). Fall back to the untrained state
            # instead of letting the server thread die.
            loaded = None

    if loaded is not None:
        tokenizer, model = loaded
        # Only directories are classes; plain files in input_folder are not.
        labels = [
            entry for entry in os.listdir(input_folder)
            if os.path.isdir(os.path.join(input_folder, entry))
        ]
    else:
        tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
        model = None
        labels = []
    app.run()
122
+
123
if __name__ == '__main__':
    # Run the initialisation and the Flask server on a separate thread.
    server_thread = Thread(target=run_flask)
    server_thread.start()
config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attention_probs_dropout_prob": 0.1,
3
+ "hidden_act": "gelu",
4
+ "hidden_dropout_prob": 0.1,
5
+ "hidden_size": 768,
6
+ "initializer_range": 0.02,
7
+ "intermediate_size": 3072,
8
+ "max_position_embeddings": 512,
9
+ "num_attention_heads": 12,
10
+ "num_hidden_layers": 12,
11
+ "type_vocab_size": 2,
12
+ "vocab_size": 30522
13
+ }
input/Nouveau document texte.txt ADDED
File without changes
model/Nouveau document texte.txt ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ Flask
2
+ pytesseract
3
+ pdf2image
4
+ transformers==4.26.1
5
+ torch
6
+ datasets
templates/index.html ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <title>Document Classifier</title>
6
+ <style>
7
+ body {
8
+ font-family: Arial, sans-serif;
9
+ margin: 40px;
10
+ }
11
+ h1 {
12
+ color: #333;
13
+ }
14
+ .container {
15
+ max-width: 600px;
16
+ margin: auto;
17
+ }
18
+ .button {
19
+ display: inline-block;
20
+ padding: 10px 20px;
21
+ font-size: 16px;
22
+ cursor: pointer;
23
+ text-align: center;
24
+ text-decoration: none;
25
+ outline: none;
26
+ color: #fff;
27
+ background-color: #4CAF50;
28
+ border: none;
29
+ border-radius: 5px;
30
+ box-shadow: 0 4px #999;
31
+ }
32
+ .button:hover {background-color: #45a049}
33
+ .button:active {
34
+ background-color: #3e8e41;
35
+ box-shadow: 0 2px #666;
36
+ transform: translateY(2px);
37
+ }
38
+ .result {
39
+ margin-top: 20px;
40
+ }
41
+ </style>
42
+ </head>
43
+ <body>
44
+ <div class="container">
45
+ <h1>Document Classifier</h1>
46
+ <form id="upload-form" enctype="multipart/form-data">
47
+ <label for="file">Select a PDF file to classify:</label>
48
+ <input type="file" id="file" name="file" accept="application/pdf" required>
49
+ <button type="submit" class="button">Classify Document</button>
50
+ </form>
51
+ <button id="train-button" class="button">Train Model</button>
52
+ <div id="result" class="result"></div>
53
+ </div>
54
+
55
+ <script>
56
// Submit the selected PDF to /classify and show the predicted label and the
// extracted text, or an error message when the request fails.
document.getElementById('upload-form').onsubmit = async function(event) {
    event.preventDefault();
    const resultBox = document.getElementById('result');
    const fileInput = document.getElementById('file');
    const formData = new FormData();
    formData.append('file', fileInput.files[0]);

    try {
        const response = await fetch('/classify', {
            method: 'POST',
            body: formData
        });

        const result = await response.json();
        if (!response.ok) {
            // The server answers 400 with {"error": ...}; without this check
            // the page would display "Label: undefined".
            resultBox.innerText = 'Error: ' + (result.error || response.statusText);
            return;
        }
        resultBox.innerText = 'Label: ' + result.label + '\nText: ' + result.text;
    } catch (err) {
        // Network failure or a non-JSON body (e.g. an HTML error page).
        resultBox.innerText = 'Error: ' + err.message;
    }
};
70
+
71
// Ask the server to train the model and report the outcome in an alert.
document.getElementById('train-button').onclick = async function() {
    try {
        const response = await fetch('/train', {
            method: 'POST'
        });

        if (!response.ok) {
            // An error page is not JSON; report the HTTP status instead of
            // failing inside response.json().
            alert('Training failed: ' + response.status + ' ' + response.statusText);
            return;
        }

        const result = await response.json();
        alert(result.message);
    } catch (err) {
        // Network failure or an unparsable response body.
        alert('Training failed: ' + err.message);
    }
};
79
+ </script>
80
+ </body>
81
+ </html>
uploads/Nouveau document texte.txt ADDED
File without changes