Upload 7 files
Browse files- app.py +124 -0
- config.json +13 -0
- input/Nouveau document texte.txt +0 -0
- model/Nouveau document texte.txt +0 -0
- requirements.txt +5 -0
- templates/index.html +81 -0
- uploads/Nouveau document texte.txt +0 -0
app.py
ADDED
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import pytesseract
|
3 |
+
from pdf2image import convert_from_path
|
4 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
|
5 |
+
from datasets import Dataset
|
6 |
+
import torch
|
7 |
+
from flask import Flask, request, jsonify, render_template
|
8 |
+
from threading import Thread
|
9 |
+
|
10 |
+
# Flask application object; the route decorators in this file register on it.
app = Flask(__name__)

# Path setup (relative to the working directory).
# model_folder: where the fine-tuned model and tokenizer are saved/loaded.
# input_folder: root of the training PDFs; it is listed to obtain the labels.
model_folder = 'model'
input_folder = 'input'
|
15 |
+
|
16 |
+
# وظيفة لتحويل PDF إلى نص باستخدام Tesseract
|
17 |
+
def pdf_to_text(file_path):
    """Return the OCR text of the PDF stored at ``file_path``.

    The PDF is converted to page images with ``convert_from_path`` and each
    image is run through Tesseract with the Arabic (``ara``) language data.
    The per-page strings are concatenated, in the order the images are
    returned, without any extra separator.
    """
    pages = convert_from_path(file_path)
    return ''.join(
        pytesseract.image_to_string(page, lang='ara')  # assuming Arabic language
        for page in pages
    )
|
23 |
+
|
24 |
+
# وظيفة لتحضير البيانات
|
25 |
+
def prepare_data():
    """Build the training dataset from the labelled PDFs under ``input_folder``.

    Each sub-directory of ``input_folder`` is one class whose name is the
    class label; every PDF inside it is converted to text with
    ``pdf_to_text``.

    Returns:
        A ``(dataset, labels)`` tuple. ``dataset`` is a ``Dataset`` with a
        ``text`` column and an integer ``label`` column. ``labels`` is the
        list of class names, with ``labels[i]`` naming class id ``i``.
    """
    # Only directories are classes. A stray file in the input folder (for
    # example a placeholder text file) would otherwise make the inner
    # os.listdir() raise NotADirectoryError.
    labels = [
        entry for entry in os.listdir(input_folder)
        if os.path.isdir(os.path.join(input_folder, entry))
    ]

    data = {'text': [], 'label': []}
    for label_id, label in enumerate(labels):
        label_folder = os.path.join(input_folder, label)
        for file_name in os.listdir(label_folder):
            # pdf_to_text() can only rasterise PDFs; skip anything else.
            if not file_name.lower().endswith('.pdf'):
                continue
            file_path = os.path.join(label_folder, file_name)
            data['text'].append(pdf_to_text(file_path))
            # Store the class id rather than its name: training needs
            # integer targets, and classify_document() maps a predicted id
            # back to a name through ``labels``.
            data['label'].append(label_id)
    return Dataset.from_dict(data), labels
|
36 |
+
|
37 |
+
# دالة لتحميل النموذج والمحول
|
38 |
+
def load_model():
    """Load the tokenizer and classifier previously saved in ``model_folder``.

    Returns:
        A ``(tokenizer, model)`` tuple, both loaded with ``from_pretrained``
        from ``model_folder``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_folder)
    model = AutoModelForSequenceClassification.from_pretrained(model_folder)
    return tokenizer, model
|
43 |
+
|
44 |
+
# دالة لتدريب النموذج
|
45 |
+
def train_model():
    """Fine-tune a multilingual BERT classifier on the PDFs in ``input_folder``.

    The data comes from ``prepare_data()`` and is split 80/20 into train and
    evaluation sets. The trained model and its tokenizer are saved to
    ``model_folder``, and the module-level ``tokenizer``, ``model`` and
    ``labels`` are replaced only once training has finished successfully.

    Returns:
        A human-readable status message (str).
    """
    global tokenizer, model, labels  # shared with classify_document()

    base_checkpoint = "bert-base-multilingual-cased"

    dataset, class_names = prepare_data()
    if not class_names or len(dataset) < 2:
        # train_test_split() needs at least one row on each side of the
        # split; report the problem instead of failing inside it.
        return "Not enough training data: add PDF files to the label folders first."

    # The model is re-initialised from the base checkpoint, so load the
    # matching tokenizer here rather than relying on a global that is only
    # set when the app is started through run_flask().
    new_tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)

    splits = dataset.train_test_split(test_size=0.2)
    tokenized_datasets = splits.map(
        lambda batch: new_tokenizer(batch['text'], padding="max_length", truncation=True),
        batched=True,
    )

    training_args = TrainingArguments(
        output_dir=model_folder,
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
    )

    new_model = AutoModelForSequenceClassification.from_pretrained(
        base_checkpoint, num_labels=len(class_names)
    )

    trainer = Trainer(
        model=new_model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['test'],
    )

    trainer.train()

    # Save the model and every file needed to reload it later.
    new_model.save_pretrained(model_folder)
    new_tokenizer.save_pretrained(model_folder)

    # Training leaves the model in train mode; switch to inference mode
    # before publishing it so dropout is disabled for classification.
    new_model.eval()
    tokenizer, model, labels = new_tokenizer, new_model, class_names
    return "Model trained and saved!"
|
76 |
+
|
77 |
+
# دالة لتصنيف الوثائق
|
78 |
+
def classify_document(file_path):
    """Classify the PDF at ``file_path`` with the current global model.

    Args:
        file_path: Path of the PDF to OCR and classify.

    Returns:
        A ``(label, text)`` tuple: the predicted class name taken from the
        global ``labels`` list, and the OCR text that was classified.

    Raises:
        RuntimeError: If no model has been loaded or trained yet.
    """
    if model is None:
        raise RuntimeError("No trained model is available; train the model first.")

    text = pdf_to_text(file_path)
    inputs = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True)
    # After in-process training the model may live on a GPU; the inputs
    # must be on the same device as the model's weights.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    # Inference only: disable dropout and skip gradient bookkeeping.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    predictions = torch.argmax(outputs.logits, dim=-1)
    label = labels[predictions.item()]
    return label, text
|
85 |
+
|
86 |
+
# واجهة ويب
|
87 |
+
@app.route('/')
def home():
    """Render the ``index.html`` template for the site root."""
    page = render_template('index.html')
    return page
|
90 |
+
|
91 |
+
@app.route('/train', methods=['POST'])
def train():
    """Run ``train_model()`` and return its status message as JSON."""
    return jsonify({'message': train_model()})
|
95 |
+
|
96 |
+
@app.route('/classify', methods=['POST'])
def classify():
    """Classify an uploaded PDF.

    Expects a multipart form with a ``file`` part. The upload is saved under
    ``uploads/`` and passed to ``classify_document()``.

    Returns:
        JSON ``{'label': ..., 'text': ...}`` on success. On failure, JSON
        ``{'error': ...}`` with status 400 (missing or invalid file) or 503
        (no model available yet).
    """
    if 'file' not in request.files:
        return jsonify({'error': 'No file provided'}), 400

    file = request.files['file']
    if not file.filename:
        return jsonify({'error': 'No file selected'}), 400

    # The filename is chosen by the client. Keep only its last path
    # component so names such as "../../app.py" cannot escape uploads/.
    safe_name = os.path.basename(file.filename.replace('\\', '/'))
    if safe_name in ('', '.', '..'):
        return jsonify({'error': 'Invalid file name'}), 400

    # Before any training has happened there is nothing to classify with;
    # answer with a clear error instead of an unhandled exception.
    if globals().get('model') is None:
        return jsonify({'error': 'Model is not trained yet'}), 503

    os.makedirs('uploads', exist_ok=True)
    file_path = os.path.join('uploads', safe_name)
    file.save(file_path)

    label, text = classify_document(file_path)

    return jsonify({'label': label, 'text': text})
|
111 |
+
|
112 |
+
def run_flask():
    """Initialise the global tokenizer, model and labels, then start Flask.

    If ``model_folder`` holds a saved model, it is loaded and the labels are
    rebuilt from the sub-directories of ``input_folder``. Otherwise only the
    base tokenizer is loaded, and ``model`` stays ``None`` until training.
    """
    global tokenizer, model, labels

    # The folder existing is not enough: it may be empty or hold only a
    # placeholder file, and from_pretrained() would then fail at startup.
    # save_pretrained() always writes config.json, so test for that file.
    if os.path.isfile(os.path.join(model_folder, 'config.json')):
        tokenizer, model = load_model()
        # NOTE(review): this assumes input_folder is unchanged since the
        # model was trained, so ids still map to the same names. Confirm.
        labels = [
            entry for entry in os.listdir(input_folder)
            if os.path.isdir(os.path.join(input_folder, entry))
        ]
    else:
        tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
        model = None
        labels = []
    app.run()
|
122 |
+
|
123 |
+
if __name__ == '__main__':
    # Start the model setup and the Flask server on a separate thread.
    server_thread = Thread(target=run_flask)
    server_thread.start()
|
config.json
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"attention_probs_dropout_prob": 0.1,
|
3 |
+
"hidden_act": "gelu",
|
4 |
+
"hidden_dropout_prob": 0.1,
|
5 |
+
"hidden_size": 768,
|
6 |
+
"initializer_range": 0.02,
|
7 |
+
"intermediate_size": 3072,
|
8 |
+
"max_position_embeddings": 512,
|
9 |
+
"num_attention_heads": 12,
|
10 |
+
"num_hidden_layers": 12,
|
11 |
+
"type_vocab_size": 2,
|
12 |
+
"vocab_size": 30522
|
13 |
+
}
|
input/Nouveau document texte.txt
ADDED
File without changes
|
model/Nouveau document texte.txt
ADDED
File without changes
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Flask
|
2 |
+
pytesseract
|
3 |
+
pdf2image
|
4 |
+
transformers==4.26.1
|
5 |
+
torch
datasets
|
templates/index.html
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<!DOCTYPE html>
|
2 |
+
<html lang="en">
|
3 |
+
<head>
|
4 |
+
<meta charset="UTF-8">
|
5 |
+
<title>Document Classifier</title>
|
6 |
+
<style>
|
7 |
+
body {
|
8 |
+
font-family: Arial, sans-serif;
|
9 |
+
margin: 40px;
|
10 |
+
}
|
11 |
+
h1 {
|
12 |
+
color: #333;
|
13 |
+
}
|
14 |
+
.container {
|
15 |
+
max-width: 600px;
|
16 |
+
margin: auto;
|
17 |
+
}
|
18 |
+
.button {
|
19 |
+
display: inline-block;
|
20 |
+
padding: 10px 20px;
|
21 |
+
font-size: 16px;
|
22 |
+
cursor: pointer;
|
23 |
+
text-align: center;
|
24 |
+
text-decoration: none;
|
25 |
+
outline: none;
|
26 |
+
color: #fff;
|
27 |
+
background-color: #4CAF50;
|
28 |
+
border: none;
|
29 |
+
border-radius: 5px;
|
30 |
+
box-shadow: 0 4px #999;
|
31 |
+
}
|
32 |
+
.button:hover {background-color: #45a049}
|
33 |
+
.button:active {
|
34 |
+
background-color: #3e8e41;
|
35 |
+
box-shadow: 0 2px #666;
|
36 |
+
transform: translateY(2px);
|
37 |
+
}
|
38 |
+
.result {
|
39 |
+
margin-top: 20px;
|
40 |
+
}
|
41 |
+
</style>
|
42 |
+
</head>
|
43 |
+
<body>
|
44 |
+
<div class="container">
|
45 |
+
<h1>Document Classifier</h1>
|
46 |
+
<form id="upload-form" enctype="multipart/form-data">
|
47 |
+
<label for="file">Select a PDF file to classify:</label>
|
48 |
+
<input type="file" id="file" name="file" accept="application/pdf" required>
|
49 |
+
<button type="submit" class="button">Classify Document</button>
|
50 |
+
</form>
|
51 |
+
<button id="train-button" class="button">Train Model</button>
|
52 |
+
<div id="result" class="result"></div>
|
53 |
+
</div>
|
54 |
+
|
55 |
+
<script>
|
56 |
+
// Send the selected PDF to /classify without reloading the page and show
// either the predicted label and extracted text, or the server's error.
document.getElementById('upload-form').onsubmit = async function(event) {
    event.preventDefault();
    const resultBox = document.getElementById('result');
    const fileInput = document.getElementById('file');
    const formData = new FormData();
    formData.append('file', fileInput.files[0]);

    try {
        const response = await fetch('/classify', {
            method: 'POST',
            body: formData
        });

        const result = await response.json();
        if (!response.ok) {
            // Error responses carry {error: ...} instead of label/text.
            resultBox.innerText = 'Error: ' + (result.error || response.statusText);
            return;
        }
        resultBox.innerText = 'Label: ' + result.label + '\nText: ' + result.text;
    } catch (err) {
        // Network failure, or a non-JSON body such as an HTML error page.
        resultBox.innerText = 'Error: ' + err.message;
    }
};
|
70 |
+
|
71 |
+
// Ask the server to train the model and report the outcome to the user.
document.getElementById('train-button').onclick = async function() {
    try {
        const response = await fetch('/train', {
            method: 'POST'
        });

        const result = await response.json();
        if (!response.ok) {
            alert('Training failed: ' + (result.error || result.message || response.statusText));
            return;
        }
        alert(result.message);
    } catch (err) {
        // Network failure, or a non-JSON body such as an HTML error page.
        alert('Training failed: ' + err.message);
    }
};
|
79 |
+
</script>
|
80 |
+
</body>
|
81 |
+
</html>
|
uploads/Nouveau document texte.txt
ADDED
File without changes
|