|
import gradio as gr |
|
import pandas as pd |
|
import numpy as np |
|
from transformers import pipeline |
|
from sklearn.impute import SimpleImputer |
|
from sklearn.ensemble import IsolationForest |
|
|
|
|
|
# Build the summarization pipeline once at import time so every request reuses
# the same loaded model instead of reloading it per call.
summarizer = pipeline(task="summarization", model="t5-small")
|
|
|
def analyze_data_quality(file):
    """Build a data-quality report for an uploaded CSV and return its summary.

    The report contains the total number of missing cells, the number of rows
    that IsolationForest labels as outliers, and a per-column breakdown of
    missing values. The report text is then passed through the module-level
    ``summarizer`` pipeline.

    Args:
        file: Either an object exposing the CSV path as ``.name`` or the path
            itself as a string.

    Returns:
        The ``summary_text`` string produced by ``summarizer`` for the report.

    Raises:
        ValueError: If ``file`` is ``None`` (nothing was uploaded).
    """
    if file is None:
        raise ValueError("No CSV file was provided.")

    # Accept both an upload wrapper with a ``.name`` attribute and a plain
    # path string; a ``str`` has no ``.name`` so it is used as-is.
    csv_path = getattr(file, "name", file)
    df = pd.read_csv(csv_path)

    missing_values = df.isnull().sum()
    columns_with_gaps = missing_values[missing_values > 0]
    # ``to_string()`` on an empty Series yields a non-empty placeholder
    # ("Series([], )"), so emptiness has to be tested on the Series itself;
    # otherwise the "no missing values" branch below can never be reached.
    missing_summary = "" if columns_with_gaps.empty else columns_with_gaps.to_string()

    # Columns that are entirely NaN carry no information and have no mean to
    # impute with, so they are excluded from outlier detection.
    numeric = df.select_dtypes(include=[np.number]).dropna(axis="columns", how="all")
    if numeric.empty:
        # No rows or no usable numeric columns: there is nothing to fit on.
        outlier_count = 0
    else:
        # Mean-fill the gaps before fitting: the data being inspected is
        # expected to contain NaN, which scikit-learn estimators commonly
        # reject. Mean imputation mirrors what clean_data does before its own
        # outlier pass.
        numeric = numeric.fillna(numeric.mean())
        # contamination=0.05 sets the share of rows the model labels as
        # outliers; random_state pins the result for reproducibility.
        clf = IsolationForest(contamination=0.05, random_state=42)
        outliers = clf.fit_predict(numeric)
        outlier_count = int((outliers == -1).sum())

    report = "🔍 تحليل جودة البيانات:\n\n"
    report += f"📌 عدد القيم الناقصة: {missing_values.sum()}\n"
    report += f"📌 عدد القيم الشاذة: {outlier_count}\n"
    if missing_summary:
        report += "\n📊 تفاصيل القيم الناقصة:\n" + missing_summary
    else:
        report += "\n✅ لا توجد قيم ناقصة."

    # NOTE(review): t5-small is trained mainly on English text; confirm it
    # produces usable output for this Arabic report, or return ``report``
    # directly instead of summarizing it.
    summary = summarizer(report, max_length=100, do_sample=False)[0]['summary_text']

    return summary
|
|
|
def clean_data(file):
    """Impute missing numeric values, drop outlier rows, and save the result.

    Numeric columns are mean-imputed with ``SimpleImputer``; rows that
    ``IsolationForest`` labels as outliers (-1) are then removed. The cleaned
    frame is written to ``cleaned_data.csv`` in the current working directory.

    Args:
        file: Either an object exposing the CSV path as ``.name`` or the path
            itself as a string.

    Returns:
        The path of the written CSV file (``"cleaned_data.csv"``).

    Raises:
        ValueError: If ``file`` is ``None`` (nothing was uploaded).
    """
    if file is None:
        raise ValueError("No CSV file was provided.")

    # Accept both an upload wrapper with a ``.name`` attribute and a plain
    # path string; a ``str`` has no ``.name`` so it is used as-is.
    csv_path = getattr(file, "name", file)
    df = pd.read_csv(csv_path)

    numeric_cols = df.select_dtypes(include=[np.number]).columns
    # SimpleImputer(strategy="mean") discards columns that are entirely NaN,
    # which would make its output narrower than the columns being assigned to.
    # Restrict the work to columns that have at least one observed value.
    usable_cols = [col for col in numeric_cols if df[col].notna().any()]

    if usable_cols:
        imputer = SimpleImputer(strategy="mean")
        df[usable_cols] = imputer.fit_transform(df[usable_cols])

        # contamination=0.05 sets the share of rows the model labels as
        # outliers; random_state pins the result for reproducibility.
        clf = IsolationForest(contamination=0.05, random_state=42)
        outliers = clf.fit_predict(df[usable_cols])
        df_cleaned = df[outliers != -1]
    else:
        # No usable numeric data: nothing to impute or to fit a detector on,
        # so the frame is written back unchanged.
        df_cleaned = df

    # NOTE(review): the output name is fixed, so overlapping requests would
    # overwrite each other's file; consider a per-request temporary path if
    # the app is served to several users at once.
    cleaned_file = "cleaned_data.csv"
    df_cleaned.to_csv(cleaned_file, index=False)
    return cleaned_file
|
|
|
|
|
# Assemble the Gradio interface: one upload widget feeds two actions, a
# quality report shown as text and a cleaned CSV offered for download.
with gr.Blocks() as app:
    gr.Markdown(value="## 🛠️ محلل جودة البيانات")

    # Input and output widgets.
    file_input = gr.File(label="📂 رفع ملف CSV")
    analysis_output = gr.Textbox(label="📊 تقرير جودة البيانات")
    cleaned_file_output = gr.File(label="✅ تحميل البيانات المنظفة")

    # Action buttons.
    analyze_button = gr.Button(value="🔍 تحليل الجودة")
    clean_button = gr.Button(value="🧹 تنظيف البيانات")

    # Wire each button to its handler; both read the same uploaded file.
    analyze_button.click(
        fn=analyze_data_quality,
        inputs=file_input,
        outputs=analysis_output,
    )
    clean_button.click(
        fn=clean_data,
        inputs=file_input,
        outputs=cleaned_file_output,
    )

# Start the web server for the interface defined above.
app.launch()
|
|