|
from bs4 import BeautifulSoup |
|
import pandas as pd |
|
from collections import Counter |
|
from io import BytesIO |
|
from docx import Document |
|
import gradio as gr |
|
|
|
def extract_titles_and_hashtags(file):
    """Parse an uploaded HTML export and extract video titles and their hashtags.

    Args:
        file: uploaded file object (e.g. from ``gr.File``); it may expose
            ``read()`` directly, or only a ``.name`` path on disk.

    Returns:
        tuple: ``(df_titles, df_hashtags)`` on success, where ``df_titles``
        has columns ``Title``/``Hashtags`` and ``df_hashtags`` has
        ``Hashtag``/``Count`` sorted by descending count.
        On failure returns ``(error_message, None)`` — a 2-tuple, matching the
        success arity so callers can unpack safely and detect errors with a
        single ``isinstance(..., str)`` check.  (Previously error paths
        returned a 3-tuple, which made ``a, b = extract_...(file)`` raise
        ``ValueError`` on every error.)
    """
    try:
        # Reject oversized uploads (> 50 MB) before reading anything.
        # getattr: path-like inputs may not expose a .size attribute.
        if getattr(file, "size", 0) and file.size > 50 * 1024 * 1024:
            return "الملف كبير جدًا. الرجاء رفع ملف أصغر.", None

        if hasattr(file, 'read'):
            content = file.read().decode('utf-8')
        else:
            # Context manager so the handle is closed even if read() fails
            # (the original leaked the file object).
            with open(file.name, 'r', encoding='utf-8') as fh:
                content = fh.read()
    except Exception as e:
        print(f"خطأ أثناء قراءة الملف: {str(e)}")
        return f"خطأ أثناء قراءة الملف: {str(e)}", None

    try:
        soup = BeautifulSoup(content, 'lxml')
    except Exception as e:
        print(f"خطأ أثناء تحليل HTML: {str(e)}")
        return f"خطأ أثناء تحليل HTML: {str(e)}", None

    data = []
    hashtags_counter = Counter()
    max_items = 1000  # hard cap so pathological pages cannot stall the app

    # Description containers carry generated class names like
    # "css-...-DivDesContainer"; match on both markers.
    desc_containers = soup.find_all(
        'div',
        class_=lambda value: value and 'css-' in value and 'DivDesContainer' in value,
    )

    if not desc_containers:
        return "لم يتم العثور على أي بيانات مطابقة.", None

    for container in desc_containers[:max_items]:
        try:
            title = container.get('aria-label', 'بدون عنوان')

            hashtags = [
                tag.get_text(strip=True)
                for tag in container.find_all('a')
                if tag.get_text(strip=True).startswith('#')
            ]
            hashtags_counter.update(hashtags)

            data.append({"Title": title, "Hashtags": ", ".join(hashtags)})
        except Exception as e:
            # Skip malformed items rather than aborting the whole extraction.
            print(f"خطأ في معالجة عنصر: {str(e)}")
            continue

    df_titles = pd.DataFrame(data)
    df_hashtags = pd.DataFrame(hashtags_counter.most_common(), columns=["Hashtag", "Count"])

    return df_titles, df_hashtags
|
|
|
|
|
def gradio_interface(file, format_choice):
    """Gradio callback: extract titles/hashtags from an upload and build a download.

    NOTE(review): a second ``gradio_interface`` defined later in this file
    shadows this one at runtime — the two should be consolidated.

    Args:
        file: the uploaded file from ``gr.File`` (may be None).
        format_choice: "TXT", "Excel" or "Word" from the radio input.

    Returns:
        tuple: ``(html_or_error_message, download)`` where ``download`` is
        ``(file_name, buffer)`` on success or ``None`` on error.
    """
    if not file:
        return "الرجاء رفع ملف.", None

    try:
        result = extract_titles_and_hashtags(file)

        # Error paths return a message string first (historically as a
        # 3-tuple); inspect before unpacking so the specific message is
        # surfaced instead of an unpack ValueError.
        if isinstance(result[0], str):
            return result[0], None

        df_titles, df_hashtags = result[0], result[1]
        if df_titles is None or df_hashtags is None:
            return "لم يتم استخراج أي بيانات.", None

        titles_html = df_titles.to_html(index=False) if not df_titles.empty else "لا توجد عناوين مستخرجة."
        hashtags_html = df_hashtags.to_html(index=False) if not df_hashtags.empty else "لا توجد هاشتاغات مستخرجة."

        buffer, file_name = create_downloadable_files(df_titles, df_hashtags, format_choice)

        return titles_html + "<br><br>" + hashtags_html, (file_name, buffer)

    except Exception as e:
        return f"خطأ غير متوقع: {str(e)}", None
|
|
|
def create_downloadable_files(df_titles, df_hashtags, format_choice):
    """Serialize the extracted results into an in-memory file for download.

    Args:
        df_titles: DataFrame with ``Title``/``Hashtags`` columns.
        df_hashtags: DataFrame with ``Hashtag``/``Count`` columns.
        format_choice: "Excel", "Word" or "TXT".

    Returns:
        tuple: ``(buffer, file_name)`` where ``buffer`` is a ``BytesIO``
        rewound to position 0.

    Raises:
        ValueError: if ``format_choice`` is not a supported format.
            (Previously an unknown choice — e.g. the radio left unselected —
            fell through and returned ``None``, crashing callers that unpack
            the result with an opaque TypeError.)
    """
    if format_choice == "Excel":
        buffer = BytesIO()
        with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
            df_titles.to_excel(writer, index=False, sheet_name='Titles')
            df_hashtags.to_excel(writer, index=False, sheet_name='Hashtags')
        buffer.seek(0)
        return buffer, "titles_and_hashtags.xlsx"

    if format_choice == "Word":
        buffer = BytesIO()
        doc = Document()
        doc.add_heading("العناوين والهاشتاغات", level=1)
        for _, row in df_titles.iterrows():
            doc.add_paragraph(f"Title: {row['Title']}\nHashtags: {row['Hashtags']}\n")
        doc.add_heading("الهاشتاغات وتكرارها", level=1)
        for _, row in df_hashtags.iterrows():
            doc.add_paragraph(f"{row['Hashtag']}: {row['Count']}")
        doc.save(buffer)
        buffer.seek(0)
        return buffer, "titles_and_hashtags.docx"

    if format_choice == "TXT":
        buffer = BytesIO()
        # Collect pieces and join once — avoids quadratic str += accumulation.
        parts = ["العناوين والهاشتاغات:\n"]
        for _, row in df_titles.iterrows():
            parts.append(f"Title: {row['Title']}\nHashtags: {row['Hashtags']}\n\n")
        parts.append("الهاشتاغات وتكرارها:\n")
        for _, row in df_hashtags.iterrows():
            parts.append(f"{row['Hashtag']}: {row['Count']}\n")
        buffer.write("".join(parts).encode('utf-8'))
        buffer.seek(0)
        return buffer, "titles_and_hashtags.txt"

    raise ValueError(f"صيغة غير مدعومة: {format_choice}")
|
|
|
|
|
def gradio_interface(file, format_choice):
    """Gradio callback: extract titles/hashtags from an upload and build a download.

    This is the definition actually bound at runtime (it shadows the earlier
    ``gradio_interface`` in this file).

    Args:
        file: the uploaded file from ``gr.File`` (may be None).
        format_choice: "TXT", "Excel" or "Word" from the radio input.

    Returns:
        tuple: ``(html_or_error_message, download)`` where ``download`` is
        ``(file_name, buffer)`` on success or ``None`` on error.
    """
    # Guard against no upload — extract would otherwise fail opaquely.
    if not file:
        return "الرجاء رفع ملف.", None

    try:
        result = extract_titles_and_hashtags(file)

        # Error paths return a message string first (historically as a
        # 3-tuple); inspect before unpacking so the specific message is
        # surfaced instead of an unpack ValueError.
        if isinstance(result[0], str):
            return result[0], None

        df_titles, df_hashtags = result[0], result[1]
        if df_titles is None or df_hashtags is None:
            return "لم يتم استخراج أي بيانات.", None

        titles_html = df_titles.to_html(index=False) if not df_titles.empty else "لا توجد عناوين مستخرجة."
        hashtags_html = df_hashtags.to_html(index=False) if not df_hashtags.empty else "لا توجد هاشتاغات مستخرجة."

        buffer, file_name = create_downloadable_files(df_titles, df_hashtags, format_choice)

        return titles_html + "<br><br>" + hashtags_html, (file_name, buffer)

    except Exception as e:
        # Final safety net so the UI always gets a message, never a traceback.
        return f"خطأ غير متوقع: {str(e)}", None
|
|
|
|
|
# Build the Gradio UI: an HTML file upload plus a download-format selector in;
# rendered result tables plus a downloadable results file out.
interface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.File(label="ارفع ملف HTML"),
        gr.Radio(choices=["TXT", "Excel", "Word"], label="حدد صيغة التنزيل")
    ],
    outputs=[
        gr.HTML(label="العناوين والهاشتاغات المستخرجة"),
        gr.File(label="تنزيل النتائج")
    ],
    title="استخراج العناوين والهاشتاغات",
    description="ارفع ملف HTML لتحليل العناوين والهاشتاغات مع إمكانية تنزيل النتائج كملف TXT أو Excel أو Word.",
    allow_flagging="never"
)


# Launch only when executed as a script, so importing this module
# (e.g. for testing) does not start a web server as a side effect.
if __name__ == "__main__":
    interface.launch()
|
|