|
from bs4 import BeautifulSoup |
|
import pandas as pd |
|
from collections import Counter |
|
from io import BytesIO |
|
from docx import Document |
|
import gradio as gr |
|
|
|
def extract_titles_and_hashtags(file):
    """Parse an uploaded HTML export and extract video titles and their hashtags.

    Args:
        file: uploaded file object (e.g. from ``gr.File``); it may expose
            ``read()`` directly, or only a ``.name`` path on disk.

    Returns:
        tuple: ``(df_titles, df_hashtags)`` on success, where ``df_titles``
        has columns ``Title``/``Hashtags`` and ``df_hashtags`` has
        ``Hashtag``/``Count`` sorted by descending count.
        On failure returns ``(error_message, None)`` — a 2-tuple, matching the
        success arity so callers can unpack safely and detect errors with a
        single ``isinstance(..., str)`` check.  (Previously error paths
        returned a 3-tuple, which made ``a, b = extract_...(file)`` raise
        ``ValueError`` on every error.)
    """
    try:
        # Reject oversized uploads (> 50 MB) before reading anything.
        # getattr: path-like inputs may not expose a .size attribute.
        if getattr(file, "size", 0) and file.size > 50 * 1024 * 1024:
            return "الملف كبير جدًا. الرجاء رفع ملف أصغر.", None

        if hasattr(file, 'read'):
            content = file.read().decode('utf-8')
        else:
            # Context manager so the handle is closed even if read() fails
            # (the original leaked the file object).
            with open(file.name, 'r', encoding='utf-8') as fh:
                content = fh.read()
    except Exception as e:
        print(f"خطأ أثناء قراءة الملف: {str(e)}")
        return f"خطأ أثناء قراءة الملف: {str(e)}", None

    try:
        soup = BeautifulSoup(content, 'lxml')
    except Exception as e:
        print(f"خطأ أثناء تحليل HTML: {str(e)}")
        return f"خطأ أثناء تحليل HTML: {str(e)}", None

    data = []
    hashtags_counter = Counter()
    max_items = 1000  # hard cap so pathological pages cannot stall the app

    # Description containers carry generated class names like
    # "css-...-DivDesContainer"; match on both markers.
    desc_containers = soup.find_all(
        'div',
        class_=lambda value: value and 'css-' in value and 'DivDesContainer' in value,
    )

    if not desc_containers:
        return "لم يتم العثور على أي بيانات مطابقة.", None

    for container in desc_containers[:max_items]:
        try:
            title = container.get('aria-label', 'بدون عنوان')

            hashtags = [
                tag.get_text(strip=True)
                for tag in container.find_all('a')
                if tag.get_text(strip=True).startswith('#')
            ]
            hashtags_counter.update(hashtags)

            data.append({"Title": title, "Hashtags": ", ".join(hashtags)})
        except Exception as e:
            # Skip malformed items rather than aborting the whole extraction.
            print(f"خطأ في معالجة عنصر: {str(e)}")
            continue

    df_titles = pd.DataFrame(data)
    df_hashtags = pd.DataFrame(hashtags_counter.most_common(), columns=["Hashtag", "Count"])

    return df_titles, df_hashtags
|
|
|
|
|
def gradio_interface(file, format_choice):
    """Gradio callback: extract titles/hashtags from an upload and build a download.

    NOTE(review): a second ``gradio_interface`` defined later in this file
    shadows this one at runtime — the two should be consolidated.

    Args:
        file: the uploaded file from ``gr.File`` (may be None).
        format_choice: "TXT", "Excel" or "Word" from the radio input.

    Returns:
        tuple: ``(html_or_error_message, download)`` where ``download`` is
        ``(file_name, buffer)`` on success or ``None`` on error.
    """
    if not file:
        return "الرجاء رفع ملف.", None

    try:
        result = extract_titles_and_hashtags(file)

        # Error paths return a message string first (historically as a
        # 3-tuple); inspect before unpacking so the specific message is
        # surfaced instead of an unpack ValueError.
        if isinstance(result[0], str):
            return result[0], None

        df_titles, df_hashtags = result[0], result[1]
        if df_titles is None or df_hashtags is None:
            return "لم يتم استخراج أي بيانات.", None

        titles_html = df_titles.to_html(index=False) if not df_titles.empty else "لا توجد عناوين مستخرجة."
        hashtags_html = df_hashtags.to_html(index=False) if not df_hashtags.empty else "لا توجد هاشتاغات مستخرجة."

        buffer, file_name = create_downloadable_files(df_titles, df_hashtags, format_choice)

        return titles_html + "<br><br>" + hashtags_html, (file_name, buffer)

    except Exception as e:
        return f"خطأ غير متوقع: {str(e)}", None
|
|
|
def create_downloadable_files(df_titles, df_hashtags, format_choice):
    """Serialize the extracted results into an in-memory file for download.

    Args:
        df_titles: DataFrame with ``Title``/``Hashtags`` columns.
        df_hashtags: DataFrame with ``Hashtag``/``Count`` columns.
        format_choice: "Excel", "Word" or "TXT".

    Returns:
        tuple: ``(buffer, file_name)`` where ``buffer`` is a ``BytesIO``
        rewound to position 0.

    Raises:
        ValueError: if ``format_choice`` is not a supported format.
            (Previously an unknown choice — e.g. the radio left unselected —
            fell through and returned ``None``, crashing callers that unpack
            the result with an opaque TypeError.)
    """
    if format_choice == "Excel":
        buffer = BytesIO()
        with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
            df_titles.to_excel(writer, index=False, sheet_name='Titles')
            df_hashtags.to_excel(writer, index=False, sheet_name='Hashtags')
        buffer.seek(0)
        return buffer, "titles_and_hashtags.xlsx"

    if format_choice == "Word":
        buffer = BytesIO()
        doc = Document()
        doc.add_heading("العناوين والهاشتاغات", level=1)
        for _, row in df_titles.iterrows():
            doc.add_paragraph(f"Title: {row['Title']}\nHashtags: {row['Hashtags']}\n")
        doc.add_heading("الهاشتاغات وتكرارها", level=1)
        for _, row in df_hashtags.iterrows():
            doc.add_paragraph(f"{row['Hashtag']}: {row['Count']}")
        doc.save(buffer)
        buffer.seek(0)
        return buffer, "titles_and_hashtags.docx"

    if format_choice == "TXT":
        buffer = BytesIO()
        # Collect pieces and join once — avoids quadratic str += accumulation.
        parts = ["العناوين والهاشتاغات:\n"]
        for _, row in df_titles.iterrows():
            parts.append(f"Title: {row['Title']}\nHashtags: {row['Hashtags']}\n\n")
        parts.append("الهاشتاغات وتكرارها:\n")
        for _, row in df_hashtags.iterrows():
            parts.append(f"{row['Hashtag']}: {row['Count']}\n")
        buffer.write("".join(parts).encode('utf-8'))
        buffer.seek(0)
        return buffer, "titles_and_hashtags.txt"

    raise ValueError(f"صيغة غير مدعومة: {format_choice}")
|
|
|
|
|
def gradio_interface(file, format_choice):
    """Gradio callback: extract titles/hashtags from an upload and build a download.

    This is the definition actually bound at runtime (it shadows the earlier
    ``gradio_interface`` in this file).

    Args:
        file: the uploaded file from ``gr.File`` (may be None).
        format_choice: "TXT", "Excel" or "Word" from the radio input.

    Returns:
        tuple: ``(html_or_error_message, download)`` where ``download`` is
        ``(file_name, buffer)`` on success or ``None`` on error.
    """
    # Guard against no upload — extract would otherwise fail opaquely.
    if not file:
        return "الرجاء رفع ملف.", None

    try:
        result = extract_titles_and_hashtags(file)

        # Error paths return a message string first (historically as a
        # 3-tuple); inspect before unpacking so the specific message is
        # surfaced instead of an unpack ValueError.
        if isinstance(result[0], str):
            return result[0], None

        df_titles, df_hashtags = result[0], result[1]
        if df_titles is None or df_hashtags is None:
            return "لم يتم استخراج أي بيانات.", None

        titles_html = df_titles.to_html(index=False) if not df_titles.empty else "لا توجد عناوين مستخرجة."
        hashtags_html = df_hashtags.to_html(index=False) if not df_hashtags.empty else "لا توجد هاشتاغات مستخرجة."

        buffer, file_name = create_downloadable_files(df_titles, df_hashtags, format_choice)

        return titles_html + "<br><br>" + hashtags_html, (file_name, buffer)

    except Exception as e:
        # Final safety net so the UI always gets a message, never a traceback.
        return f"خطأ غير متوقع: {str(e)}", None
|
|
|
|
|
# Build the Gradio UI: an HTML file upload plus a download-format selector in;
# rendered result tables plus a downloadable results file out.
interface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.File(label="ارفع ملف HTML"),
        gr.Radio(choices=["TXT", "Excel", "Word"], label="حدد صيغة التنزيل")
    ],
    outputs=[
        gr.HTML(label="العناوين والهاشتاغات المستخرجة"),
        gr.File(label="تنزيل النتائج")
    ],
    title="استخراج العناوين والهاشتاغات",
    description="ارفع ملف HTML لتحليل العناوين والهاشتاغات مع إمكانية تنزيل النتائج كملف TXT أو Excel أو Word.",
    allow_flagging="never"
)


# Launch only when executed as a script, so importing this module
# (e.g. for testing) does not start a web server as a side effect.
if __name__ == "__main__":
    interface.launch()
|
|