# Page-scrape artifact ("Spaces: Sleeping") from the hosting page header —
# kept only as a comment so the file parses; not part of the application.
import asyncio
import json
import os
import re

import pandas as pd
import requests

import elasticsearch_serverless
import flet_fastapi
from flet import *
def remove_arabic_diacritics(text):
    """Return *text* with Arabic diacritical marks (harakat etc.) removed.

    Strips the combining marks in U+064B–U+065F, the superscript alef
    U+0670, and the Quranic annotation range U+06D6–U+06ED.
    """
    marks = re.compile(r'[\u064B-\u065F\u0670\u06D6-\u06ED]')
    return marks.sub('', text)
# Pattern matching the Arabic diacritics stripped by normalize_arabic:
# shadda (tashdid), fatha, tanwin fath, damma, tanwin damm, kasra,
# tanwin kasr, sukun, and the tatwil/kashida elongation mark.
# A character class over explicit escapes replaces the original VERBOSE
# alternation — identical matches, no raw combining characters in source.
diacritics = re.compile(
    '[\u0651\u064E\u064B\u064F\u064C\u0650\u064D\u0652\u0640]'
)
def normalize_arabic(text):
    """Normalize Arabic *text* for matching.

    Drops diacritics/tatwil (module-level ``diacritics`` pattern) and unifies
    letter variants: alef-with-hamza forms -> bare alef, teh marbuta -> heh,
    alef maqsura -> yeh.
    """
    stripped = diacritics.sub('', text)
    return stripped.translate(str.maketrans({
        '\u0623': '\u0627',  # أ -> ا
        '\u0625': '\u0627',  # إ -> ا
        '\u0622': '\u0627',  # آ -> ا
        '\u0629': '\u0647',  # ة -> ه
        '\u0649': '\u064A',  # ى -> ي
    }))
from elasticsearch_serverless import Elasticsearch

# UI state flags shared across handlers.
book_selected = False
first_run = 0
p1_first_run = 0

# Elasticsearch serverless endpoint.
endpoint = "https://503a98874f6241968f251209ab393a45.us-central1.gcp.cloud.es.io:443"

# SECURITY: the API key was hard-coded here. Prefer the ES_API_KEY environment
# variable; the embedded value remains only as a backward-compatible fallback —
# rotate that key and remove the fallback.
client = Elasticsearch(
    endpoint,
    api_key=os.environ.get(
        "ES_API_KEY",
        "SWZGTU5aQUJuNURpVDRSbmtZSGk6cXRSUFZDZ1lRR2k2Y3NvQW9JYjExUQ",
    ),
    request_timeout=60, max_retries=3, retry_on_timeout=True
)
async def main(page: Page): | |
async def e_search(query):
    """Phrase-search every indexed book for *query*.

    The query is diacritic-stripped and normalized first. Returns a tuple
    ``(df, response_search)``: ``df`` has at most 10 rows with columns
    Book / Pages / Scores / Count / Top Two Pages; ``response_search`` is
    the raw Elasticsearch response.
    """
    query = normalize_arabic(remove_arabic_diacritics(query))
    j_query = {
        "size": 250,
        "query": {
            "match_phrase": {
                "Text": query
            }
        }
    }
    # client.search is blocking; run it off the event loop.
    response_search = await asyncio.to_thread(client.search, index="books_idx", body=j_query)
    # Keep only hits whose stored text literally contains the normalized query.
    filtered_hits = [hit for hit in response_search['hits']['hits']
                     if query in hit['_source']['Text']]
    # Aggregate per book: page -> score, plus a per-book distinct-page count.
    unique_books = {}
    for hit in filtered_hits:
        book = hit['_source']['Book']
        # Renamed from `page` — the original shadowed the enclosing flet Page.
        page_no = hit['_source']['Page']
        score = hit['_score']
        if book not in unique_books:
            unique_books[book] = {'Pages': {page_no: score}, 'Count': 1}
        elif page_no not in unique_books[book]['Pages']:
            unique_books[book]['Pages'][page_no] = score
            unique_books[book]['Count'] += 1
    book_data = []
    for book, info in unique_books.items():
        pages = sorted(info['Pages'].items())
        book_data.append({
            'Book': book,
            'Pages': [p for p, _ in pages],
            'Scores': [s for _, s in pages],
            'Count': info['Count'],
        })
    df = pd.DataFrame(book_data).head(10)

    def get_top_two(row):
        # The two page numbers with the highest scores for this book.
        ranked = sorted(zip(row['Pages'], row['Scores']), key=lambda x: x[1], reverse=True)
        return [p for p, _ in ranked[:2]]

    # FIX: the original wrapped this in a bare `except: pass`, silently hiding
    # failures; handle the empty-result case explicitly instead.
    if df.empty:
        df['Top Two Pages'] = []
    else:
        df['Top Two Pages'] = df.apply(get_top_two, axis=1)
    return df, response_search
inquiry_text = "من فضلك اكتب استفسارك."  # hint text for the query field ("Please type your inquiry.")
async def e_search_book(query, phrase_search=0):
    """Phrase-search a single book for *query*.

    If *phrase_search* is 0 (the default) the book name is read from the
    selected book button; otherwise *phrase_search* itself is the book name
    (the parameter doubles as a string for historical reasons).
    Returns ``(df, response_search)`` with df columns Book/Page/Score/Text,
    and the raw Elasticsearch response (including highlights).
    """
    if phrase_search == 0:
        # NOTE(review): `book_btn` is not defined anywhere in this file; this
        # branch would raise NameError if ever taken — confirm with callers
        # (the only visible caller always passes a book name).
        book_name = book_btn.text
    else:
        book_name = phrase_search
    # FIX: removed the unused `url_search` local (stale localhost:9202 URL);
    # the search goes through the serverless `client` instead.
    query = normalize_arabic(remove_arabic_diacritics(query))
    j_query = {
        "size": 50,
        "query": {
            "bool": {
                "must": [
                    {"match_phrase": {"Text": query}}
                ],
                "filter": [
                    {"term": {"Book.keyword": book_name}}
                ]
            }
        },
        "highlight": {
            "fields": {
                "Text": {}
            }
        }
    }
    # Blocking search executed off the event loop.
    response_search = await asyncio.to_thread(client.search, index="books_idx", body=j_query)
    rows = [
        {
            "Book": hit['_source']['Book'],
            "Page": hit['_source']['Page'],
            "Score": hit['_score'],
            "Text": hit['_source']['Text'],
        }
        for hit in response_search['hits']['hits']
    ]
    df = pd.DataFrame(rows)
    return df, response_search
async def navigate_pages(e, page):
    # NOTE(review): debug leftover — `df` is not defined in any enclosing scope
    # visible here, so calling this raises NameError. Confirm whether this
    # handler is still wired to anything before deleting it.
    print(page)
    print(df)
async def p1_page_text_fun(e, response_search, nav="None"):
    """Show the full text of one result page with query matches emphasized.

    *nav* is the string "None" to jump to the page named on the clicked
    button, or +1/-1 to step to the next/previous page in the hit list.
    Matches come from Elasticsearch ``<em>...</em>`` highlight markup.
    """
    p1_datatable_row.visible = False
    p1_page_text.visible = True
    p1_pages_row.visible = True
    if nav == "None":
        p1_pages_row.controls[1].controls[1].value = "رقم الصفحة \n {}".format(e.control.text)
        page_num = e.control.text
    else:
        # Parse the currently displayed page number and step through the hits.
        match = re.search(r'\d+', p1_pages_row.controls[1].controls[1].value)
        if match:
            page_number = match.group()
            page_numbers = [int(item['_source']['Page']) for item in response_search['hits']['hits']]
            page_index = page_numbers.index(int(page_number))
            # NOTE(review): no bounds check — stepping past the ends wraps via
            # negative indexing or raises IndexError; confirm intended.
            page_num = page_numbers[(page_index + nav)]
            p1_pages_row.controls[1].controls[1].value = "رقم الصفحة \n {}".format(page_num)
    # NOTE(review): page_num is a str on the button path and an int on the nav
    # path; the comparison below relies on '_source.Page' matching either.
    filtered_data = [item for item in response_search['hits']['hits'] if item['_source']['Page'] == page_num]
    highlight = filtered_data[0]['highlight']['Text']
    txt = filtered_data[0]['_source']['Text']
    # Collect the distinct phrases Elasticsearch wrapped in <em>...</em>.
    highlight_phrases = []
    for item in highlight:
        highlight_phrases.extend(re.findall(r'<em>(.*?)</em>', item))
    highlight_phrases = list(set(highlight_phrases))
    # FIX: accumulate replacements across phrases. The original restarted from
    # the raw text on every iteration, so only the LAST phrase stayed
    # highlighted — and with zero phrases `highlighted_text` was never bound.
    highlighted_text = txt
    for phrase in highlight_phrases:
        highlighted_text = highlighted_text.replace(phrase, f"<em>{phrase}</em>")
    # Convert the marked-up text into flet TextSpans, bolding the matches.
    spans = []
    for line in highlighted_text.split('\n'):
        for part in re.split(r'(<em>.*?</em>)', line):
            if part.startswith('<em>') and part.endswith('</em>'):
                word = part[4:-5]
                spans.append(TextSpan(word, TextStyle(weight=FontWeight.BOLD, color=colors.YELLOW_600)))
            else:
                spans.append(TextSpan(part + "\n"))
    p1_page_text.content.controls[0].spans = spans
    await page.update_async()
async def p1_bookname(e):
    # Click handler for a book button in the results table: re-run the phrase
    # search restricted to that book and rebuild the table with
    # Text / Score / Page rows plus prev/next page navigation.
    book_name = e.control.text
    e_search_df, response = await e_search_book(p1_query_feild.value, book_name)
    p1_res_dt.columns.clear()
    p1_res_dt.rows.clear()
    e_search_df = e_search_df[['Text', 'Score', 'Page']]
    # Count literal occurrences of the normalized query across all hits.
    occurrences_count = 0
    query = remove_arabic_diacritics(p1_query_feild.value)
    query = normalize_arabic(query)
    for hit in response['hits']['hits']:
        text = hit['_source']['Text']
        occurrences_count += text.count(query)
    # Summary strip: book / page count / occurrences (342 is the cell width).
    # NOTE(review): response['hits']['hits'][0] raises IndexError when the
    # search returns no hits — confirm empty results cannot reach this point.
    p1_info_table.controls = [create_table(response['hits']['hits'][0]['_source']['Book'],
                                           e_search_df.shape[0],
                                           occurrences_count,
                                           342)]
    # Arabic column headers for the per-book view.
    translation = {"Book": "الكتاب", "Page": "الصفحه", "Score": "درجة التطابق", 'Text': "المحتوي"}
    for i in range(len(e_search_df.columns)):
        p1_res_dt.columns.append(DataColumn(Text(translation[e_search_df.columns[i]])))
    pages_btns = []  # NOTE(review): never used — apparent dead local.
    for i in range(e_search_df.shape[0]):
        # Truncated one-line preview of the matching text.
        txt = e_search_df['Text'][i][:80].replace("\n", " ")
        p1_res_dt.rows.append(DataRow(cells=[
            DataCell(Row([Text(f"{txt}...", width=550)])),
            DataCell(Text(e_search_df['Score'][i], width=300)),
            DataCell(ElevatedButton(e_search_df['Page'][i],
                     on_click=lambda e, name=response: asyncio.create_task(p1_page_text_fun(e, name)), width=120))
        ]))
    # Next/previous navigation buttons for the full-page text view; the
    # response is bound as a default arg so each handler keeps its own copy.
    next_button = ElevatedButton(
        content=Row(
            controls=[
                Text(" التالي"),
                Icon(name=icons.NAVIGATE_NEXT, size=25),
            ],
            alignment=MainAxisAlignment.CENTER
        ),
        on_click=lambda e, name=response: asyncio.create_task(p1_page_text_fun(e, name, 1))
    )
    previous_button = ElevatedButton(
        content=Row(
            controls=[
                Icon(name=icons.NAVIGATE_BEFORE, size=25),
                Text("السابق "),
            ],
            alignment=MainAxisAlignment.CENTER
        ),
        on_click=lambda e, name=response: asyncio.create_task(p1_page_text_fun(e, name, -1))
    )
    # Placeholder page-number label; p1_page_text_fun rewrites the number.
    page_num_widget = Row([Text("          "), Text("رقم الصفحة \n 50", weight=FontWeight.BOLD, text_align=TextAlign.CENTER), Text("          ")])
    p1_pages_row.controls = [previous_button, page_num_widget, next_button]
    p1_pages_row.visible = False
    await page.update_async()
def create_table(books, pages, hits, wid):
    """Build a two-row summary table (header row + value row) as a flet Column.

    *books*, *pages* and *hits* fill the value row under the Arabic headers
    "الكتب" / "الصفحات" / "التطابقات"; *wid* is the fixed width of every cell.
    """
    def cell(value, header=False):
        # A single bordered, centered, fixed-width cell.
        return Container(
            content=Text(value, weight="bold" if header else None),
            border=border.all(1, "cyan"),
            padding=padding.all(8),
            border_radius=2,
            alignment=alignment.center,
            width=wid,
        )

    header_row = Row(
        controls=[
            cell("التطابقات", header=True),
            cell("الصفحات", header=True),
            cell("الكتب", header=True),
        ],
        alignment="center",
        spacing=0,
    )
    value_row = Row(
        controls=[cell(hits), cell(pages), cell(books)],
        alignment="center",
        spacing=0,
    )
    return Column(controls=[header_row, value_row], alignment="center", spacing=0)
async def p1_send_button(e):
    """Run the global phrase search and populate the summary strip and the
    per-book results table (one row per book, with a drill-down button)."""
    global p1_first_run
    p1_datatable_row.visible = True
    p1_page_text.visible = False
    p1_pages_row.visible = False
    p1_res_dt.columns.clear()
    # Rows are only cleared once a previous search has populated them.
    if p1_first_run >= 1:
        p1_res_dt.rows.clear()
    p1_first_run = 1
    e_search_df, response_search = await e_search(p1_query_feild.value)
    e_search_df = e_search_df[['Top Two Pages', 'Count', 'Pages', 'Book']]
    # Arabic column headers for the all-books view.
    translation = {"Book": "الكتاب", "Pages": "الصفحات", "Count": "التطابقات", 'Top Two Pages': "أعلى صفحتين متطابقتين"}
    query = normalize_arabic(remove_arabic_diacritics(p1_query_feild.value))
    # FIX: count query occurrences per book in ONE pass over the hits instead
    # of re-scanning every hit for every table row (was O(rows x hits)).
    occurrences_by_book = {}
    total_occurrences = 0
    for hit in response_search['hits']['hits']:
        src = hit['_source']
        n = src['Text'].count(query)
        total_occurrences += n
        occurrences_by_book[src['Book']] = occurrences_by_book.get(src['Book'], 0) + n
    # Summary strip: books / pages / occurrences (342 is the cell width).
    p1_info_table.controls = [create_table(e_search_df.shape[0], e_search_df['Count'].sum(), total_occurrences, 342)]
    for col in e_search_df.columns:
        p1_res_dt.columns.append(DataColumn(Text(translation[col])))
    for i in range(e_search_df.shape[0]):
        p1_res_dt.rows.append(DataRow(cells=[
            DataCell(Text(e_search_df['Top Two Pages'][i], width=200)),
            DataCell(Text(occurrences_by_book.get(e_search_df['Book'][i], 0), width=120)),
            DataCell(Text(e_search_df['Count'][i], width=180)),
            DataCell(ElevatedButton(e_search_df['Book'][i], width=450, on_click=p1_bookname)),
        ]))
    await page.update_async()
p1_res_dt = DataTable( | |
columns=[DataColumn(Text())], | |
border=border.all(2, "blue"), | |
border_radius=10, | |
column_spacing=10, | |
) | |
p1_info_table = Row([Text("")], alignment=MainAxisAlignment.CENTER) | |
p1_datatable_row = Column([Row([p1_res_dt], alignment=MainAxisAlignment.CENTER)], alignment=MainAxisAlignment.CENTER, scroll=ScrollMode.ALWAYS, height=398) | |
p1_query_feild = TextField(label="Inquiry", hint_text=inquiry_text, expand=True, rtl=True) | |
p1_query_send = FloatingActionButton(icon=icons.SEND, on_click=p1_send_button) | |
p1_Query_row = Row(controls=[p1_query_feild, p1_query_send]) | |
p1_page_text = Container( | |
content=Column([Text("", rtl=True)], scroll=ScrollMode.ALWAYS), | |
margin=10, | |
padding=10, | |
alignment=alignment.center, | |
width=1050, | |
height=400, | |
border_radius=10, | |
border=border.all(1, colors.CYAN), | |
) | |
page_1 = Column([p1_Query_row, p1_info_table, p1_datatable_row, Row([Text(), p1_page_text, Text()], alignment=MainAxisAlignment.CENTER), | |
Row([Text(), p1_pages_row, Text()], alignment=MainAxisAlignment.CENTER)]) | |
p1_datatable_row.visible = False | |
p1_page_text.visible = False | |
p1_pages_row.visible = False | |
await page.add_async(page_1) | |
app = flet_fastapi.app(main)  # ASGI entry point: serve the flet app through FastAPI