Spaces:
Sleeping
Sleeping
MohammedNasser
committed on
Commit
•
cba3641
1
Parent(s):
c1f9d40
Update app.py
Browse files
app.py
CHANGED
@@ -18,7 +18,8 @@ from pdf2image import convert_from_path
|
|
18 |
from huggingface_hub import Repository, login
|
19 |
from huggingface_hub import hf_hub_download
|
20 |
from langchain.schema import Document
|
21 |
-
|
|
|
22 |
|
23 |
|
24 |
|
@@ -39,8 +40,35 @@ for folder in [UPLOAD_FOLDER, AUDIO_FOLDER]:
|
|
39 |
if not os.path.exists(folder):
|
40 |
os.makedirs(folder)
|
41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
def load_pdf(file_path):
|
43 |
"""Load and preprocess Arabic text from a PDF file."""
|
|
|
44 |
try:
|
45 |
pages = convert_from_path(file_path, 500)
|
46 |
except Exception as e:
|
@@ -51,6 +79,7 @@ def load_pdf(file_path):
|
|
51 |
for pageNum, imgBlob in enumerate(pages):
|
52 |
try:
|
53 |
text = pytesseract.image_to_string(imgBlob, lang="ara")
|
|
|
54 |
documents.append(text)
|
55 |
except Exception as e:
|
56 |
print(f"Error processing page {pageNum}: {e}")
|
@@ -159,19 +188,17 @@ p {
|
|
159 |
|
160 |
def upload_pdf(pdf_file):
    """Load an uploaded PDF and rebuild the module-level vector store and chain.

    The text returned by ``load_pdf`` is passed to ``prepare_vectorstore``;
    the resulting store is then passed to ``create_chain``. Both results are
    kept in module globals for later use.

    Args:
        pdf_file: The uploaded PDF, forwarded unchanged to ``load_pdf``.

    Returns:
        An Arabic confirmation message.
    """
    global vectorstore, chain  # State shared with the rest of the module
    vectorstore = prepare_vectorstore(load_pdf(pdf_file))
    chain = create_chain(vectorstore)
    return "تم تحميل الملف بنجاح !"
|
167 |
|
168 |
|
169 |
def chat(user_input):
|
170 |
global chain # Access the global chain variable
|
171 |
|
172 |
-
|
173 |
-
return "Please upload a PDF file first.", None # Prompt user to upload a PDF
|
174 |
-
|
175 |
prompt = f"""
|
176 |
You are an expert Arabic-language assistant specialized in analyzing and responding to queries about Arabic PDF documents. Your responses should be precise, informative, and reflect the professional tone and structure expected in formal Arabic communication. Focus on extracting and presenting relevant information from the document clearly and systematically, while avoiding colloquial or informal language.
|
177 |
When responding, ensure the following:
|
@@ -193,31 +220,51 @@ def chat(user_input):
|
|
193 |
audio_file = f"{audio_id}.mp3"
|
194 |
tts.save(audio_file)
|
195 |
|
|
|
196 |
return assistant_response, audio_file
|
197 |
|
198 |
with gr.Blocks(css=custom_css) as demo:
|
199 |
|
200 |
pdf_input = gr.File(label="اختر ملف PDF")
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
218 |
|
219 |
-
# Connect chat button to chat function
|
220 |
-
submit_button_chat.click(chat, inputs=[chat_input], outputs=[chat_output, audio_output])
|
221 |
|
222 |
# Launch the Gradio app
|
223 |
demo.launch(inbrowser=True)
|
|
|
18 |
from huggingface_hub import Repository, login
|
19 |
from huggingface_hub import hf_hub_download
|
20 |
from langchain.schema import Document
|
21 |
+
from PyPDF2 import PdfReader # Make sure to install PyPDF2 for PDF handling
|
22 |
+
from langdetect import detect # Install langdetect to detect language
|
23 |
|
24 |
|
25 |
|
|
|
40 |
if not os.path.exists(folder):
|
41 |
os.makedirs(folder)
|
42 |
|
43 |
+
|
44 |
+
|
45 |
+
# Function to check if the file is a valid PDF in Arabic and less than 10MB
|
46 |
+
def validate_pdf(pdf):
    """Check that an uploaded file is an Arabic PDF no larger than 10 MB.

    Args:
        pdf: Uploaded file object whose ``name`` attribute is its path on
            disk, or ``None`` when no file was selected.

    Returns:
        A ``(message, is_valid)`` tuple: an Arabic status message and a flag
        that is ``True`` only when every check passed.
    """
    if pdf is None:
        return "لم يتم اختيار أي ملف", False
    # Compare case-insensitively so names such as "REPORT.PDF" are accepted.
    if not pdf.name.lower().endswith(".pdf"):
        return "الملف الذي اخترته ليس PDF", False
    if os.path.getsize(pdf.name) > 10 * 1024 * 1024:
        return "حجم الملف أكبر من 10 ميجا بايت", False

    # Check if PDF content is Arabic. Parsing and detection are both inside
    # the try so that an unreadable PDF, or text with no detectable language,
    # produces a failure message instead of an unhandled exception.
    try:
        reader = PdfReader(pdf.name)
        # Guard against pages that yield no text (None or empty string).
        text = "".join(page.extract_text() or "" for page in reader.pages)
        language = detect(text)
    except Exception:
        return "فشل في تحليل اللغة", False

    if language != "ar":
        return "الملف ليس باللغة العربية", False

    return "الملف صالح للدردشة", True
|
67 |
+
|
68 |
+
|
69 |
def load_pdf(file_path):
|
70 |
"""Load and preprocess Arabic text from a PDF file."""
|
71 |
+
|
72 |
try:
|
73 |
pages = convert_from_path(file_path, 500)
|
74 |
except Exception as e:
|
|
|
79 |
for pageNum, imgBlob in enumerate(pages):
|
80 |
try:
|
81 |
text = pytesseract.image_to_string(imgBlob, lang="ara")
|
82 |
+
|
83 |
documents.append(text)
|
84 |
except Exception as e:
|
85 |
print(f"Error processing page {pageNum}: {e}")
|
|
|
188 |
|
189 |
def upload_pdf(pdf_file):
    """Load an uploaded PDF and rebuild the module-level vector store and chain.

    The text returned by ``load_pdf`` is passed to ``prepare_vectorstore``;
    the resulting store is then passed to ``create_chain``. Both results are
    kept in module globals for later use.

    Args:
        pdf_file: The uploaded PDF, forwarded unchanged to ``load_pdf``.

    Returns:
        A ``(message, success)`` tuple; ``success`` is always ``True`` when
        the function returns normally.
    """
    global vectorstore, chain  # State shared with the rest of the module
    vectorstore = prepare_vectorstore(load_pdf(pdf_file))
    chain = create_chain(vectorstore)
    return "تم تحميل الملف بنجاح !", True
|
196 |
|
197 |
|
198 |
def chat(user_input):
|
199 |
global chain # Access the global chain variable
|
200 |
|
201 |
+
|
|
|
|
|
202 |
prompt = f"""
|
203 |
You are an expert Arabic-language assistant specialized in analyzing and responding to queries about Arabic PDF documents. Your responses should be precise, informative, and reflect the professional tone and structure expected in formal Arabic communication. Focus on extracting and presenting relevant information from the document clearly and systematically, while avoiding colloquial or informal language.
|
204 |
When responding, ensure the following:
|
|
|
220 |
audio_file = f"{audio_id}.mp3"
|
221 |
tts.save(audio_file)
|
222 |
|
223 |
+
|
224 |
return assistant_response, audio_file
|
225 |
|
226 |
with gr.Blocks(css=custom_css) as demo:

    pdf_input = gr.File(label="اختر ملف PDF")
    output_label = gr.HTML()  # Status line rendered as coloured HTML
    # Everything below starts disabled and is unlocked step by step:
    # a valid file enables the upload button, a processed file enables chat.
    submit_button_pdf = gr.Button("ارفع الملف", interactive=False)
    chat_input = gr.Textbox(label="أدخل سؤالك هنا", interactive=False)
    chat_output = gr.Textbox(label="الرد الآلي", interactive=False)
    audio_output = gr.Audio(label="استمع إلى الرد", interactive=False)
    submit_button_chat = gr.Button("إرسال", interactive=False)

    def handle_file_upload(pdf):
        """Validate the selected file and toggle the upload button.

        Gradio only applies updates that a handler *returns* for the
        components listed in ``outputs``; calling ``component.update(...)``
        on the instance has no effect on the page.

        Returns:
            ``(status_html, upload_button_update)``. The status is empty for
            a valid file and a red error message otherwise.
        """
        message, is_valid = validate_pdf(pdf)
        if is_valid:
            # Enable the upload button if the file is valid
            return "", gr.update(interactive=True)
        # Disable the button again so a previously accepted file cannot be
        # replaced by an invalid one and still be submitted.
        return (
            f'<span style="color:red">{message}</span>',
            gr.update(interactive=False),
        )

    def process_pdf_and_enable_components(pdf):
        """Process the PDF and enable the chat widgets on success.

        No interim "processing" text is emitted here: a plain handler can
        only return one final set of values.

        Returns:
            ``(status_html, chat_input, chat_output, audio_output,
            submit_button_chat)`` updates, in the order of ``outputs``.
        """
        message, is_valid = upload_pdf(pdf)
        status = f'<span style="color:green">{message}</span>'
        # gr.update() with no arguments leaves the component untouched.
        unlock = gr.update(interactive=True) if is_valid else gr.update()
        return status, unlock, unlock, unlock, unlock

    # When the user selects a file, validate it and then allow PDF upload
    pdf_input.change(
        handle_file_upload,
        inputs=pdf_input,
        outputs=[output_label, submit_button_pdf],
    )

    # When the user presses the upload button, process the PDF and enable other components
    submit_button_pdf.click(
        process_pdf_and_enable_components,
        inputs=pdf_input,
        outputs=[
            output_label,
            chat_input,
            chat_output,
            audio_output,
            submit_button_chat,
        ],
    )

    # Chat button connection
    submit_button_chat.click(chat, inputs=chat_input, outputs=[chat_output, audio_output])
|
267 |
|
|
|
|
|
268 |
|
269 |
# Launch the Gradio app
|
270 |
demo.launch(inbrowser=True)
|