File size: 11,965 Bytes
bf18e69
bee2b96
631b794
0c4a24a
bee2b96
 
 
 
7b54e65
bee2b96
 
 
 
 
bf18e69
4132a28
 
3759483
c2777d8
6ebe94a
df2c275
 
bf18e69
 
bee2b96
 
2cc1efc
3759483
2cc1efc
e70a2d0
345a26b
7b54e65
d187736
7b54e65
bf18e69
bee2b96
bf18e69
bee2b96
 
 
 
 
bf18e69
cba3641
d800d23
 
 
 
 
 
bee2b96
bf18e69
d800d23
 
cba3641
d800d23
 
 
df2c275
c2777d8
d800d23
c2777d8
bee2b96
 
6ebe94a
 
1511464
6ebe94a
 
 
 
 
bee2b96
 
 
 
 
 
 
 
 
 
 
 
 
 
ddcb279
 
d800d23
bf18e69
c538ff4
e24a24d
 
bf18e69
 
 
 
 
 
 
 
 
c538ff4
 
 
bf18e69
 
 
 
 
 
 
 
 
 
 
27c30e6
e24a24d
27c30e6
e24a24d
 
 
27c30e6
 
 
e24a24d
27c30e6
ddcb279
c538ff4
 
45672e3
2c2789c
 
4fe5255
6db4a5e
 
bfd30d8
9b4e2a4
a57ca47
4fe5255
 
bfd30d8
9b4e2a4
a57ca47
4fe5255
 
d8a4052
 
 
 
df7a5f5
 
d8a4052
 
323e9c2
5251fcd
dfc6fbf
 
 
 
 
9b4e2a4
 
323e9c2
dfc6fbf
 
9b4e2a4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ef63d06
f1748f6
b958c67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d800d23
4fa6022
4851309
32dbb49
f348de6
 
4851309
cba3641
f348de6
6db4a5e
4fa6022
4851309
4fa6022
bfd30d8
 
 
 
cba3641
4fa6022
 
f348de6
 
71e197c
f348de6
4851309
6c6890f
71e197c
4851309
4fa6022
 
 
6db4a5e
 
6c6890f
4fa6022
 
f348de6
4fa6022
 
 
 
f348de6
6c6890f
bfd30d8
4fa6022
a4ef630
4fa6022
6c88924
b90eaf1
 
6d432a5
d221ef9
6d432a5
 
0ca0248
df2c275
 
1f40368
 
 
faa4c1c
77e423c
03cbbaf
0ca0248
6d432a5
 
a13e359
f516913
1a197f9
 
6d432a5
7be32ff
6d432a5
 
4fe5255
 
dfc6fbf
4fe5255
dfc6fbf
cba3641
 
280d715
cba3641
 
 
 
 
 
b958c67
e59070e
 
cba3641
b958c67
e59070e
cba3641
 
 
a0b5949
cba3641
b958c67
e59070e
596b0f7
 
6d432a5
a0b5949
cba3641
6d432a5
cba3641
 
6d432a5
 
cba3641
1a197f9
4fa6022
f348de6
 
c1f9d40
f348de6
39c89fc
55a1f5b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
import gradio as gr
import os
import subprocess
import uuid
import fitz
from dotenv import load_dotenv
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from langchain_groq import ChatGroq
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from gtts import gTTS
import sys
import pytesseract
from pdf2image import convert_from_path
from huggingface_hub import Repository, login
from huggingface_hub import hf_hub_download
from langchain.schema import Document
from PyPDF2 import PdfReader  
from langdetect import detect  


# Load environment variables (python-dotenv also reads a local .env file if present).
load_dotenv()
secret_key = os.getenv("GROQ_API_KEY")
hf_key = os.getenv("HF_TOKEN")

# Fail fast with an actionable message: writing None into os.environ would
# otherwise abort start-up with an opaque "str expected, not NoneType" TypeError.
if secret_key is None:
    raise RuntimeError(
        "GROQ_API_KEY is not set; define it in the environment or in a .env file."
    )
os.environ["GROQ_API_KEY"] = secret_key
login(token=hf_key, add_to_git_credential=True)

# Embedding model shared by every FAISS index built in this module.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2")

# Ensure the necessary folders exist
UPLOAD_FOLDER = 'uploads/'
AUDIO_FOLDER = 'audio/'
for folder in [UPLOAD_FOLDER, AUDIO_FOLDER]:
    # exist_ok=True replaces the exists()/makedirs() pair, which could race.
    os.makedirs(folder, exist_ok=True)

def load_pdf(file_path):
    """OCR every page of a PDF as Arabic text.

    The PDF is first rendered to page images (500 is passed as the second
    positional argument of ``convert_from_path``), then each image is run
    through Tesseract with the ``ara`` language pack.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        A list with one string per page, in page order. A page whose OCR
        fails contributes an empty string; if the PDF cannot be rendered at
        all, an empty list is returned. Errors are printed, never raised.
    """
    try:
        page_images = convert_from_path(file_path, 500)
    except Exception as render_error:
        print(f"Error loading PDF: {render_error}")
        return []

    page_texts = []
    for page_index, page_image in enumerate(page_images):
        try:
            page_texts.append(pytesseract.image_to_string(page_image, lang="ara"))
        except Exception as ocr_error:
            print(f"Error processing page {page_index}: {ocr_error}")
            # Keep a placeholder so list positions still line up with page numbers.
            page_texts.append("")
    return page_texts

def prepare_vectorstore(data):
    """Index a list of page texts in a FAISS vector store.

    Args:
        data: Iterable of strings; each one becomes a ``Document``.

    Returns:
        The FAISS store built from the newline-split chunks (chunk size 1000,
        overlap 20), embedded with the module-level ``embeddings`` model.
    """
    splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=20)

    # The splitter works on Document objects, so wrap each raw string first.
    page_docs = [Document(page_content=page_text) for page_text in data]
    pieces = splitter.split_documents(page_docs)

    return FAISS.from_documents(pieces, embeddings)

def create_chain(vectorstore):
    """Assemble a conversational retrieval chain over ``vectorstore``.

    Args:
        vectorstore: Store whose ``as_retriever()`` supplies the context documents.

    Returns:
        A ``ConversationalRetrievalChain`` using the Groq ``gemma2-9b-it`` model
        at temperature 0, a fresh ``ConversationBufferMemory`` and the
        ``map_reduce`` chain type.
    """
    groq_llm = ChatGroq(model="gemma2-9b-it", temperature=0)
    doc_retriever = vectorstore.as_retriever()
    # A new, empty memory is created on every call of this function.
    history = ConversationBufferMemory(
        llm=groq_llm,
        output_key="answer",
        memory_key="chat_history",
        return_messages=True,
    )
    return ConversationalRetrievalChain.from_llm(
        llm=groq_llm,
        retriever=doc_retriever,
        memory=history,
        verbose=False,
        chain_type="map_reduce",
    )


    
# Stylesheet passed to gr.Blocks(css=...). It imports the Noto Kufi Arabic and
# Cairo web fonts, sets right-to-left direction on the container, text boxes and
# file/audio widgets, and styles the chat bubbles and buttons.
# The rules for `.arabic-chatbox`, `#custom-logo`, `.custom-submit-button` and
# `#clear_btn` only take effect on components created with the matching
# elem_classes / elem_id.
custom_css = """
@import url('https://fonts.googleapis.com/css2?family=Noto+Kufi+Arabic:wght@400;700&display=swap');
@import url('https://fonts.googleapis.com/css2?family=Cairo:wght@400;700&display=swap');

body {
    font-family: 'Noto Kufi Arabic', sans-serif;
    background: linear-gradient(135deg, #799351 0%, #A67B5B 100%);
    background-size: cover;
    background-position: center;
    background-attachment: fixed;
}

.gradio-container {
    direction: rtl;
    font-family: 'Noto Kufi Arabic', sans-serif;
    font-size: 16px;
    max-width: 800px !important;
    margin: auto !important;
    background: rgba(255, 255, 255, 0.9);
    border-radius: 20px;
    box-shadow: 0 8px 32px 0 rgba(31, 38, 135, 0.37);
    backdrop-filter: blur(4px);
    border: 1px solid rgba(255, 255, 255, 0.18);
    padding: 20px;
}


.gr-textbox input, .gr-textbox textarea {
    text-align: right !important;  /* Align text to the right */
    direction: rtl !important;     /* Set RTL text direction */
    font-family: 'Cairo', sans-serif !important;
}



.gr-file, .gr-audio {
    text-align: right !important;  /* Align text to the right */
    direction: rtl !important;     /* Set RTL text direction */
}

label {
    font-size: 14px !important;
    color: #000000 !important;
    background-color: #EEEEEE;
}


.arabic-chatbox .message.user {
    font-family: 'Cairo', sans-serif !important;
    background-color: #FFFBE6; 
}

.arabic-chatbox .message.bot {
    font-family: 'Cairo', sans-serif !important;
    background-color: #E7FBE6; 
}

#custom-logo {
    display: block;
    margin-left: auto;
    margin-right: auto;
    width: 30px;  /* Set custom width */
    height: 20px; /* Set custom height */
}

.custom-submit-button {
    background-color: #E68369 !important;
    border: none !important;
    border-radius: 5px !important;
    padding: 10px 20px !important;
    font-size: 16px !important;
    cursor: pointer !important;
}

.custom-submit-button:hover {
    background-color: white !important;
    color: #E6B9A6 !important;
}

#clear_btn {
    background-color: #698474;
    color: white;
    border: none;
    border-radius: 5px;
    padding: 10px 20px;
    font-size: 16px;
    cursor: pointer;
}

#clear_btn:hover {
    background-color: white;
    color: #698474;
}

"""

# Function to check if the file is a valid PDF in Arabic and less than 10MB
def validate_pdf(pdf):
    """Check that an upload is an Arabic PDF no larger than 10 MB.

    Args:
        pdf: Upload handle whose ``name`` attribute is the file path, or
            ``None`` when no file is selected.

    Returns:
        A ``(message, is_valid)`` tuple. ``message`` is the Arabic status text
        shown to the user; ``is_valid`` is ``True`` only when every check passes.
    """
    if pdf is None:
        return "لم يتم اختيار أي ملف", False
    # Compare the extension case-insensitively so "REPORT.PDF" is accepted too.
    if not pdf.name.lower().endswith(".pdf"):
        return "الملف الذي اخترته ليس PDF", False
    if os.path.getsize(pdf.name) > 10 * 1024 * 1024:
        return "حجم الملف أكبر من 10 ميجا بايت", False

    # Check if PDF content is Arabic. Parsing sits inside the try as well, so a
    # corrupt or unreadable PDF is reported to the user instead of raising.
    try:
        reader = PdfReader(pdf.name)
        # "or ''" guards against a page yielding no text object at all.
        text = "".join(page.extract_text() or "" for page in reader.pages)
        if detect(text) != "ar":
            return "الملف ليس باللغة العربية", False
    except Exception as e:
        # Includes langdetect failing on text with no detectable features.
        print(f"Error validating PDF: {e}")
        return "فشل في تحليل اللغة", False

    return "الملف صالح للدردشة", True

def upload_pdf(pdf_file):
    """OCR ``pdf_file`` and make it the active document.

    Resets the global ``chathistory`` to an empty list, then rebuilds the
    global ``vectorstore`` from the text returned by ``load_pdf``.

    Args:
        pdf_file: Passed straight to ``load_pdf``.

    Returns:
        A ``(message, True)`` tuple carrying the Arabic success message.
    """
    global chathistory, vectorstore
    # The history is reset before the (possibly failing) load, as a new upload starts over.
    chathistory = list()
    page_texts = load_pdf(pdf_file)
    vectorstore = prepare_vectorstore(page_texts)
    success = ("تم تحميل الملف بنجاح !", True)
    return success

        
def _synthesize_speech(text):
    """Render ``text`` as Arabic speech; return the MP3 path, or None if unavailable."""
    if not text or not text.strip():
        # gTTS refuses empty text, so there is nothing to synthesize.
        return None
    # A random UUID keeps answers from overwriting each other's audio file.
    audio_file = os.path.join(AUDIO_FOLDER, f"{uuid.uuid4()}.mp3")
    try:
        gTTS(text=text, lang='ar').save(audio_file)
    except Exception as e:
        # Audio is optional: keep the written answer even when synthesis fails.
        print(f"Error generating audio: {e}")
        return None
    return audio_file


def chat(user_input):
    """Answer a question about the active PDF and read the answer aloud.

    Args:
        user_input: The question typed by the user.

    Returns:
        A 3-tuple of updates for the status label, the chatbot and the audio
        player: ``(label_update, history_pairs, audio_path_or_None)``. For an
        empty question only the label changes (it shows a red prompt) and the
        other two components are left as they are.
    """
    global chathistory, vectorstore

    if not user_input or not user_input.strip():  # Empty or whitespace-only input
        # gr.update() with no arguments leaves a component untouched, so the
        # conversation and the last audio stay on screen.
        return gr.update(value='<span style="color:red;">الرجاء إدخال سؤال.</span>'), gr.update(), gr.update()

    prompt = f"""
        You are an expert Arabic-language assistant specialized in analyzing and responding to queries about Arabic PDF documents. Your responses should be precise, informative, and reflect the professional tone and structure expected in formal Arabic communication. Focus on extracting and presenting relevant information from the document clearly and systematically, while avoiding colloquial or informal language.
        When responding, ensure the following:
           - Your answer directly reflects the content of the document.
           - If the requested information is not available in the document, clearly state that in Arabic.
           - Keep your response concise yet comprehensive, addressing the question fully.
           - Respond only in a professional and well-versed Arabic Language.
        Question: {user_input}.
        """
    # NOTE(review): the chain, and with it the ConversationBufferMemory, is
    # rebuilt on every call, so earlier turns are not fed into later questions.
    # Confirm whether that is intended.
    chain = create_chain(vectorstore)
    response = chain.invoke({"question": prompt})
    assistant_response = response["answer"]

    chathistory.append({"user_content":  f"👤 {user_input}", "bot_content": f"🤖 {assistant_response}"})

    # Stored under AUDIO_FOLDER rather than the working directory.
    audio_file = _synthesize_speech(assistant_response)

    history_display = [(msg["user_content"], msg["bot_content"]) for msg in chathistory]
    return gr.update(value=''), history_display, audio_file

image_path = "logo.png"

# Page layout, top to bottom: logo, title, upload hints + file picker, status line,
# upload button, chat log, chat status line, question box, audio player, send and
# clear buttons. The chat widgets stay hidden until a PDF has been processed.
with gr.Blocks(css=custom_css) as demo:
    with gr.Row():
        gr.Image(image_path, show_fullscreen_button=False, show_download_button=False,
                 show_share_button=False, show_label=False, label='', container=True, height=50, width=50)
    with gr.Row():
        # Six-digit hex: the previous five-digit value was not a valid CSS colour.
        gr.Markdown("<h1 style='text-align: center; color: #000000;'>المساعد العربي ar-pdf-chat للدردشة 💬</h1>", rtl=True)

    with gr.Row():
        gr.Markdown("""
                    
                    <ul style="list-style-type: disc;"> 
                        <li style="color: #6C946F; font-size: 12px;">تأكد من اختيار ملف PDF.</li>
                        <li style="color: #6C946F; font-size: 12px;">حجم الملف يجب أن يكون أقل من 10 ميجابايت.</li>
                        <li style="color: #6C946F; font-size: 12px;">يجب أن يكون المحتوى باللغة العربية.</li>
                    </ul>""", rtl=True)
        pdf_input = gr.File(label="اختر ملف PDF")
    with gr.Row():
        output_label = gr.HTML(value='')
    with gr.Row():
        submit_button_pdf = gr.Button("ارفع الملف", interactive=False)
    with gr.Row():
        # elem_classes hooks this component up to the `.arabic-chatbox` CSS rules.
        chatbot = gr.Chatbot(label="الشات", height=400, rtl=True, show_copy_all_button=True, layout='bubble',
                             scale=1, bubble_full_width=False, elem_classes="arabic-chatbox")
    with gr.Row():
        chat_label = gr.HTML(value='')
    with gr.Row():
        chat_input = gr.Textbox(label="💬", rtl=True, visible=False,  placeholder="أدخل سؤالك هنا ..", lines=2)
    with gr.Row():
        audio_output = gr.Audio(label="🔊", visible=False)

    with gr.Row():
        submit_button_chat = gr.Button("إرسال", interactive=True, visible=False, elem_classes="custom-submit-button", variant='primary')
    with gr.Row():
        # elem_id hooks this button up to the `#clear_btn` CSS rules.
        clear_btn = gr.Button("مسح", interactive=True, visible=False, variant='secondary', elem_id="clear_btn")

    def handle_file_upload(pdf):
        """Validate the selected file; enable the upload button only if it passes.

        Returns updates for: status label, upload button, send button, chatbot,
        question box, audio player, clear button. The last five are always
        hidden, because a newly selected file has not been processed yet.
        """
        message, is_valid = validate_pdf(pdf)
        # Only failures are reported; a valid file just clears the status line.
        status = '' if is_valid else f'<span style="color:red">{message}</span>'
        # Each output gets its own update object rather than a shared one.
        hidden = tuple(gr.update(visible=False) for _ in range(5))
        return (gr.update(value=status), gr.update(interactive=is_valid)) + hidden

    def process_pdf_and_enable_components(pdf):
        """Process the PDF, then reveal the chat widgets.

        Written as a generator so the "processing" note reaches the browser
        before the slow OCR/indexing step; assigning to ``output_label.value``
        inside a handler does not update the page.

        Yields updates for: status label, send button, upload button, chatbot,
        question box, audio player, clear button.
        """
        yield (gr.update(value='<span style="color:blue">جاري معالجة الملف...</span>'),) + tuple(
            gr.update() for _ in range(6)
        )
        try:
            message, _ = upload_pdf(pdf)
        except Exception:
            # Drop the progress note, then let Gradio report the failure.
            yield (gr.update(value=''),) + tuple(gr.update() for _ in range(6))
            raise
        yield (
            gr.update(value=f'<span style="color:green">{message}</span>'),
            gr.update(visible=True),
            # The same file must not be submitted twice.
            gr.update(interactive=False),
            gr.update(visible=True),
            gr.update(visible=True),
            gr.update(visible=True),
            gr.update(visible=True),
        )

    def clear_chat():
        """Empty the question box and reset the audio player."""
        return "", None

    # When the user uploads a file, validate it and then allow PDF upload
    pdf_input.change(handle_file_upload, inputs=pdf_input, outputs=[output_label, submit_button_pdf, submit_button_chat, chatbot, chat_input, audio_output, clear_btn])

    # When the user presses the upload button, process the PDF and enable other components
    submit_button_pdf.click(process_pdf_and_enable_components, inputs=pdf_input, outputs=[output_label, submit_button_chat, submit_button_pdf, chatbot, chat_input, audio_output, clear_btn])
    clear_btn.click(clear_chat, outputs=[chat_input, audio_output])
    # Chat button connection
    submit_button_chat.click(chat, inputs=chat_input, outputs=[chat_label, chatbot, audio_output])


# Launch the Gradio app
demo.launch(inbrowser=True)