Spaces:
Sleeping
Sleeping
File size: 11,965 Bytes
bf18e69 bee2b96 631b794 0c4a24a bee2b96 7b54e65 bee2b96 bf18e69 4132a28 3759483 c2777d8 6ebe94a df2c275 bf18e69 bee2b96 2cc1efc 3759483 2cc1efc e70a2d0 345a26b 7b54e65 d187736 7b54e65 bf18e69 bee2b96 bf18e69 bee2b96 bf18e69 cba3641 d800d23 bee2b96 bf18e69 d800d23 cba3641 d800d23 df2c275 c2777d8 d800d23 c2777d8 bee2b96 6ebe94a 1511464 6ebe94a bee2b96 ddcb279 d800d23 bf18e69 c538ff4 e24a24d bf18e69 c538ff4 bf18e69 27c30e6 e24a24d 27c30e6 e24a24d 27c30e6 e24a24d 27c30e6 ddcb279 c538ff4 45672e3 2c2789c 4fe5255 6db4a5e bfd30d8 9b4e2a4 a57ca47 4fe5255 bfd30d8 9b4e2a4 a57ca47 4fe5255 d8a4052 df7a5f5 d8a4052 323e9c2 5251fcd dfc6fbf 9b4e2a4 323e9c2 dfc6fbf 9b4e2a4 ef63d06 f1748f6 b958c67 d800d23 4fa6022 4851309 32dbb49 f348de6 4851309 cba3641 f348de6 6db4a5e 4fa6022 4851309 4fa6022 bfd30d8 cba3641 4fa6022 f348de6 71e197c f348de6 4851309 6c6890f 71e197c 4851309 4fa6022 6db4a5e 6c6890f 4fa6022 f348de6 4fa6022 f348de6 6c6890f bfd30d8 4fa6022 a4ef630 4fa6022 6c88924 b90eaf1 6d432a5 d221ef9 6d432a5 0ca0248 df2c275 1f40368 faa4c1c 77e423c 03cbbaf 0ca0248 6d432a5 a13e359 f516913 1a197f9 6d432a5 7be32ff 6d432a5 4fe5255 dfc6fbf 4fe5255 dfc6fbf cba3641 280d715 cba3641 b958c67 e59070e cba3641 b958c67 e59070e cba3641 a0b5949 cba3641 b958c67 e59070e 596b0f7 6d432a5 a0b5949 cba3641 6d432a5 cba3641 6d432a5 cba3641 1a197f9 4fa6022 f348de6 c1f9d40 f348de6 39c89fc 55a1f5b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 |
import gradio as gr
import os
import subprocess
import uuid
import fitz
from dotenv import load_dotenv
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from langchain_groq import ChatGroq
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from gtts import gTTS
import sys
import pytesseract
from pdf2image import convert_from_path
from huggingface_hub import Repository, login
from huggingface_hub import hf_hub_download
from langchain.schema import Document
from PyPDF2 import PdfReader
from langdetect import detect
# Load environment variables (python-dotenv reads a local .env file when present)
load_dotenv()
secret_key = os.getenv("GROQ_API_KEY")
hf_key = os.getenv("HF_TOKEN")
# os.environ only accepts strings: assigning None fails with an opaque
# "str expected, not NoneType" TypeError, so fail with a message that names
# the missing variable instead.
if secret_key is None:
    raise RuntimeError(
        "GROQ_API_KEY is not set; define it in the environment or in the .env file."
    )
os.environ["GROQ_API_KEY"] = secret_key
login(token=hf_key, add_to_git_credential=True)
# Shared embedding model, used by prepare_vectorstore() to index document chunks
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
# Ensure the necessary folders exist
UPLOAD_FOLDER = 'uploads/'
AUDIO_FOLDER = 'audio/'
for folder in (UPLOAD_FOLDER, AUDIO_FOLDER):
    # exist_ok avoids the check-then-create race of testing os.path.exists first
    os.makedirs(folder, exist_ok=True)
def load_pdf(file_path):
    """Rasterise a PDF and OCR every page with Tesseract's Arabic model.

    Returns a list with one string per page, in page order. A page whose OCR
    fails contributes an empty string; if the PDF cannot be converted to
    images at all, an empty list is returned. Errors are printed, not raised.
    """
    try:
        page_images = convert_from_path(file_path, 500)
    except Exception as e:
        print(f"Error loading PDF: {e}")
        return []

    def _ocr_page(index, image):
        # Best-effort per page: a failing page must not abort the whole document.
        try:
            return pytesseract.image_to_string(image, lang="ara")
        except Exception as e:
            print(f"Error processing page {index}: {e}")
            return ""

    return [_ocr_page(index, image) for index, image in enumerate(page_images)]
def prepare_vectorstore(data):
    """Build a FAISS vector store from an iterable of text strings.

    Each string is wrapped in a ``Document``, the documents are split by a
    newline-separated ``CharacterTextSplitter`` (chunk_size=1000,
    chunk_overlap=20), and the resulting chunks are indexed with the
    module-level ``embeddings`` model.
    """
    splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=20)
    page_docs = []
    for text in data:
        page_docs.append(Document(page_content=text))
    pieces = splitter.split_documents(page_docs)
    return FAISS.from_documents(pieces, embeddings)
def create_chain(vectorstore):
    """Create a conversational retrieval chain over ``vectorstore``.

    The chain uses the Groq-hosted ``gemma2-9b-it`` chat model at temperature
    0, the vector store's default retriever, a fresh conversation buffer
    memory keyed on ``chat_history``, and the ``map_reduce`` chain type.
    """
    groq_llm = ChatGroq(model="gemma2-9b-it", temperature=0)
    doc_retriever = vectorstore.as_retriever()
    history = ConversationBufferMemory(
        llm=groq_llm,
        output_key="answer",
        memory_key="chat_history",
        return_messages=True,
    )
    return ConversationalRetrievalChain.from_llm(
        llm=groq_llm,
        retriever=doc_retriever,
        memory=history,
        verbose=False,
        chain_type="map_reduce",
    )
# Stylesheet passed to gr.Blocks(css=...): loads the Noto Kufi Arabic and Cairo
# web fonts, sets right-to-left direction on the container and inputs, and
# styles the chat bubbles, the submit button and the clear button.
custom_css = """
@import url('https://fonts.googleapis.com/css2?family=Noto+Kufi+Arabic:wght@400;700&display=swap');
@import url('https://fonts.googleapis.com/css2?family=Cairo:wght@400;700&display=swap');
body {
font-family: 'Noto Kufi Arabic', sans-serif;
background: linear-gradient(135deg, #799351 0%, #A67B5B 100%);
background-size: cover;
background-position: center;
background-attachment: fixed;
}
.gradio-container {
direction: rtl;
font-family: 'Noto Kufi Arabic', sans-serif;
font-size: 16px;
max-width: 800px !important;
margin: auto !important;
background: rgba(255, 255, 255, 0.9);
border-radius: 20px;
box-shadow: 0 8px 32px 0 rgba(31, 38, 135, 0.37);
backdrop-filter: blur(4px);
border: 1px solid rgba(255, 255, 255, 0.18);
padding: 20px;
}
.gr-textbox input, .gr-textbox textarea {
text-align: right !important; /* Align text to the right */
direction: rtl !important; /* Set RTL text direction */
font-family: 'Cairo', sans-serif !important;
}
.gr-file, .gr-audio {
text-align: right !important; /* Align text to the right */
direction: rtl !important; /* Set RTL text direction */
}
label {
font-size: 14px !important;
color: #000000 !important;
background-color: #EEEEEE;
}
.arabic-chatbox .message.user {
font-family: 'Cairo', sans-serif !important;
background-color: #FFFBE6;
}
.arabic-chatbox .message.bot {
font-family: 'Cairo', sans-serif !important;
background-color: #E7FBE6;
}
#custom-logo {
display: block;
margin-left: auto;
margin-right: auto;
width: 30px; /* Set custom width */
height: 20px; /* Set custom height */
}
.custom-submit-button {
background-color: #E68369 !important;
border: none !important;
border-radius: 5px !important;
padding: 10px 20px !important;
font-size: 16px !important;
cursor: pointer !important;
}
.custom-submit-button:hover {
background-color: white !important;
color: #E6B9A6 !important;
}
#clear_btn {
background-color: #698474;
color: white;
border: none;
border-radius: 5px;
padding: 10px 20px;
font-size: 16px;
cursor: pointer;
}
#clear_btn:hover {
background-color: white;
color: #698474;
}
"""
# Check that the upload is a PDF of at most 10 MB whose extracted text is Arabic
def validate_pdf(pdf):
    """Validate an uploaded file before it is processed.

    Args:
        pdf: The uploaded file, exposing its on-disk path as ``pdf.name``,
            or ``None`` when no file is selected.

    Returns:
        A ``(message, is_valid)`` tuple: an Arabic user-facing status message
        and ``True`` only when every check passes.
    """
    if pdf is None:
        return "لم يتم اختيار أي ملف", False
    # Compare case-insensitively so "REPORT.PDF" is not rejected.
    if not pdf.name.lower().endswith(".pdf"):
        return "الملف الذي اخترته ليس PDF", False
    if os.path.getsize(pdf.name) > 10 * 1024 * 1024:
        return "حجم الملف أكبر من 10 ميجا بايت", False
    # Check if PDF content is Arabic. Reading is inside the try as well, so a
    # corrupt or unreadable PDF yields a validation message instead of an
    # unhandled exception in the UI callback.
    try:
        reader = PdfReader(pdf.name)
        # Guard against pages with no extractable text (None) and avoid
        # repeated string concatenation.
        text = "".join(page.extract_text() or "" for page in reader.pages)
        language = detect(text)
    except Exception:
        return "فشل في تحليل اللغة", False
    if language != "ar":
        return "الملف ليس باللغة العربية", False
    return "الملف صالح للدردشة", True
def upload_pdf(pdf_file):
    """Index a PDF for chatting and reset the conversation.

    Clears the module-level ``chathistory``, OCRs the file with ``load_pdf``
    and stores the resulting index in the module-level ``vectorstore``.
    Returns the Arabic success message together with ``True``.
    """
    global vectorstore, chathistory
    chathistory = []
    vectorstore = prepare_vectorstore(load_pdf(pdf_file))
    status = ("تم تحميل الملف بنجاح !", True)
    return status
def chat(user_input):
    """Answer a question about the indexed PDF and synthesise the reply as audio.

    Args:
        user_input: The question typed by the user.

    Returns:
        A 3-tuple matching the outputs wired to the chat button:
        an update for the HTML status label, the chatbot value (or a no-op
        update), and the path of the generated MP3 (or ``None``).
    """
    global chathistory, vectorstore
    if not user_input.strip():  # Check if the input is empty or contains only whitespace
        # Leave the chatbot untouched (empty gr.update()) instead of sending
        # "" to it, which discarded the conversation shown so far.
        return gr.update(value='<span style="color:red;">الرجاء إدخال سؤال.</span>'), gr.update(), None
    prompt = f"""
You are an expert Arabic-language assistant specialized in analyzing and responding to queries about Arabic PDF documents. Your responses should be precise, informative, and reflect the professional tone and structure expected in formal Arabic communication. Focus on extracting and presenting relevant information from the document clearly and systematically, while avoiding colloquial or informal language.
When responding, ensure the following:
- Your answer directly reflects the content of the document.
- If the requested information is not available in the document, clearly state that in Arabic.
- Keep your response concise yet comprehensive, addressing the question fully.
- Respond only in a professional and well-versed Arabic Language.
Question: {user_input}.
"""
    chain = create_chain(vectorstore)
    response = chain({"question": prompt})
    assistant_response = response["answer"]
    chathistory.append({"user_content": f"👤 {user_input}", "bot_content": f"🤖 {assistant_response}"})
    # Write the audio under AUDIO_FOLDER (created at import time) with a unique
    # name, rather than littering the working directory.
    audio_file = os.path.join(AUDIO_FOLDER, f"{uuid.uuid4()}.mp3")
    gTTS(text=assistant_response, lang='ar').save(audio_file)
    history_display = [(msg["user_content"], msg["bot_content"]) for msg in chathistory]
    # An empty label value clears any previous "please enter a question" warning.
    return gr.update(value=''), history_display, audio_file
image_path = "logo.png"
# Build the UI. The chat widgets start hidden and are revealed only after a
# PDF has been validated and indexed.
with gr.Blocks(css=custom_css) as demo:
    with gr.Row():
        gr.Image(image_path, show_fullscreen_button=False, show_download_button=False,
                 show_share_button=False, show_label=False, label='', container=True, height=50, width=50)
    with gr.Row():
        # "#000000": the previous five-digit "#00000" is not a valid hex colour.
        gr.Markdown("<h1 style='text-align: center; color: #000000;'>المساعد العربي ar-pdf-chat للدردشة 💬</h1>", rtl=True)
    with gr.Row():
        gr.Markdown("""
<ul style="list-style-type: disc;">
<li style="color: #6C946F; font-size: 12px;">تأكد من اختيار ملف PDF.</li>
<li style="color: #6C946F; font-size: 12px;">حجم الملف يجب أن يكون أقل من 10 ميجابايت.</li>
<li style="color: #6C946F; font-size: 12px;">يجب أن يكون المحتوى باللغة العربية.</li>
</ul>""", rtl=True)
        # NOTE(review): the source's indentation was lost; the file picker is
        # assumed to share this row with the instructions — confirm the layout.
        pdf_input = gr.File(label="اختر ملف PDF")
    with gr.Row():
        output_label = gr.HTML(value='')
    with gr.Row():
        submit_button_pdf = gr.Button("ارفع الملف", interactive=False)
    with gr.Row():
        chatbot = gr.Chatbot(label="الشات", height=400, rtl=True, show_copy_all_button=True, layout='bubble', scale=1, bubble_full_width=False)
    with gr.Row():
        chat_label = gr.HTML(value='')
    with gr.Row():
        chat_input = gr.Textbox(label="💬", rtl=True, visible=False, placeholder="أدخل سؤالك هنا ..", lines=2)
    with gr.Row():
        audio_output = gr.Audio(label="🔊", visible=False)
    with gr.Row():
        submit_button_chat = gr.Button("إرسال", interactive=True, visible=False, elem_classes="custom-submit-button", variant='primary')
    with gr.Row():
        clear_btn = gr.Button("مسح", interactive=True, visible=False, variant='secondary')

    def handle_file_upload(pdf):
        """Validate the selected file; enable the upload button only if it is valid.

        Output order: output_label, submit_button_pdf, submit_button_chat,
        chatbot, chat_input, audio_output, clear_btn. The chat widgets are
        hidden again whenever the selection changes.
        """
        message, is_valid = validate_pdf(pdf)
        # Only failures are reported here; a valid file just unlocks the button.
        label_html = '' if is_valid else f'<span style="color:red">{message}</span>'
        return (
            gr.update(value=label_html),
            gr.update(interactive=is_valid),
            # A distinct update object per hidden component.
            *(gr.update(visible=False) for _ in range(5)),
        )

    def process_pdf_and_enable_components(pdf):
        """Index the PDF, streaming a progress note first, then reveal the chat UI.

        Output order: output_label, submit_button_chat, submit_button_pdf,
        chatbot, chat_input, audio_output, clear_btn.
        """
        # Assigning to output_label.value inside a handler never reaches the
        # browser; yield the progress message so it is actually displayed, and
        # disable the upload button while the file is being processed.
        yield (
            gr.update(value='<span style="color:blue">جاري معالجة الملف...</span>'),
            gr.update(),
            gr.update(interactive=False),
            gr.update(), gr.update(), gr.update(), gr.update(),
        )
        try:
            message, is_valid = upload_pdf(pdf)
        except Exception:
            # Remove the progress note and re-enable the button before letting
            # Gradio report the error, so the UI is not left stuck.
            yield (
                gr.update(value=''),
                gr.update(),
                gr.update(interactive=True),
                gr.update(), gr.update(), gr.update(), gr.update(),
            )
            raise
        color = "green" if is_valid else "red"
        yield (
            gr.update(value=f'<span style="color:{color}">{message}</span>'),
            gr.update(visible=is_valid),
            gr.update(interactive=not is_valid),
            *(gr.update(visible=is_valid) for _ in range(4)),
        )

    def clear_chat():
        """Empty the question box and drop the current audio clip."""
        return "", None

    # When the user uploads a file, validate it and then allow PDF upload
    pdf_input.change(handle_file_upload, inputs=pdf_input, outputs=[output_label, submit_button_pdf, submit_button_chat, chatbot, chat_input, audio_output, clear_btn])
    # When the user presses the upload button, process the PDF and enable other components
    submit_button_pdf.click(process_pdf_and_enable_components, inputs=pdf_input, outputs=[output_label, submit_button_chat, submit_button_pdf, chatbot, chat_input, audio_output, clear_btn])
    clear_btn.click(clear_chat, outputs=[chat_input, audio_output])
    # Chat button connection
    submit_button_chat.click(chat, inputs=chat_input, outputs=[chat_label, chatbot, audio_output])
# Launch the Gradio app: starts serving the `demo` Blocks UI defined above,
# passing inbrowser=True to launch().
demo.launch(inbrowser=True)
|