Spaces:

MohammedNasser
/

Arabic-PDF-Chat

Runtime error

App Files Files Community

MohammedNasser committed on Sep 15, 2024

Commit

bf18e69

verified ·

1 Parent(s): 3ba55e9

Update app.py

Browse files

Files changed (1) hide show

app.py +152 -81

app.py CHANGED Viewed

@@ -1,46 +1,55 @@
 import os
 import fitz
 from dotenv import load_dotenv
 from langchain_community.document_loaders import UnstructuredPDFLoader
 from langchain_community.vectorstores import FAISS
-from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_text_splitters import CharacterTextSplitter
 from langchain_groq import ChatGroq
 from langchain.memory import ConversationBufferMemory
 from langchain.chains import ConversationalRetrievalChain
-from pdf2image import convert_from_path
-import pytesseract
 from gtts import gTTS
-import uuid
-import gradio as gr
-import warnings
-warnings.filterwarnings("ignore", category=FutureWarning)
 # Load environment variables
 load_dotenv()
 secret_key = os.getenv("GROQ_API_KEY")
 os.environ["GROQ_API_KEY"] = secret_key
 embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
-# File directories
 UPLOAD_FOLDER = 'uploads/'
-AUDIO_FOLDER = 'static/audio/'
-# Ensure directories exist
 for folder in [UPLOAD_FOLDER, AUDIO_FOLDER]:
     if not os.path.exists(folder):
         os.makedirs(folder)
 def load_pdf(file_path):
-    """
-    Load and preprocess Arabic text from a PDF file.
-    """
     pages = convert_from_path(file_path, 500)
     documents = []
-    for imgBlob in pages:
-        # Perform OCR on each image
         text = pytesseract.image_to_string(imgBlob, lang="ara")
         documents.append(text)
     return documents
@@ -49,10 +58,7 @@ def prepare_vectorstore(data):
     text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=20, separator="\n")
     texts = data
     vectorstore = FAISS.from_texts(texts, embeddings)
-    # Save FAISS index to disk
     vectorstore.save_local("faiss_index")
     return vectorstore
 def load_vectorstore():
@@ -73,73 +79,138 @@ def create_chain(vectorstore):
     return chain
 def process_pdf(pdf_file):
-    if pdf_file is not None:
-        file_path = os.path.join(UPLOAD_FOLDER, pdf_file.name)
-        pdf_file.save(file_path)
-        # Load PDF, prepare vectorstore
-        data = load_pdf(file_path)
-        vectorstore = prepare_vectorstore(data)
-        chain = create_chain(vectorstore)
-        return chain, f"تم تحميل الملف '{pdf_file.name}' بنجاح!"
-    return None, "الرجاء تحميل ملف PDF ."
-def chat_with_bot(user_input, chain):
-    if chain is None:
-        return "يرجى تحميل ملف PDF أولاً."
-    prompt=f"""
-        You are an expert Arabic-language assistant specialized in analyzing and responding to queries about Arabic PDF documents. Your responses should be precise, informative, and reflect the professional tone and structure expected in formal Arabic communication. Focus on extracting and presenting relevant information from the document clearly and systematically, while avoiding colloquial or informal language.
-        When responding, ensure the following:
-           - Your answer directly reflects the content of the document.
-           - If the requested information is not available in the document, clearly state that.
-           - Keep your response concise yet comprehensive, addressing the question fully.
-           - Always respond in formal Arabic, without using English.\n
-        Question: {user_input}\n
-        Helpful Answer:"""
     response = chain({"question": prompt})
     assistant_response = response["answer"]
-    # Generate and save audio response
-    audio_id = str(uuid.uuid4())
-    audio_file = f"{audio_id}.mp3"
     tts = gTTS(text=assistant_response, lang='ar')
     tts.save(os.path.join(AUDIO_FOLDER, audio_file))
-    return assistant_response, f"{AUDIO_FOLDER}/{audio_file}"
-# Gradio app interface
-def chatbot_interface(pdf_file, user_input):
-    chain, message = process_pdf(pdf_file)
-    if user_input and chain:
-        response_text, audio_path = chat_with_bot(user_input, chain)
-        return response_text, audio_path
-    else:
-        return "يرجى إدخال السؤال.", None
-with gr.Blocks() as demo:
-    gr.Markdown("<h1 style='text-align:center;'>الشات بوت العربي لـ PDF</h1>")
     with gr.Row():
-        pdf_input = gr.File(label="اختر ملف 📑 PDF للدردشة", type="filepath")
-    with gr.Row():
-        user_input = gr.Textbox(label="سؤالك")
-    with gr.Row():
-        submit_button = gr.Button("رفع وبدء الدردشة")
-    with gr.Row():
-        output_text = gr.Textbox(label="الجواب")
-        audio_output = gr.Audio(label="الرد الصوتي")
-    submit_button.click(chatbot_interface, inputs=[pdf_input, user_input], outputs=[output_text, audio_output])
 demo.launch()

+import gradio as gr
 import os
 import fitz
 from dotenv import load_dotenv
 from langchain_community.document_loaders import UnstructuredPDFLoader
 from langchain_community.vectorstores import FAISS
+from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_text_splitters import CharacterTextSplitter
 from langchain_groq import ChatGroq
 from langchain.memory import ConversationBufferMemory
 from langchain.chains import ConversationalRetrievalChain
 from gtts import gTTS
+import sys
+try:
+    import pytesseract
+    from pdf2image import convert_from_path
+except ImportError as e:
+    print(f"Error: {e}. Please make sure all system dependencies are installed.")
+    sys.exit(1)
+# Rest of your imports...
+# Set the Tesseract path
+pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'
+# Test Tesseract installation
+try:
+    pytesseract.get_languages()
+except pytesseract.TesseractNotFoundError:
+    print("Error: Tesseract is not installed or not in the system PATH.")
+    sys.exit(1)
 # Load environment variables
 load_dotenv()
 secret_key = os.getenv("GROQ_API_KEY")
 os.environ["GROQ_API_KEY"] = secret_key
 embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
+# Ensure the necessary folders exist
 UPLOAD_FOLDER = 'uploads/'
+AUDIO_FOLDER = 'audio/'
 for folder in [UPLOAD_FOLDER, AUDIO_FOLDER]:
     if not os.path.exists(folder):
         os.makedirs(folder)
 def load_pdf(file_path):
     """Run Arabic OCR over every page of a PDF and return the page texts.
     The PDF at ``file_path`` is rendered to images with ``convert_from_path``
     (second argument 500) and each image is passed to
     ``pytesseract.image_to_string`` with ``lang="ara"``.
     Returns a list with one recognised-text string per page, in page order.
     """
     page_images = convert_from_path(file_path, 500)
     return [pytesseract.image_to_string(image, lang="ara") for image in page_images]
     text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=20, separator="\n")
     texts = data
     vectorstore = FAISS.from_texts(texts, embeddings)
     vectorstore.save_local("faiss_index")
     return vectorstore
 def load_vectorstore():
     return chain
 def process_pdf(pdf_file):
     """Store an uploaded PDF under UPLOAD_FOLDER, OCR it and build the vector index.
     Args:
         pdf_file: The upload as delivered by the UI: a path (``str`` or
             ``os.PathLike``), or an object exposing ``.name`` and optionally
             ``.read()``. ``None`` means nothing was uploaded.
     Returns:
         A status message string for the UI.
     Raises:
         FileNotFoundError: If the upload is neither an existing file on disk
             nor a readable file-like object.
     """
     import shutil  # local import: only this function needs it
     if pdf_file is None:
         return "Please upload a PDF file first."
     if isinstance(pdf_file, (str, os.PathLike)):
         source_path = os.fspath(pdf_file)
     else:
         source_path = pdf_file.name
     # Keep only the base name: joining an absolute temp path onto
     # UPLOAD_FOLDER discards the folder, so the old code reopened the upload
     # itself in "wb" mode and truncated it before reading it.
     file_path = os.path.join(UPLOAD_FOLDER, os.path.basename(source_path))
     if os.path.isfile(source_path):
         if os.path.abspath(source_path) != os.path.abspath(file_path):
             shutil.copyfile(source_path, file_path)
     elif hasattr(pdf_file, "read"):
         # In-memory upload object: persist its bytes as the original code did.
         with open(file_path, "wb") as f:
             f.write(pdf_file.read())
     else:
         raise FileNotFoundError(source_path)
     data = load_pdf(file_path)
     # prepare_vectorstore persists the index itself; its return value is not needed here.
     prepare_vectorstore(data)
     return "PDF processed successfully. You can now start chatting!"
+def chat(user_input, history):
+    vectorstore = load_vectorstore()
+    chain = create_chain(vectorstore)
+    prompt = f"""
+    You are an expert Arabic-language assistant specialized in analyzing and responding to queries about Arabic PDF documents. Your responses should be precise, informative, and reflect the professional tone and structure expected in formal Arabic communication. Focus on extracting and presenting relevant information from the document clearly and systematically, while avoiding colloquial or informal language.
+    When responding, ensure the following:
+       - Your answer directly reflects the content of the document.
+       - If the requested information is not available in the document, clearly state that.
+       - Keep your response concise yet comprehensive, addressing the question fully.
+       - Always respond in formal Arabic, without using English.
+    Question: {user_input}
+    Helpful Answer:"""
     response = chain({"question": prompt})
     assistant_response = response["answer"]
+    # Generate audio file
     tts = gTTS(text=assistant_response, lang='ar')
+    audio_file = f"response_{len(history)}.mp3"
     tts.save(os.path.join(AUDIO_FOLDER, audio_file))
+    return assistant_response, audio_file
 # Stylesheet handed to gr.Blocks(css=custom_css): page background and font,
 # the central container card, headings, buttons and chat-message bubbles.
+custom_css = """
+body {
+    font-family: 'Noto Kufi Arabic', sans-serif;
+    background: linear-gradient(135deg, #799351 0%, #A67B5B 100%);
+    background-size: cover;
+    background-position: center;
+    background-attachment: fixed;
+}
+.gradio-container {
+    max-width: 800px !important;
+    margin: auto !important;
+    background: rgba(255, 255, 255, 0.9);
+    border-radius: 20px;
+    box-shadow: 0 8px 32px 0 rgba(31, 38, 135, 0.37);
+    backdrop-filter: blur(4px);
+    border: 1px solid rgba(255, 255, 255, 0.18);
+    padding: 20px;
+}
+h1, h2, h3 {
+    color: #1A4D2E;
+    font-weight: bold;
+    text-align: center;
+}
+p {
+    color: #A89F91;
+}
+.gradio-button {
+    background-color: #5F6F65 !important;
+    color: #FFFFFF !important;
+}
+.gradio-button:hover {
+    background-color: #FFFFFF !important;
+    color: #5F6F65 !important;
+}
+.chat-message {
+    border-radius: 10px;
+    padding: 10px;
+    margin-bottom: 10px;
+}
+.chat-message.user {
+    background-color: #E7F0DC;
+}
+.chat-message.bot {
+    background-color: #F7EED3;
+}
+.chat-message::before {
+    content: '';
+    display: inline-block;
+    width: 24px;
+    height: 24px;
+    background-size: contain;
+    background-repeat: no-repeat;
+    margin-right: 10px;
+    vertical-align: middle;
+}
+.chat-message.user::before {
+    content: '👤';
+}
+.chat-message.bot::before {
+    content: '🤖';
+}
+"""
+# Gradio interface
 # Layout: two Markdown headings, a row with the PDF upload and the "process"
 # button, the chat interface bound to chat(), and an audio player.
+with gr.Blocks(css=custom_css) as demo:
+    gr.Markdown("# ديمو بوت للقاء مركز حضرموت للدراسات التاريخية")
+    gr.Markdown("## المنعقد السبت 14 - سبتمبر 2024")
     with gr.Row():
+        pdf_input = gr.File(label="اختر ملف PDF للدردشة")
+        process_button = gr.Button("رفع وبدء الدردشة")
+    chat_interface = gr.ChatInterface(
+        chat,
+        chatbot=gr.Chatbot(height=400),
+        textbox=gr.Textbox(placeholder="اكتب سؤالك هنا...", container=False),
+        title="الدردشة مع البوت",
+        description="اسأل أي سؤال عن محتوى الملف PDF",
+        theme="soft",
+        examples=["ما هو موضوع الوثيقة؟", "من هم الأشخاص المذكورون؟", "ما هي التواريخ الرئيسية المذكورة؟"],
         # NOTE(review): presumably cache_examples=True runs chat() on the
         # examples at startup, before any PDF has been processed - confirm.
+        cache_examples=True,
+        retry_btn=None,
+        undo_btn="مسح آخر رسالة",
+        clear_btn="مسح المحادثة",
+    )
+    audio_output = gr.Audio(label="الرد الصوتي")
     # The status string returned by process_pdf is written into the chat
     # textbox. NOTE(review): confirm this is intended rather than a dedicated
     # status component.
+    process_button.click(process_pdf, inputs=[pdf_input], outputs=[chat_interface.textbox])
     # NOTE(review): verify that gr.ChatInterface exposes a `submit` event in
     # the installed Gradio version (the textbox's own submit event may be what
     # is meant). The lambda forwards the last bot message (y[-1][1]) to the
     # audio player, which looks like reply text rather than the MP3 saved by
     # chat() - confirm.
+    chat_interface.submit(lambda x, y: y[-1][1], inputs=[chat_interface.textbox, chat_interface.chatbot], outputs=[audio_output])
 demo.launch()