MohammedNasser committed on
Commit
d800d23
1 Parent(s): 345a26b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -79
app.py CHANGED
@@ -37,63 +37,31 @@ for folder in [UPLOAD_FOLDER, AUDIO_FOLDER]:
37
  if not os.path.exists(folder):
38
  os.makedirs(folder)
39
 
40
-
41
  def load_pdf(file_path):
42
  """Load and preprocess Arabic text from a PDF file."""
43
- pages = convert_from_path(file_path, 500)
 
 
 
 
 
44
  documents = []
45
  for pageNum, imgBlob in enumerate(pages):
46
- text = pytesseract.image_to_string(imgBlob, lang="ara")
47
- documents.append(text)
48
- return documents
49
-
 
 
50
 
51
- import os
52
- from langchain.vectorstores import FAISS
53
- from huggingface_hub import Repository
54
-
55
- def save_faiss_index_to_hub(vectorstore, repo_id="MohammedNasser/faiss-index"):
56
- index_dir = "faiss_index"
57
-
58
- # Ensure the index directory exists
59
- if not os.path.exists(index_dir):
60
- os.makedirs(index_dir)
61
-
62
- # Save FAISS index locally
63
- vectorstore.save_local(index_dir)
64
-
65
- # Initialize Hugging Face repository
66
- repo = Repository(local_dir=index_dir, clone_from=repo_id, repo_type="dataset")
67
-
68
- # Push the FAISS index files to the Hugging Face Hub
69
- repo.push_to_hub(commit_message="Pushing FAISS index")
70
-
71
- print(f"FAISS index saved to Hugging Face Hub: {repo_id}")
72
 
73
  def prepare_vectorstore(data):
74
- index_dir = "faiss_index"
75
- if not os.path.exists(index_dir):
76
- os.makedirs(index_dir)
77
  text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=20, separator="\n")
78
- texts = data
79
  vectorstore = FAISS.from_texts(texts, embeddings)
80
- save_faiss_index_to_hub(vectorstore)
81
- return vectorstore
82
-
83
-
84
- def load_vectorstore(repo_id="MohammedNasser/faiss-index"):
85
- index_dir = "faiss_index"
86
 
87
- # Ensure the index directory exists
88
- if not os.path.exists(index_dir):
89
- os.makedirs(index_dir)
90
-
91
-
92
-
93
- # Download the FAISS index files from Hugging Face Hub
94
- hf_hub_download(repo_id=repo_id, filename="index.faiss", local_dir=index_dir, repo_type="dataset")
95
- hf_hub_download(repo_id=repo_id, filename="index.json", local_dir=index_dir, repo_type="dataset")
96
- vectorstore = FAISS.load_local(index_dir, embeddings, allow_dangerous_deserialization=True)
97
  return vectorstore
98
 
99
  def create_chain(vectorstore):
@@ -108,41 +76,56 @@ def create_chain(vectorstore):
108
  chain_type="map_reduce"
109
  )
110
  return chain
111
-
112
  def process_pdf(pdf_file):
 
113
  file_path = os.path.join(UPLOAD_FOLDER, pdf_file.name)
114
- with open(file_path, "wb") as f:
115
- f.write(pdf_file.read())
116
- data = load_pdf(file_path)
117
- vectorstore = prepare_vectorstore(data)
118
- return "PDF processed successfully. You can now start chatting!"
 
 
 
 
119
 
120
  def chat(user_input, history):
121
- vectorstore = load_vectorstore()
122
- chain = create_chain(vectorstore)
123
-
124
- prompt = f"""
125
- You are an expert Arabic-language assistant specialized in analyzing and responding to queries about Arabic PDF documents. Your responses should be precise, informative, and reflect the professional tone and structure expected in formal Arabic communication. Focus on extracting and presenting relevant information from the document clearly and systematically, while avoiding colloquial or informal language.
126
-
127
- When responding, ensure the following:
128
- - Your answer directly reflects the content of the document.
129
- - If the requested information is not available in the document, clearly state that.
130
- - Keep your response concise yet comprehensive, addressing the question fully.
131
- - Always respond in formal Arabic, without using English.
132
-
133
- Question: {user_input}
134
- Helpful Answer:"""
135
-
136
- response = chain({"question": prompt})
137
- assistant_response = response["answer"]
138
-
139
- # Generate audio file
140
- tts = gTTS(text=assistant_response, lang='ar')
141
- audio_file = f"response_{len(history)}.mp3"
142
- tts.save(os.path.join(AUDIO_FOLDER, audio_file))
143
-
144
- return assistant_response, audio_file
145
-
 
 
 
 
 
 
 
 
 
 
146
  custom_css = """
147
  body {
148
  font-family: 'Noto Kufi Arabic', sans-serif;
@@ -216,6 +199,7 @@ p {
216
  content: '🤖';
217
  }
218
  """
 
219
  # Gradio interface
220
  with gr.Blocks(css=custom_css) as demo:
221
  gr.Markdown("# ديمو بوت للقاء مركز حضرموت للدراسات التاريخية")
@@ -251,4 +235,3 @@ with gr.Blocks(css=custom_css) as demo:
251
 
252
  demo.launch()
253
 
254
-
 
37
  if not os.path.exists(folder):
38
  os.makedirs(folder)
39
 
40
+ vectorstore=None
41
def load_pdf(file_path):
    """Load and preprocess Arabic text from a PDF file.

    Each page is rasterized at 500 DPI and run through Tesseract OCR with
    the Arabic language model; the per-page text is returned as a list of
    strings (one entry per page, empty string for pages where OCR failed).
    Returns an empty list if the PDF cannot be rasterized at all.
    """
    try:
        pages = convert_from_path(file_path, 500)
    except Exception as err:
        print(f"Error loading PDF: {err}")
        return []

    page_texts = []
    for page_index, page_image in enumerate(pages):
        try:
            page_texts.append(pytesseract.image_to_string(page_image, lang="ara"))
        except Exception as err:
            print(f"Error processing page {page_index}: {err}")
            # Keep one entry per page even on OCR failure so page
            # alignment is preserved for downstream consumers.
            page_texts.append("")

    return page_texts
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
def prepare_vectorstore(data):
    """Split raw page texts into chunks and index them in a FAISS vector store.

    Args:
        data: list of plain-text strings (one per PDF page), as produced by
            ``load_pdf``.

    Returns:
        A FAISS vector store built over the chunks using the module-level
        ``embeddings``.
    """
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=20, separator="\n")
    # BUG FIX: ``load_pdf`` yields plain strings, but ``split_documents``
    # expects langchain Document objects — and would then hand Documents to
    # ``FAISS.from_texts``, which expects strings. Split each page's text
    # directly with ``split_text`` instead.
    texts = [chunk for page_text in data for chunk in text_splitter.split_text(page_text)]
    vectorstore = FAISS.from_texts(texts, embeddings)
    return vectorstore
66
 
67
  def create_chain(vectorstore):
 
76
  chain_type="map_reduce"
77
  )
78
  return chain
79
+
80
def process_pdf(pdf_file):
    """Persist an uploaded PDF, OCR it, and (re)build the global vector store.

    Args:
        pdf_file: uploaded file object with ``.name`` and ``.read()``.
            NOTE(review): newer Gradio file components may pass a tempfile
            path instead of a readable object — confirm against the Gradio
            version in use.

    Returns:
        A user-facing status string. On success the module-level
        ``vectorstore`` is replaced with an index over the new document.
    """
    global vectorstore
    file_path = os.path.join(UPLOAD_FOLDER, pdf_file.name)
    try:
        with open(file_path, "wb") as f:
            f.write(pdf_file.read())
        data = load_pdf(file_path)
        # ROBUSTNESS: load_pdf returns [] when the PDF cannot be rasterized;
        # indexing an empty corpus would fail or yield a useless store.
        if not data:
            return "Error processing PDF."
        vectorstore = prepare_vectorstore(data)
        return "PDF processed successfully. You can now start chatting!"
    except Exception as e:
        print(f"Error processing PDF: {e}")
        return "Error processing PDF."
92
 
93
def chat(user_input, history):
    """Answer *user_input* from the processed PDF and synthesize Arabic audio.

    Returns ``(answer_text, audio_filename)``. The audio filename is the
    bare mp3 name saved under ``AUDIO_FOLDER``; it is empty when no PDF has
    been processed yet, when TTS fails, or on any other error.
    """
    if vectorstore is None:
        return "Please process a PDF file first.", ""

    try:
        qa_chain = create_chain(vectorstore)
        prompt = f"""
    You are an expert Arabic-language assistant specialized in analyzing and responding to queries about Arabic PDF documents. Your responses should be precise, informative, and reflect the professional tone and structure expected in formal Arabic communication. Focus on extracting and presenting relevant information from the document clearly and systematically, while avoiding colloquial or informal language.

    When responding, ensure the following:
    - Your answer directly reflects the content of the document.
    - If the requested information is not available in the document, clearly state that.
    - Keep your response concise yet comprehensive, addressing the question fully.
    - Always respond in formal Arabic, without using English.

    Question: {user_input}
    Helpful Answer:"""

        result = qa_chain({"question": prompt})
        answer = result["answer"]

        # Synthesize the answer as Arabic speech; fall back to no audio
        # rather than failing the whole reply when TTS breaks.
        audio_file = f"response_{len(history)}.mp3"
        try:
            gTTS(text=answer, lang='ar').save(os.path.join(AUDIO_FOLDER, audio_file))
        except Exception as e:
            print(f"Error generating audio file: {e}")
            audio_file = ""

        return answer, audio_file

    except Exception as e:
        print(f"Error during chat: {e}")
        return "An error occurred while processing your request.", ""
128
+
129
  custom_css = """
130
  body {
131
  font-family: 'Noto Kufi Arabic', sans-serif;
 
199
  content: '🤖';
200
  }
201
  """
202
+
203
  # Gradio interface
204
  with gr.Blocks(css=custom_css) as demo:
205
  gr.Markdown("# ديمو بوت للقاء مركز حضرموت للدراسات التاريخية")
 
235
 
236
  demo.launch()
237