Spaces:
Sleeping
Sleeping
import gradio as gr | |
import os | |
import subprocess | |
import uuid | |
import fitz | |
from dotenv import load_dotenv | |
from langchain_community.document_loaders import UnstructuredPDFLoader | |
from langchain_community.vectorstores import FAISS | |
from langchain_huggingface import HuggingFaceEmbeddings | |
from langchain_text_splitters import CharacterTextSplitter | |
from langchain_groq import ChatGroq | |
from langchain.memory import ConversationBufferMemory | |
from langchain.chains import ConversationalRetrievalChain | |
from gtts import gTTS | |
import sys | |
import pytesseract | |
from pdf2image import convert_from_path | |
from huggingface_hub import Repository, login | |
from huggingface_hub import hf_hub_download | |
from langchain.schema import Document | |
# Load environment variables (from a local .env file when one is present).
load_dotenv()
secret_key = os.getenv("GROQ_API_KEY")
hf_key = os.getenv("HF_TOKEN")

# Fail fast with an actionable message: assigning None into os.environ would
# otherwise abort start-up with "TypeError: str expected, not NoneType".
if not secret_key:
    raise RuntimeError(
        "GROQ_API_KEY is not set; define it in the environment or in a .env file."
    )
os.environ["GROQ_API_KEY"] = secret_key

# Authenticate against the Hugging Face Hub only when a token is configured.
# Without a token, login() falls back to prompting for one, which cannot be
# answered in a headless deployment.
if hf_key:
    login(token=hf_key, add_to_git_credential=True)

# Multilingual sentence-embedding model shared by every vector store built below.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
)

# Ensure the necessary folders exist
UPLOAD_FOLDER = 'uploads/'
AUDIO_FOLDER = 'audio/'
for folder in (UPLOAD_FOLDER, AUDIO_FOLDER):
    # exist_ok avoids the check-then-create race of a separate os.path.exists().
    os.makedirs(folder, exist_ok=True)
def load_pdf(file_path, *, dpi=500, lang="ara"):
    """Rasterise a PDF and OCR each page into text.

    Args:
        file_path: Path of the PDF file to read.
        dpi: Resolution used when rendering pages to images. Defaults to
            500, the value this function always used.
        lang: Tesseract language code used for OCR. Defaults to "ara"
            (Arabic), the value this function always used.

    Returns:
        A list with one string per page, in page order. A page whose OCR
        fails contributes an empty string so positions still match page
        numbers. An empty list is returned when the PDF cannot be rendered
        at all.
    """
    try:
        page_images = convert_from_path(file_path, dpi)
    except Exception as e:
        # Best-effort boundary: report the problem and let the caller decide
        # what to do with an empty result.
        print(f"Error loading PDF: {e}")
        return []

    page_texts = []
    for page_index, page_image in enumerate(page_images):
        try:
            page_texts.append(pytesseract.image_to_string(page_image, lang=lang))
        except Exception as e:
            print(f"Error processing page {page_index}: {e}")
            page_texts.append("")  # Keep a placeholder for pages where OCR failed
    return page_texts
def prepare_vectorstore(data):
    """Build a FAISS vector store from a list of page texts.

    Args:
        data: Iterable of strings, e.g. the per-page texts returned by
            ``load_pdf``. Blank entries are ignored.

    Returns:
        A FAISS vector store indexing the text, split into chunks of up to
        1000 characters (20 characters overlap, split on newlines) and
        embedded with the module-level ``embeddings`` model.

    Raises:
        ValueError: If ``data`` holds no non-blank text, or splitting yields
            no chunks. There is nothing to index in that case, so the
            problem is reported here instead of surfacing as an obscure
            failure inside the vector store.
    """
    # Validate before doing any work: a failed or empty OCR pass yields [] or
    # a list of empty strings.
    texts = [text for text in (data or []) if text and text.strip()]
    if not texts:
        raise ValueError("No text to index: the document produced no readable content.")

    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=20, separator="\n")
    # Create Document objects from the input data
    documents = [Document(page_content=text) for text in texts]
    # Split the documents into chunks
    chunks = text_splitter.split_documents(documents)
    if not chunks:
        raise ValueError("No text to index: splitting the document produced no chunks.")
    # Create the vector store
    return FAISS.from_documents(chunks, embeddings)
def create_chain(vectorstore):
    """Assemble a conversational retrieval chain on top of ``vectorstore``.

    The chain uses the Groq-hosted "gemma2-9b-it" chat model at temperature 0,
    the vector store's default retriever, a fresh conversation buffer memory
    (stored under "chat_history", reading the "answer" output), and the
    "map_reduce" document-combination strategy.

    Args:
        vectorstore: Vector store exposing ``as_retriever()``.

    Returns:
        The configured ``ConversationalRetrievalChain``.
    """
    groq_llm = ChatGroq(model="gemma2-9b-it", temperature=0)
    doc_retriever = vectorstore.as_retriever()
    history = ConversationBufferMemory(
        llm=groq_llm,
        output_key="answer",
        memory_key="chat_history",
        return_messages=True,
    )
    chain_options = {
        "llm": groq_llm,
        "retriever": doc_retriever,
        "memory": history,
        "verbose": False,
        "chain_type": "map_reduce",
    }
    return ConversationalRetrievalChain.from_llm(**chain_options)
# Stylesheet handed to gr.Blocks(css=custom_css) below. It sets the page font
# and gradient background, constrains and frames the main container, and
# styles headings, paragraphs, buttons and chat-message bubbles.
# NOTE(review): the .chat-message rules only take effect if some rendered
# element carries that class; the UI below is built from File/Textbox/Audio/
# Button components, so confirm whether these rules are still needed.
custom_css = """
body {
font-family: 'Noto Kufi Arabic', sans-serif;
background: linear-gradient(135deg, #799351 0%, #A67B5B 100%);
background-size: cover;
background-position: center;
background-attachment: fixed;
}
.gradio-container {
max-width: 800px !important;
margin: auto !important;
background: rgba(255, 255, 255, 0.9);
border-radius: 20px;
box-shadow: 0 8px 32px 0 rgba(31, 38, 135, 0.37);
backdrop-filter: blur(4px);
border: 1px solid rgba(255, 255, 255, 0.18);
padding: 20px;
}
h1, h2, h3 {
color: #1A4D2E;
font-weight: bold;
text-align: center;
}
p {
color: #A89F91;
}
.gradio-button {
background-color: #5F6F65 !important;
color: #FFFFFF !important;
}
.gradio-button:hover {
background-color: #FFFFFF !important;
color: #5F6F65 !important;
}
.chat-message {
border-radius: 10px;
padding: 10px;
margin-bottom: 10px;
}
.chat-message.user {
background-color: #E7F0DC;
}
.chat-message.bot {
background-color: #F7EED3;
}
.chat-message::before {
content: '';
display: inline-block;
width: 24px;
height: 24px;
background-size: contain;
background-repeat: no-repeat;
margin-right: 10px;
vertical-align: middle;
}
.chat-message.user::before {
content: '👤';
}
.chat-message.bot::before {
content: '🤖';
}
"""
# Vector store of the most recently processed upload, keyed by file path, so
# the expensive OCR + embedding pass runs once per PDF, not once per question.
_indexed_upload = {"path": None, "vectorstore": None}


def _resolve_upload_path(pdf_file):
    """Return the filesystem path behind a Gradio file value, or None."""
    if pdf_file is None:
        return None
    # The File component may hand over a plain path string or a temp-file
    # wrapper exposing `.name`, depending on Gradio version/configuration.
    return getattr(pdf_file, "name", pdf_file)


def _vectorstore_for(pdf_path):
    """Return the vector store for ``pdf_path``; None if no text was extracted."""
    if _indexed_upload["path"] == pdf_path and _indexed_upload["vectorstore"] is not None:
        return _indexed_upload["vectorstore"]

    page_texts = load_pdf(pdf_path)
    if not any(text and text.strip() for text in page_texts):
        return None

    store = prepare_vectorstore(page_texts)
    _indexed_upload.update(path=pdf_path, vectorstore=store)
    return store


# Define the logic for processing the PDF and generating responses
def process_pdf_and_chat(pdf_file, user_input):
    """Answer a question about the uploaded PDF and synthesise the answer as speech.

    Args:
        pdf_file: Value of the Gradio File input (path string, temp-file
            wrapper, or None when nothing was uploaded).
        user_input: The user's question.

    Returns:
        A ``(text, audio_path)`` tuple for the response textbox and the audio
        player. ``audio_path`` is None when there is nothing to read aloud
        (missing upload, blank question, unreadable PDF, or empty answer);
        in the first three cases ``text`` is an Arabic message explaining
        the problem.
    """
    pdf_path = _resolve_upload_path(pdf_file)
    if not pdf_path:
        return "يرجى رفع ملف PDF أولاً.", None
    if not user_input or not user_input.strip():
        return "يرجى إدخال سؤال.", None

    vectorstore = _vectorstore_for(pdf_path)
    if vectorstore is None:
        return "تعذر استخراج نص من ملف PDF.", None

    # A new chain (with new, empty memory) is built per question, exactly as
    # before; only the vector store is reused between questions.
    chain = create_chain(vectorstore)
    prompt = f"""
You are an expert Arabic-language assistant specialized in analyzing and responding to queries about Arabic PDF documents. Your responses should be precise, informative, and reflect the professional tone and structure expected in formal Arabic communication. Focus on extracting and presenting relevant information from the document clearly and systematically, while avoiding colloquial or informal language.
When responding, ensure the following:
- Your answer directly reflects the content of the document.
- If the requested information is not available in the document, clearly state that.
- Keep your response concise yet comprehensive, addressing the question fully.
- Always respond in formal Arabic, without using English.\n
Question: {user_input}\n
Helpful Answer:"""
    response = chain.invoke({"question": prompt})
    assistant_response = response["answer"]

    # gTTS refuses empty text, so skip speech synthesis for an empty answer.
    if not assistant_response or not assistant_response.strip():
        return assistant_response, None

    # Write the audio under AUDIO_FOLDER with a unique name so concurrent
    # answers never overwrite each other and files stay out of the CWD.
    audio_file = os.path.join(AUDIO_FOLDER, f"{uuid.uuid4()}.mp3")
    gTTS(text=assistant_response, lang='ar').save(audio_file)
    return assistant_response, audio_file


# Create the Gradio interface
with gr.Blocks(css=custom_css) as demo:
    pdf_input = gr.File(label="ارفع ملف PDF")
    chat_input = gr.Textbox(label="أدخل سؤالك هنا")
    chat_output = gr.Textbox(label="الرد الآلي")
    audio_output = gr.Audio(label="استمع إلى الرد")
    submit_button = gr.Button("إرسال")
    # Connect the button to the processing function
    submit_button.click(
        process_pdf_and_chat,
        inputs=[pdf_input, chat_input],
        outputs=[chat_output, audio_output],
    )

# Launch the Gradio app
demo.launch()