Spaces:
Sleeping
Sleeping
MohammedNasser
committed on
Commit
•
d800d23
1
Parent(s):
345a26b
Update app.py
Browse files
app.py
CHANGED
@@ -37,63 +37,31 @@ for folder in [UPLOAD_FOLDER, AUDIO_FOLDER]:
|
|
37 |
if not os.path.exists(folder):
|
38 |
os.makedirs(folder)
|
39 |
|
40 |
-
|
41 |
def load_pdf(file_path):
|
42 |
"""Load and preprocess Arabic text from a PDF file."""
|
43 |
-
|
|
|
|
|
|
|
|
|
|
|
44 |
documents = []
|
45 |
for pageNum, imgBlob in enumerate(pages):
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
|
|
|
|
50 |
|
51 |
-
|
52 |
-
from langchain.vectorstores import FAISS
|
53 |
-
from huggingface_hub import Repository
|
54 |
-
|
55 |
-
def save_faiss_index_to_hub(vectorstore, repo_id="MohammedNasser/faiss-index"):
    """Save a vector store locally and push the files to a Hub dataset repo.

    Args:
        vectorstore: Object exposing ``save_local(folder)``; presumably a
            LangChain ``FAISS`` store -- confirm against the caller.
        repo_id: Hugging Face dataset repository that receives the files.

    Raises:
        Nothing is caught here: errors from cloning, saving or pushing
        propagate to the caller.
    """
    index_dir = "faiss_index"

    # Make sure the working directory exists (an empty folder is fine to
    # clone into).
    os.makedirs(index_dir, exist_ok=True)

    # Clone BEFORE writing the index. The previous order wrote the index
    # files first and cloned afterwards, and Repository refuses to clone
    # into a non-empty folder that is not already a git repository.
    repo = Repository(local_dir=index_dir, clone_from=repo_id, repo_type="dataset")

    # Write the index files into the working tree of the clone.
    vectorstore.save_local(index_dir)

    # Commit and push the freshly written files.
    repo.push_to_hub(commit_message="Pushing FAISS index")

    print(f"FAISS index saved to Hugging Face Hub: {repo_id}")
|
72 |
|
73 |
def prepare_vectorstore(data):
    """Chunk page texts, index them with FAISS and push the index to the Hub.

    Args:
        data: Iterable of page texts. Items may be plain strings or objects
            carrying a ``page_content`` attribute; blank items are ignored.

    Returns:
        The store returned by ``FAISS.from_texts``.

    Raises:
        ValueError: If ``data`` contains no non-blank text.
    """
    # Validate first so an empty document fails with a clear message instead
    # of an opaque error from inside the indexing call.
    pages = []
    for item in data or []:
        content = getattr(item, "page_content", item)
        if isinstance(content, str) and content.strip():
            pages.append(content)
    if not pages:
        raise ValueError("No text to index: the document contains no readable text.")

    index_dir = "faiss_index"
    os.makedirs(index_dir, exist_ok=True)

    # The splitter used to be constructed and then ignored (texts = data);
    # apply it so each indexed entry is a bounded chunk rather than a page.
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=20, separator="\n")
    texts = []
    for page in pages:
        texts.extend(text_splitter.split_text(page))

    vectorstore = FAISS.from_texts(texts, embeddings)
    save_faiss_index_to_hub(vectorstore)
    return vectorstore
|
82 |
-
|
83 |
-
|
84 |
-
def load_vectorstore(repo_id="MohammedNasser/faiss-index"):
    """Download a saved FAISS index from a Hub dataset repo and load it.

    Args:
        repo_id: Hugging Face dataset repository holding the index files.

    Returns:
        The store returned by ``FAISS.load_local``.

    Raises:
        Nothing is caught here: download and load errors propagate.
    """
    index_dir = "faiss_index"

    # Ensure the download target exists.
    os.makedirs(index_dir, exist_ok=True)

    # LangChain's FAISS.save_local writes "index.faiss" and "index.pkl", and
    # load_local reads those same two files. The earlier code requested
    # "index.json", which save_local never produces.
    for filename in ("index.faiss", "index.pkl"):
        hf_hub_download(repo_id=repo_id, filename=filename, local_dir=index_dir, repo_type="dataset")

    # SECURITY NOTE(review): index.pkl is unpickled on load. Keep
    # allow_dangerous_deserialization=True only while repo_id points at a
    # repository whose contents are fully trusted.
    vectorstore = FAISS.load_local(index_dir, embeddings, allow_dangerous_deserialization=True)
    return vectorstore
|
98 |
|
99 |
def create_chain(vectorstore):
|
@@ -108,41 +76,56 @@ def create_chain(vectorstore):
|
|
108 |
chain_type="map_reduce"
|
109 |
)
|
110 |
return chain
|
111 |
-
|
112 |
def process_pdf(pdf_file):
|
|
|
113 |
file_path = os.path.join(UPLOAD_FOLDER, pdf_file.name)
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
|
|
|
|
|
|
|
|
119 |
|
120 |
def chat(user_input, history):
|
121 |
-
vectorstore
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
146 |
custom_css = """
|
147 |
body {
|
148 |
font-family: 'Noto Kufi Arabic', sans-serif;
|
@@ -216,6 +199,7 @@ p {
|
|
216 |
content: '🤖';
|
217 |
}
|
218 |
"""
|
|
|
219 |
# Gradio interface
|
220 |
with gr.Blocks(css=custom_css) as demo:
|
221 |
gr.Markdown("# ديمو بوت للقاء مركز حضرموت للدراسات التاريخية")
|
@@ -251,4 +235,3 @@ with gr.Blocks(css=custom_css) as demo:
|
|
251 |
|
252 |
demo.launch()
|
253 |
|
254 |
-
|
|
|
37 |
if not os.path.exists(folder):
|
38 |
os.makedirs(folder)
|
39 |
|
40 |
+
vectorstore=None
|
41 |
def load_pdf(file_path):
    """Render a PDF to page images and OCR each page as Arabic text.

    Args:
        file_path: Path of the PDF handed to ``convert_from_path``.

    Returns:
        A list with one string per rendered page, in page order. A page whose
        OCR fails contributes an empty string; if the PDF cannot be rendered
        at all the list is empty. Errors are printed, never raised.
    """

    def _ocr_page(pageNum, imgBlob):
        # OCR one page; failures are reported and mapped to "" so the
        # result keeps one entry per page.
        try:
            return pytesseract.image_to_string(imgBlob, lang="ara")
        except Exception as e:
            print(f"Error processing page {pageNum}: {e}")
            return ""

    try:
        # Second positional argument is 500 -- presumably the render DPI;
        # confirm against the pdf2image signature.
        pages = convert_from_path(file_path, 500)
    except Exception as e:
        print(f"Error loading PDF: {e}")
        return []

    return [_ocr_page(pageNum, imgBlob) for pageNum, imgBlob in enumerate(pages)]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
|
60 |
def prepare_vectorstore(data):
    """Chunk page texts and build a FAISS vector store from the chunks.

    Args:
        data: Iterable of page texts. Items may be plain strings (what
            ``load_pdf`` returns) or objects carrying a ``page_content``
            attribute; blank items are ignored.

    Returns:
        The store returned by ``FAISS.from_texts``.

    Raises:
        ValueError: If ``data`` contains no non-blank text.
    """
    # Validate first so a document with no readable text fails with a clear
    # message instead of an opaque error from inside the indexing call.
    pages = []
    for item in data or []:
        content = getattr(item, "page_content", item)
        if isinstance(content, str) and content.strip():
            pages.append(content)
    if not pages:
        raise ValueError("No text to index: the document contains no readable text.")

    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=20, separator="\n")

    # split_documents() expects Document objects and returns Documents, but
    # the input here is plain strings and FAISS.from_texts needs strings, so
    # split each page's text with split_text() instead.
    texts = []
    for page in pages:
        texts.extend(text_splitter.split_text(page))

    vectorstore = FAISS.from_texts(texts, embeddings)
    return vectorstore
|
66 |
|
67 |
def create_chain(vectorstore):
|
|
|
76 |
chain_type="map_reduce"
|
77 |
)
|
78 |
return chain
|
79 |
+
|
80 |
def process_pdf(pdf_file):
    """Store an uploaded PDF, OCR it and rebuild the global vector store.

    Args:
        pdf_file: The upload handed over by the UI. Handled shapes: a path
            string, or an object with a ``name`` attribute and optionally a
            ``read()`` method. NOTE(review): which shape arrives depends on
            the upload component's configuration -- confirm.

    Returns:
        A status message for the UI. Failures are printed and reported as
        "Error processing PDF."; nothing is raised.
    """
    import shutil  # local import: used only for the copy below

    global vectorstore

    # Nothing uploaded: report it instead of failing on attribute access.
    if pdf_file is None:
        return "Error processing PDF."

    try:
        if isinstance(pdf_file, str):
            source_path = pdf_file
        else:
            source_path = getattr(pdf_file, "name", "") or ""

        # Keep only the file name. os.path.join discards UPLOAD_FOLDER when
        # its second argument is absolute, which would make the destination
        # the uploaded file itself and truncate it on open(..., "wb").
        file_name = os.path.basename(source_path) or "upload.pdf"
        file_path = os.path.join(UPLOAD_FOLDER, file_name)

        if os.path.isfile(source_path):
            # The upload already exists on disk: copy it, unless source and
            # destination are the same file.
            if os.path.abspath(source_path) != os.path.abspath(file_path):
                shutil.copyfile(source_path, file_path)
        else:
            # Fall back to streaming the content from the file-like object.
            with open(file_path, "wb") as f:
                f.write(pdf_file.read())

        data = load_pdf(file_path)
        vectorstore = prepare_vectorstore(data)
        return "PDF processed successfully. You can now start chatting!"
    except Exception as e:
        print(f"Error processing PDF: {e}")
        return "Error processing PDF."
|
92 |
|
93 |
def chat(user_input, history):
    """Answer a question about the processed PDF and synthesize speech for it.

    Args:
        user_input: The question text; interpolated into the prompt.
        history: Prior conversation; only its length is used, to number the
            audio file.

    Returns:
        A ``(text, audio_file)`` pair. ``audio_file`` is the bare file name
        saved under ``AUDIO_FOLDER``, or "" when no audio was produced. When
        no vector store exists, or on any error, ``text`` is a fixed message
        and ``audio_file`` is "".
    """
    # Guard: nothing to query until a PDF has been processed.
    if vectorstore is None:
        return "Please process a PDF file first.", ""

    try:
        chain = create_chain(vectorstore)
        prompt = f"""
You are an expert Arabic-language assistant specialized in analyzing and responding to queries about Arabic PDF documents. Your responses should be precise, informative, and reflect the professional tone and structure expected in formal Arabic communication. Focus on extracting and presenting relevant information from the document clearly and systematically, while avoiding colloquial or informal language.

When responding, ensure the following:
- Your answer directly reflects the content of the document.
- If the requested information is not available in the document, clearly state that.
- Keep your response concise yet comprehensive, addressing the question fully.
- Always respond in formal Arabic, without using English.

Question: {user_input}
Helpful Answer:"""

        assistant_response = chain({"question": prompt})["answer"]

        # Speech synthesis is best-effort: a failure here must not discard
        # the text answer.
        audio_file = f"response_{len(history)}.mp3"
        try:
            gTTS(text=assistant_response, lang='ar').save(os.path.join(AUDIO_FOLDER, audio_file))
        except Exception as e:
            print(f"Error generating audio file: {e}")
            audio_file = ""  # no audio available for this answer
    except Exception as e:
        print(f"Error during chat: {e}")
        return "An error occurred while processing your request.", ""
    else:
        return assistant_response, audio_file
|
128 |
+
|
129 |
custom_css = """
|
130 |
body {
|
131 |
font-family: 'Noto Kufi Arabic', sans-serif;
|
|
|
199 |
content: '🤖';
|
200 |
}
|
201 |
"""
|
202 |
+
|
203 |
# Gradio interface
|
204 |
with gr.Blocks(css=custom_css) as demo:
|
205 |
gr.Markdown("# ديمو بوت للقاء مركز حضرموت للدراسات التاريخية")
|
|
|
235 |
|
236 |
demo.launch()
|
237 |
|
|