sairamn committed · Commit 9a2a9f6 · 1 Parent(s): c840d95

Added main file

Files changed (2)
  1. app.py +386 -0
  2. requirements.txt +17 -0
app.py ADDED
@@ -0,0 +1,386 @@
import gradio as gr
import numpy as np
import pdfplumber
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch
from groq import Groq
import os
from fpdf import FPDF
import PyPDF2
from dotenv import load_dotenv
import pickle
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain_together import Together

load_dotenv()

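# --- PDF Similarity Checker ---------------------------------------------------
# The functions below extract text with pdfplumber, embed sentence chunks with
# the all-MiniLM-L6-v2 sentence-transformer, score them with cosine similarity,
# ask Groq for the key terms shared by both documents, and build a PDF report.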
def extract_text_from_pdf(pdf_path):
    try:
        with pdfplumber.open(pdf_path) as pdf:
            text = "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())
        return text
    except Exception as e:
        return f"Error extracting text: {str(e)}"

def get_huggingface_embeddings(text_chunks):
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    inputs = tokenizer(text_chunks, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)

    embeddings = outputs.last_hidden_state.mean(dim=1).numpy()
    return embeddings

def compute_similarity(embeddings1, embeddings2):
    return cosine_similarity(embeddings1, embeddings2)

def generate_comparison_summary(similarity_score, similar_terms):
    summary = f"Similarity Score: {similarity_score:.2f}%\n"
    summary += "Important terms/phrases:\n"
    summary += "\n".join(similar_terms)
    summary += "\nThis comparison highlights the key related points between the documents."
    return summary

def generate_pdf_report(similarity_score, similar_terms):
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font("Arial", style='B', size=16)
    pdf.cell(200, 10, "PDF Similarity Report", ln=True, align='C')
    pdf.ln(10)

    pdf.set_font("Arial", size=12)
    pdf.multi_cell(0, 10, f"Similarity Score: {similarity_score:.2f}%\n\n")
    pdf.multi_cell(0, 10, "Important terms/phrases:")

    for term in similar_terms:
        pdf.multi_cell(0, 8, f"- {term}")

    pdf.multi_cell(0, 10, "\nThis comparison highlights the key related points between the documents.")
    pdf_path = "similarity_report.pdf"
    pdf.output(pdf_path)
    return pdf_path

def compare_documents(file1, file2):
    try:
        file1_path = file1.name
        file2_path = file2.name

        text1 = extract_text_from_pdf(file1_path)
        text2 = extract_text_from_pdf(file2_path)

        chunks1 = text1.split(". ")
        chunks2 = text2.split(". ")

        embeddings1 = get_huggingface_embeddings(chunks1)
        embeddings2 = get_huggingface_embeddings(chunks2)

        similarity_scores = compute_similarity(embeddings1, embeddings2)
        overall_similarity = np.mean(similarity_scores) * 100

        groq_api_key = os.getenv("API_KEY")
        if groq_api_key:
            client = Groq(api_key=groq_api_key)
            response = client.chat.completions.create(
                model="llama-3.3-70b-versatile",
                messages=[
                    {"role": "system",
                     "content": "You are a helpful assistant. Only give required and should not give outside context and dont give groq context or anything like that. Stick to work."},
                    {"role": "user",
                     "content": f"Here is the text of two documents. Extract only the important terms or phrases from both, filtering out common terms like 'court', 'judge', etc.\n\nDocument 1:\n{text1}\n\nDocument 2:\n{text2}"}
                ]
            )
            groq_response = response.choices[0].message.content
            similar_terms = groq_response.split('\n')
        else:
            groq_response = "Groq API key not found. Skipping further analysis."
            similar_terms = ["Error retrieving important terms."]

        comparison_summary = generate_comparison_summary(overall_similarity, similar_terms)
        pdf_report_path = generate_pdf_report(overall_similarity, similar_terms)

        return comparison_summary, pdf_report_path

    except Exception as e:
        return f"Error: {str(e)}", None

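# --- Translator ----------------------------------------------------------------
# Reads a PDF with PyPDF2, splits the text into 1000-character chunks, translates
# each chunk with Groq (streamed responses), and writes the result to a PDF using
# a Noto font that matches the target language.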
def extract_text_from_pdf_translator(file):
    reader = PyPDF2.PdfReader(file)
    text = ""
    for page in reader.pages:
        text += page.extract_text()
    return text

def split_text_into_chunks(text, chunk_size=1000):
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    return chunks

def translate_text_with_groq(text, target_language):
    api_key = os.getenv("API_KEY")
    if not api_key:
        raise ValueError("API key is missing. Please set the 'API_KEY' environment variable.")

    client = Groq(api_key=api_key)
    prompt_text = f"Translate the following text to {target_language}:\n{text}\n\nConvert to {target_language}"

    try:
        completion = client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[{
                "role": "user",
                "content": prompt_text
            }],
            temperature=1,
            max_tokens=1024,
            top_p=1,
            stream=True,
            stop=None,
        )

        translated_text = ""
        for chunk in completion:
            translated_text += chunk.choices[0].delta.content or ""

        return translated_text
    except Exception as e:
        return f"Translation error: {str(e)}. Please try again later."

def get_font_for_language(language):
    font_map = {
        "Tamil": "NotoSansTamil-Regular.ttf",
        "Telugu": "NotoSansTelugu-Regular.ttf",
        "Malayalam": "NotoSansMalayalam-Regular.ttf",
        "Hindi": "NotoSansDevanagari-Regular.ttf"
    }
    return font_map.get(language, "DejaVuSans.ttf")

def save_translated_text_as_pdf(translated_text, target_language, output_filename="translated_text.pdf"):
    pdf = FPDF()
    pdf.add_page()

    font_path = get_font_for_language(target_language)

    try:
        pdf.add_font("CustomFont", "", font_path, uni=True)
        pdf.set_font("CustomFont", size=12)
    except Exception as e:
        return f"Error loading font: {e}"

    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.set_left_margin(15)
    pdf.set_right_margin(15)

    width = pdf.w - 2 * pdf.l_margin
    pdf.multi_cell(width, 10, translated_text, align='L')

    pdf.output(output_filename)
    return output_filename

def process_pdf_and_translate(file, target_language):
    text = extract_text_from_pdf_translator(file)
    chunks = split_text_into_chunks(text)

    translated_text = ""
    for chunk in chunks:
        translated_text += translate_text_with_groq(chunk, target_language)

    pdf_file = save_translated_text_as_pdf(translated_text, target_language)
    return translated_text, pdf_file

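# --- Summarizer -----------------------------------------------------------------
# Reads a PDF with PyPDF2, keeps roughly the first and last 3000 characters to fit
# the model context, summarizes the text with Groq, and saves the summary as a PDF.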
def extract_text_from_pdf_summarizer(file):
    reader = PyPDF2.PdfReader(file)
    text = ""
    for page in reader.pages:
        text += page.extract_text()
    return text

def trim_text_for_groq(text, limit=3000):
    if len(text) > 2 * limit:
        return text[:limit] + '\n...\n' + text[-limit:]
    return text

def summarize_text_with_groq(text):
    api_key = os.getenv("API_KEY")
    if not api_key:
        raise ValueError("API key is missing. Please set the 'API_KEY' environment variable.")

    client = Groq(api_key=api_key)
    completion = client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=[
            {
                "role": "user",
                "content": text
            }
        ],
        temperature=1,
        max_tokens=1024,
        top_p=1,
        stream=True,
        stop=None,
    )

    summary = ""
    for chunk in completion:
        summary += chunk.choices[0].delta.content or ""
    return summary

def save_summary_as_pdf(summary, output_filename="summary.pdf"):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.set_font("Arial", size=12)
    pdf.multi_cell(0, 10, summary)
    pdf.output(output_filename)
    return output_filename

def process_pdf_and_summarize(file):
    text = extract_text_from_pdf_summarizer(file)
    trimmed_text = trim_text_for_groq(text)
    summary = summarize_text_with_groq(trimmed_text)
    pdf_file = save_summary_as_pdf(summary)
    return summary, pdf_file

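# --- Law Chatbot (retrieval-augmented generation) -------------------------------
# Loads nomic-embed-text embeddings and a local FAISS index ("law_vector_db"),
# retrieves the four most similar chunks for each question, and answers with
# Mistral-7B-Instruct-v0.2 served through Together AI.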
def load_embeddings():
    try:
        embeddings = HuggingFaceEmbeddings(
            model_name="nomic-ai/nomic-embed-text-v1",
            model_kwargs={"trust_remote_code": True, "revision": "289f532e14dbbbd5a04753fa58739e9ba766f3c7"}
        )
        print("Embeddings loaded successfully.")
        return embeddings
    except Exception as e:
        raise RuntimeError(f"Error loading embeddings: {e}")

embeddings = load_embeddings()

def load_db():
    try:
        db = FAISS.load_local("law_vector_db", embeddings, allow_dangerous_deserialization=True)
        print(f"FAISS index loaded successfully.")

        with open('law_vector_db/index.pkl', 'rb') as pkl_file:
            metadata = pickle.load(pkl_file)
        print("Pickle file loaded successfully.")

        return db, metadata
    except Exception as e:
        raise RuntimeError(f"Error loading FAISS index or pickle file: {e}")

db, metadata = load_db()
db_retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 4})

prompt_template = """
<s>[INST]This is a chat template and As a legal chatbot specializing in Indian Penal Code queries, your primary objective is to provide accurate and concise information based on the user's questions.
Do not generate your own questions and answers. You will adhere strictly to the instructions provided, offering relevant context from the knowledge base while avoiding unnecessary details.
Your responses will be brief, to the point, and in compliance with the established format.
If a question falls outside the given context, you will refrain from utilizing the chat history and instead rely on your own knowledge base to generate an appropriate response.
You will prioritize the user's query and refrain from posing additional questions.
The aim is to deliver professional, precise, and contextually relevant information pertaining to the Indian Penal Code.
CONTEXT: {context}
CHAT HISTORY: {chat_history}
QUESTION: {question}
ANSWER:</s>[INST]
"""

prompt = PromptTemplate(template=prompt_template, input_variables=['context', 'question', 'chat_history'])

TOGETHER_AI_API = "66bd7a6dc11956ddb311b773c0deabda8870e8c90e9f548ce064880ac47c4b05"

llm = Together(
    model="mistralai/Mistral-7B-Instruct-v0.2",
    temperature=0.5,
    max_tokens=1024,
    together_api_key=TOGETHER_AI_API
)

def ask_question(user_question, chat_history=[]):
    try:
        context_docs = db_retriever.get_relevant_documents(user_question)
        context = "\n".join(
            [doc.page_content for doc in context_docs]) if context_docs else "No relevant context found."

        input_data = {
            "context": context,
            "question": user_question,
            "chat_history": "\n".join(chat_history)
        }

        response = llm(prompt.format(**input_data))
        return response
    except Exception as e:
        return f"Error: {e}"

def chat_bot_interface(user_message, chat_history=[]):
    if not user_message:
        return chat_history, chat_history

    chat_history.append(("User", user_message))
    response = ask_question(user_message, [msg[1] for msg in chat_history if msg[0] == "User"])
    chat_history.append(("Assistant", response))

    return chat_history, chat_history

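# --- Gradio UI: one tab per tool (Similarity Checker, Translator, Summarizer,
# Law Chatbot) --------------------------------------------------------------------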
with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.Tab("PDF Similarity Checker"):
            gr.Markdown("## PDF Similarity Checker")
            with gr.Row():
                with gr.Column():
                    file1 = gr.File(label="Upload PDF 1")
                    file2 = gr.File(label="Upload PDF 2")
                    compare_button = gr.Button("Compare")
                with gr.Column():
                    result = gr.Textbox(label="Results")
                    report = gr.File(label="Download Report")
            compare_button.click(compare_documents, inputs=[file1, file2], outputs=[result, report])

        with gr.Tab("Translator"):
            gr.Markdown("## Document Translation using Groq")
            with gr.Row():
                with gr.Column():
                    pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
                    language_input = gr.Dropdown(
                        label="Select Target Language",
                        choices=["Tamil", "Malayalam", "Telugu", "Hindi"],
                        value="Tamil"
                    )
                    translate_button = gr.Button("Translate")
                with gr.Column():
                    translated_output = gr.Textbox(label="Translated Text", lines=10, interactive=False)
                    download_link = gr.File(label="Download Translated PDF", interactive=False)
            translate_button.click(process_pdf_and_translate, inputs=[pdf_input, language_input], outputs=[translated_output, download_link])

        with gr.Tab("Summarizer"):
            gr.Markdown("## PDF Summarizer")
            with gr.Row():
                with gr.Column():
                    pdf_input_summary = gr.File(label="Upload PDF", file_types=[".pdf"])
                    summarize_button = gr.Button("Summarize")
                with gr.Column():
                    summary_output = gr.Textbox(label="Summary", lines=10, interactive=False)
                    download_link_summary = gr.File(label="Download Summary as PDF", interactive=False)
            summarize_button.click(process_pdf_and_summarize, inputs=[pdf_input_summary], outputs=[summary_output, download_link_summary])

        with gr.Tab("Law Chatbot"):
            gr.Markdown("<h1 style='text-align: center;'>Legal Chatbot</h1>")
            chatbot = gr.Chatbot(label="Chatbot Interface")
            user_input = gr.Textbox(label="Ask a Question", placeholder="Type your question here...", lines=1)
            clear_button = gr.Button("Clear")
            chat_history = gr.State([])

            def clear_chat():
                return [], []

            user_input.submit(chat_bot_interface, inputs=[user_input, chat_history], outputs=[chatbot, chat_history])
            clear_button.click(clear_chat, outputs=[chatbot, chat_history])

# Launch the app
if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,17 @@
gradio==3.50.2
numpy==1.26.4
pdfplumber==0.10.3
scikit-learn==1.4.0
transformers==4.38.2
torch==2.2.1
groq==0.3.0
fpdf==1.7.2
PyPDF2==3.0.1
python-dotenv==1.0.1
langchain==0.1.12
langchain-community==0.0.28
langchain-together==0.1.0
faiss-cpu==1.7.4
sentence-transformers==2.5.1
einops==0.7.0
huggingface-hub==0.21.4
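app.py reads its Groq key from the API_KEY environment variable (loaded with python-dotenv) and expects a FAISS index plus Noto font files alongside it. A minimal pre-flight sketch follows; check_assets.py is a hypothetical helper, and law_vector_db/index.faiss assumes FAISS.load_local's default index file name, so adjust paths to your own layout.

# check_assets.py -- hypothetical helper, not part of this commit.
# Verifies the local files app.py expects before launching the Gradio demo.
import os

EXPECTED = [
    ".env",                          # should define API_KEY for the Groq client
    "law_vector_db/index.faiss",     # assumed default FAISS index name used by load_db()
    "law_vector_db/index.pkl",       # pickled metadata opened explicitly in load_db()
    "NotoSansTamil-Regular.ttf",     # fonts referenced by get_font_for_language()
    "NotoSansTelugu-Regular.ttf",
    "NotoSansMalayalam-Regular.ttf",
    "NotoSansDevanagari-Regular.ttf",
    "DejaVuSans.ttf",                # fallback font for other languages
]

missing = [path for path in EXPECTED if not os.path.exists(path)]
for path in EXPECTED:
    print(("ok      " if path not in missing else "MISSING ") + path)
if missing:
    raise SystemExit("Add the missing files, then run: python app.py")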