import os
import tempfile

import gradio as gr
from PIL import Image
from pdf2image import convert_from_path
import pytesseract
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_groq import ChatGroq


class ChatbotModel:
    """OCR-backed retrieval-augmented chatbot.

    Extracts English/Gujarati text from uploaded PDFs or images with
    Tesseract, chunks and indexes the text in a FAISS vector store, and
    answers questions about the document through a Groq-hosted LLM via a
    LangChain RetrievalQA chain with conversation memory.
    """

    def __init__(self):
        # SECURITY FIX: the original hard-coded a Groq API key directly in
        # source. Secrets must come from the environment, never from code —
        # a committed key is effectively public and must be rotated.
        if "GROQ_API_KEY" not in os.environ:
            raise EnvironmentError(
                "GROQ_API_KEY environment variable is not set."
            )

        # Sentence-transformer embeddings on CPU; normalized vectors so FAISS
        # inner-product search behaves like cosine similarity.
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={'device': 'cpu'},
            encode_kwargs={'normalize_embeddings': True},
        )

        # Groq-hosted Llama 3 70B chat model.
        self.llm = ChatGroq(
            model='llama3-70b-8192',
            temperature=0.5,
            max_tokens=None,
            timeout=None,
            max_retries=2,
        )

        # Conversation memory: keyed so the chain injects prior turns into
        # the {history} slot of the prompt.
        self.memory = ConversationBufferMemory(
            memory_key="history", input_key="question"
        )

        # QA prompt. BUG FIX: the original template contained a
        # {document_type} placeholder that was never declared in
        # input_variables, so every query raised KeyError at format time.
        # The placeholder is replaced with literal text.
        self.template = """You are an intelligent educational assistant specialized in handling queries about documents in both English and Gujarati languages. You have been provided with OCR-processed text from the uploaded document that contains important educational information.

Core Responsibilities:
1. Language Processing:
   - Identify the language of the user's query (English or Gujarati)
   - Respond in the same language as the query
   - If the query is in Gujarati, ensure the response maintains proper Gujarati grammar and terminology
   - For technical terms, provide both English and Gujarati versions when relevant

2. Document Understanding:
   - Analyze the OCR-processed text from the uploaded document
   - Account for potential OCR errors or misinterpretations
   - Focus on extracting accurate information despite possible OCR imperfections

3. Response Guidelines:
   - Provide direct, clear answers based solely on the document content
   - If information is unclear due to OCR quality, mention this limitation
   - For numerical data (dates, percentages, marks), double-check accuracy before responding
   - If information is not found in the document, clearly state: "This information is not present in the uploaded document"

4. Educational Context:
   - Maintain focus on educational queries related to the document content
   - For admission-related queries, emphasize important deadlines and requirements
   - For scholarship information, highlight eligibility criteria and application processes
   - For course-related queries, provide detailed, accurate information from the document

5. Response Format:
   - Structure responses clearly with relevant subpoints when necessary
   - For complex information, break down the answer into digestible parts
   - Include relevant reference points from the document when applicable
   - Format numerical data and dates clearly

6. Quality Control:
   - Verify that responses align with the document content
   - Don't make assumptions beyond the provided information
   - If multiple interpretations are possible due to OCR quality, mention all possibilities
   - Maintain consistency in terminology throughout the conversation

Important Rules:
- Never make up information not present in the document
- Don't combine information from previous conversations or external knowledge
- Always indicate if certain parts of the document are unclear due to OCR quality
- Maintain professional tone while being accessible to students and parents
- If the query is out of scope of the uploaded document, politely redirect to relevant official sources

Context from uploaded document: {context}

Chat History: {history}

Current Question: {question}

Assistant: Let me provide a clear and accurate response based on the uploaded document content...
"""
        self.QA_CHAIN_PROMPT = PromptTemplate(
            input_variables=["history", "context", "question"],
            template=self.template,
        )

        # Populated by process_file(); queries are rejected until then.
        self.db1 = None
        self.qa_chain = None

    def ocr_image(self, image_path, language='eng+guj'):
        """Return Tesseract OCR text for a single image file.

        ``language`` is a Tesseract language spec (default English+Gujarati).
        """
        img = Image.open(image_path)
        return pytesseract.image_to_string(img, lang=language)

    def ocr_pdf(self, pdf_path, language='eng+guj'):
        """Render each PDF page to an image and return the concatenated
        OCR text, pages separated by newlines."""
        images = convert_from_path(pdf_path)
        return "\n".join(
            pytesseract.image_to_string(img, lang=language) for img in images
        )

    def process_file(self, uploaded_file):
        """OCR an uploaded PDF/image, build the FAISS index, and initialize
        the RetrievalQA chain. Returns a human-readable status string."""
        _, file_extension = os.path.splitext(uploaded_file.name)
        file_extension = file_extension.lower()

        # Persist the upload to disk because pdf2image/PIL need a real path.
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=file_extension
        ) as temp_file:
            temp_file.write(uploaded_file.read())
            temp_path = temp_file.name

        # FIX: the original never deleted the temp file (and leaked it even
        # on the unsupported-format early return).
        try:
            if file_extension == '.pdf':
                raw_text = self.ocr_pdf(temp_path, language='guj+eng')
            elif file_extension in ('.jpg', '.jpeg', '.png', '.bmp'):
                raw_text = self.ocr_image(temp_path, language='guj+eng')
            else:
                return "Unsupported file format."
        finally:
            os.remove(temp_path)

        # Chunk the OCR text for retrieval; overlap preserves context that
        # straddles chunk boundaries.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=100
        )
        text_chunks = text_splitter.split_text(raw_text)

        # BUG FIX: split_text() returns plain strings, so from_documents()
        # (which expects Document objects) crashed. from_texts() is the
        # correct constructor for raw strings.
        self.db1 = FAISS.from_texts(text_chunks, self.embeddings)
        self.qa_chain = RetrievalQA.from_chain_type(
            self.llm,
            retriever=self.db1.as_retriever(),
            chain_type='stuff',
            verbose=True,
            chain_type_kwargs={
                "verbose": True,
                "prompt": self.QA_CHAIN_PROMPT,
                "memory": self.memory,
            },
        )
        return "File processed successfully!"

    def get_response(self, user_input):
        """Answer ``user_input`` from the indexed document; guard against
        queries issued before a file has been processed."""
        if not self.qa_chain:
            return "Please upload and process a file before asking questions."
        response = self.qa_chain({"query": user_input})
        return response["result"]


# Instantiate once at import time; the Gradio callbacks below close over it.
chatbot = ChatbotModel()


def upload_and_process(file):
    """Gradio callback: index an uploaded file, return a status message."""
    return chatbot.process_file(file)


def ask_question(question):
    """Gradio callback: answer a question about the processed document."""
    return chatbot.get_response(question)


# Gradio UI: one row for upload/processing, one row for Q&A.
interface = gr.Blocks()
with interface:
    gr.Markdown("# Educational Chatbot with Document Analysis")
    with gr.Row():
        file_upload = gr.File(label="Upload PDF or Image")
        upload_btn = gr.Button("Process File")
        output = gr.Textbox(label="File Processing Status")
    with gr.Row():
        question_box = gr.Textbox(label="Ask a Question")
        ask_btn = gr.Button("Submit")
        answer = gr.Textbox(label="Answer")

    upload_btn.click(upload_and_process, inputs=file_upload, outputs=output)
    ask_btn.click(ask_question, inputs=question_box, outputs=answer)

interface.launch()