import os
import tempfile
import gradio as gr
from PIL import Image
from pdf2image import convert_from_path
import pytesseract
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_groq import ChatGroq
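# System dependencies (assumed): pdf2image needs the Poppler utilities on the
# PATH, and pytesseract needs a Tesseract install with the 'eng' and 'guj'
# language data packs for the OCR calls below.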
class ChatbotModel:
    def __init__(self):
        # The Groq API key must come from the environment; never hard-code
        # secrets in source. ChatGroq picks up GROQ_API_KEY automatically.
        if not os.environ.get("GROQ_API_KEY"):
            raise EnvironmentError("Set the GROQ_API_KEY environment variable before running.")

        # Initialize embeddings (CPU-friendly MiniLM sentence embeddings)
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={'device': 'cpu'},
            encode_kwargs={'normalize_embeddings': True}
        )

        # Initialize the chat model
        self.llm = ChatGroq(
            model='llama3-70b-8192',
            temperature=0.5,
            max_tokens=None,
            timeout=None,
            max_retries=2,
        )

        # Initialize memory for conversation
        self.memory = ConversationBufferMemory(memory_key="history", input_key="question")

        # Create the QA chain prompt template
        self.template = """You are an intelligent educational assistant specialized in handling queries about documents in both English and Gujarati languages. You have been provided with OCR-processed text from the uploaded document that contains important educational information.
Core Responsibilities:
1. Language Processing:
- Identify the language of the user's query (English or Gujarati)
- Respond in the same language as the query
- If the query is in Gujarati, ensure the response maintains proper Gujarati grammar and terminology
- For technical terms, provide both English and Gujarati versions when relevant
2. Document Understanding:
- Analyze the OCR-processed text from the uploaded document
- Account for potential OCR errors or misinterpretations
- Focus on extracting accurate information despite possible OCR imperfections
3. Response Guidelines:
- Provide direct, clear answers based solely on the document content
- If information is unclear due to OCR quality, mention this limitation
- For numerical data (dates, percentages, marks), double-check accuracy before responding
- If information is not found in the document, clearly state: "This information is not present in the uploaded document"
4. Educational Context:
- Maintain focus on educational queries related to the document content
- For admission-related queries, emphasize important deadlines and requirements
- For scholarship information, highlight eligibility criteria and application processes
- For course-related queries, provide detailed, accurate information from the document
5. Response Format:
- Structure responses clearly with relevant subpoints when necessary
- For complex information, break down the answer into digestible parts
- Include relevant reference points from the document when applicable
- Format numerical data and dates clearly
6. Quality Control:
- Verify that responses align with the document content
- Don't make assumptions beyond the provided information
- If multiple interpretations are possible due to OCR quality, mention all possibilities
- Maintain consistency in terminology throughout the conversation
Important Rules:
- Never make up information not present in the document
- Don't combine information from previous conversations or external knowledge
- Always indicate if certain parts of the document are unclear due to OCR quality
- Maintain professional tone while being accessible to students and parents
- If the query is out of scope of the uploaded document, politely redirect to relevant official sources
Context from uploaded document:
{context}
Chat History:
{history}
Current Question: {question}
Assistant: Let me provide a clear and accurate response based on the uploaded document content...
"""
        self.QA_CHAIN_PROMPT = PromptTemplate(
            input_variables=["history", "context", "question"],
            template=self.template
        )
        self.db1 = None
        self.qa_chain = None

    def ocr_image(self, image_path, language='eng+guj'):
        """Run Tesseract OCR on a single image in English and Gujarati."""
        img = Image.open(image_path)
        return pytesseract.image_to_string(img, lang=language)

    def ocr_pdf(self, pdf_path, language='eng+guj'):
        """Rasterize each PDF page and OCR it, joining the page texts."""
        images = convert_from_path(pdf_path)
        return "\n".join([pytesseract.image_to_string(img, lang=language) for img in images])
    def process_file(self, uploaded_file):
        """Run OCR on an uploaded PDF or image and initialize the QA chain."""
        # Depending on the Gradio version, gr.File passes either a path string
        # or a file-like object; normalize to a path on disk either way.
        if isinstance(uploaded_file, str):
            temp_path = uploaded_file
        else:
            suffix = os.path.splitext(uploaded_file.name)[1].lower()
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
                temp_file.write(uploaded_file.read())
                temp_path = temp_file.name
        file_extension = os.path.splitext(temp_path)[1].lower()

        # OCR processing based on file type
        if file_extension == '.pdf':
            raw_text = self.ocr_pdf(temp_path, language='guj+eng')
        elif file_extension in ['.jpg', '.jpeg', '.png', '.bmp']:
            raw_text = self.ocr_image(temp_path, language='guj+eng')
        else:
            return "Unsupported file format."

        # Split text into overlapping chunks for retrieval
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
        text_chunks = text_splitter.split_text(raw_text)

        # Create the vector store and initialize the QA chain.
        # split_text() returns plain strings, so from_texts (not from_documents)
        # is the correct constructor here.
        self.db1 = FAISS.from_texts(text_chunks, self.embeddings)
        self.qa_chain = RetrievalQA.from_chain_type(
            self.llm,
            retriever=self.db1.as_retriever(),
            chain_type='stuff',
            verbose=True,
            chain_type_kwargs={
                "verbose": True,
                "prompt": self.QA_CHAIN_PROMPT,
                "memory": self.memory
            }
        )
        return "File processed successfully!"

    def get_response(self, user_input):
        """Generate a response to the user's question."""
        if not self.qa_chain:
            return "Please upload and process a file before asking questions."
        response = self.qa_chain.invoke({"query": user_input})
        return response["result"]
# Initialize the chatbot
chatbot = ChatbotModel()

# Define Gradio interface functions
def upload_and_process(file):
    return chatbot.process_file(file)

def ask_question(question):
    return chatbot.get_response(question)

# Set up Gradio interface
interface = gr.Blocks()
with interface:
    gr.Markdown("# Educational Chatbot with Document Analysis")
    with gr.Row():
        file_upload = gr.File(label="Upload PDF or Image")
        upload_btn = gr.Button("Process File")
    output = gr.Textbox(label="File Processing Status")
    with gr.Row():
        question_box = gr.Textbox(label="Ask a Question")
        ask_btn = gr.Button("Submit")
    answer = gr.Textbox(label="Answer")

    # Connect buttons to functions
    upload_btn.click(upload_and_process, inputs=file_upload, outputs=output)
    ask_btn.click(ask_question, inputs=question_box, outputs=answer)

# Launch Gradio interface
interface.launch()
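# On Hugging Face Spaces the app is started automatically; to run locally,
# execute `python app.py` and open the printed URL. launch(share=True) can be
# used instead if a temporary public link is needed.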