# Hugging Face Space application (Space status when this page was captured: Sleeping).
import os | |
import tempfile | |
import gradio as gr | |
from PIL import Image | |
from pdf2image import convert_from_path | |
import pytesseract | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain_huggingface import HuggingFaceEmbeddings | |
from langchain_community.vectorstores import FAISS | |
from langchain.memory import ConversationBufferMemory | |
from langchain.prompts import PromptTemplate | |
from langchain.chains import RetrievalQA | |
from langchain_groq import ChatGroq | |
class ChatbotModel:
    """OCR-backed question-answering chatbot for uploaded PDFs and images.

    An uploaded file is OCR'd with Tesseract (Gujarati + English), split into
    overlapping chunks, embedded into a FAISS index, and queried through a
    ``RetrievalQA`` chain driven by a Groq-hosted chat model.

    Attributes set by ``__init__``: ``embeddings``, ``llm``, ``memory``,
    ``template``, ``QA_CHAIN_PROMPT``, ``db1`` and ``qa_chain`` (the last two
    stay ``None`` until ``process_file`` succeeds).
    """

    # File extensions that are routed to single-image OCR.
    IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp')

    def __init__(self):
        """Build the embedding model, chat model, memory and prompt.

        Raises:
            RuntimeError: if the ``GROQ_API_KEY`` environment variable is not
                set.
        """
        # The API key must come from the environment (for example a Space
        # secret). A key committed to source is readable by anyone who can see
        # the file, so the previously embedded literal has been removed and
        # should be revoked.
        if not os.environ.get("GROQ_API_KEY"):
            raise RuntimeError(
                "GROQ_API_KEY is not set. Configure it as an environment "
                "variable or secret before starting the application."
            )

        # Initialize embeddings
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={'device': 'cpu'},
            encode_kwargs={'normalize_embeddings': True},
        )

        # Initialize the chat model
        self.llm = ChatGroq(
            model='llama3-70b-8192',
            temperature=0.5,
            max_tokens=None,
            timeout=None,
            max_retries=2,
        )

        # Conversation memory; "question" is the prompt input it records.
        self.memory = ConversationBufferMemory(memory_key="history", input_key="question")

        # Prompt template. Its placeholders are {document_type}, {context},
        # {history} and {question}.
        self.template = """You are an intelligent educational assistant specialized in handling queries about documents in both English and Gujarati languages. You have been provided with OCR-processed text from {document_type} that contains important educational information.
Core Responsibilities:
1. Language Processing:
- Identify the language of the user's query (English or Gujarati)
- Respond in the same language as the query
- If the query is in Gujarati, ensure the response maintains proper Gujarati grammar and terminology
- For technical terms, provide both English and Gujarati versions when relevant
2. Document Understanding:
- Analyze the OCR-processed text from the uploaded {document_type}
- Account for potential OCR errors or misinterpretations
- Focus on extracting accurate information despite possible OCR imperfections
3. Response Guidelines:
- Provide direct, clear answers based solely on the document content
- If information is unclear due to OCR quality, mention this limitation
- For numerical data (dates, percentages, marks), double-check accuracy before responding
- If information is not found in the document, clearly state: "This information is not present in the uploaded document"
4. Educational Context:
- Maintain focus on educational queries related to the document content
- For admission-related queries, emphasize important deadlines and requirements
- For scholarship information, highlight eligibility criteria and application processes
- For course-related queries, provide detailed, accurate information from the document
5. Response Format:
- Structure responses clearly with relevant subpoints when necessary
- For complex information, break down the answer into digestible parts
- Include relevant reference points from the document when applicable
- Format numerical data and dates clearly
6. Quality Control:
- Verify that responses align with the document content
- Don't make assumptions beyond the provided information
- If multiple interpretations are possible due to OCR quality, mention all possibilities
- Maintain consistency in terminology throughout the conversation
Important Rules:
- Never make up information not present in the document
- Don't combine information from previous conversations or external knowledge
- Always indicate if certain parts of the document are unclear due to OCR quality
- Maintain professional tone while being accessible to students and parents
- If the query is out of scope of the uploaded document, politely redirect to relevant official sources
Context from uploaded document:
{context}
Chat History:
{history}
Current Question: {question}
Assistant: Let me provide a clear and accurate response based on the uploaded document content...
"""
        # {document_type} is not supplied by the QA chain, so it is bound here
        # as a partial variable; process_file() overrides it per upload.
        self.QA_CHAIN_PROMPT = PromptTemplate(
            input_variables=["history", "context", "question"],
            partial_variables={"document_type": "document"},
            template=self.template,
        )

        self.db1 = None
        self.qa_chain = None

    def ocr_image(self, image_path, language='eng+guj'):
        """Return the text Tesseract extracts from the image at *image_path*.

        Args:
            image_path: Path of the image file to read.
            language: Tesseract language specification passed as ``lang``.
        """
        # The context manager closes the underlying file handle after OCR.
        with Image.open(image_path) as img:
            return pytesseract.image_to_string(img, lang=language)

    def ocr_pdf(self, pdf_path, language='eng+guj'):
        """Return the OCR text of every page of the PDF, joined by newlines.

        Args:
            pdf_path: Path of the PDF file; each page is rendered to an image.
            language: Tesseract language specification passed as ``lang``.
        """
        pages = convert_from_path(pdf_path)
        return "\n".join(
            pytesseract.image_to_string(page, lang=language) for page in pages
        )

    def process_file(self, uploaded_file):
        """Process an uploaded file and initialize the QA chain.

        Args:
            uploaded_file: Either a filesystem path (``str`` / ``os.PathLike``)
                or an object exposing ``.name`` and ``.read()``.

        Returns:
            A human-readable status message.
        """
        if uploaded_file is None:
            return "Please upload a file before processing."

        # Accept both a plain path and a file-like object carrying a name.
        is_path = isinstance(uploaded_file, (str, os.PathLike))
        source_name = os.fspath(uploaded_file) if is_path else uploaded_file.name
        file_extension = os.path.splitext(source_name)[1].lower()

        # Pick the OCR routine before touching the disk so that unsupported
        # uploads never create a temporary file.
        if file_extension == '.pdf':
            run_ocr, document_type = self.ocr_pdf, "PDF document"
        elif file_extension in self.IMAGE_EXTENSIONS:
            run_ocr, document_type = self.ocr_image, "image"
        else:
            return "Unsupported file format."

        temp_path = None
        try:
            if is_path:
                ocr_path = source_name
            else:
                # File-like upload: spill it to disk because the OCR helpers
                # take a path.
                with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as temp_file:
                    temp_path = temp_file.name
                    temp_file.write(uploaded_file.read())
                ocr_path = temp_path
            raw_text = run_ocr(ocr_path, language='guj+eng')
        finally:
            # delete=False means nobody else removes the copy.
            if temp_path is not None and os.path.exists(temp_path):
                os.remove(temp_path)

        # Split text into chunks
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
        text_chunks = text_splitter.split_text(raw_text)
        if not text_chunks:
            # Nothing to index; keep any previously built chain untouched.
            return "No text could be extracted from the uploaded file."

        # A new document starts a new conversation; the prompt itself forbids
        # mixing in earlier exchanges.
        self.memory.clear()

        # split_text() yields plain strings, so the index is built with
        # from_texts() rather than from_documents().
        self.db1 = FAISS.from_texts(text_chunks, self.embeddings)
        self.qa_chain = RetrievalQA.from_chain_type(
            self.llm,
            retriever=self.db1.as_retriever(),
            chain_type='stuff',
            verbose=True,
            chain_type_kwargs={
                "verbose": True,
                "prompt": self.QA_CHAIN_PROMPT.partial(document_type=document_type),
                "memory": self.memory,
            },
        )
        return "File processed successfully!"

    def get_response(self, user_input):
        """Generate a response to the user's question.

        Args:
            user_input: The question text, passed to the chain as ``"query"``.

        Returns:
            The chain's ``"result"`` value, or a prompt to upload a file when
            no QA chain has been built yet.
        """
        if not self.qa_chain:
            return "Please upload and process a file before asking questions."
        response = self.qa_chain.invoke({"query": user_input})
        return response["result"]
# Initialize the chatbot | |
# Single shared instance used by the Gradio callbacks; constructing it builds
# the embedding model and chat model once at import time.
chatbot = ChatbotModel()
# Define Gradio interface functions | |
def upload_and_process(file):
    """Gradio callback: run OCR and indexing on the uploaded file.

    Args:
        file: Value of the file-upload component; ``None`` when the button is
            clicked before anything has been uploaded.

    Returns:
        The status message to show in the status textbox.
    """
    if file is None:
        return "Please upload a file before processing."
    return chatbot.process_file(file)
def ask_question(question):
    """Gradio callback: answer *question* using the processed document.

    Args:
        question: Text from the question textbox.

    Returns:
        The chatbot's answer, or a hint when the question is empty or blank.
    """
    # Do not spend an LLM call on an empty submission.
    if not question or not question.strip():
        return "Please enter a question."
    return chatbot.get_response(question)
# Set up Gradio interface | |
# NOTE(review): the original indentation was lost, so the nesting below (which
# widgets sit inside each Row) is a reconstruction — confirm against the
# intended layout.
with gr.Blocks() as interface:
    gr.Markdown("# Educational Chatbot with Document Analysis")

    # Upload controls and processing status.
    with gr.Row():
        file_upload = gr.File(label="Upload PDF or Image")
        upload_btn = gr.Button("Process File")
    output = gr.Textbox(label="File Processing Status")

    # Question input and answer display.
    with gr.Row():
        question_box = gr.Textbox(label="Ask a Question")
        ask_btn = gr.Button("Submit")
    answer = gr.Textbox(label="Answer")

    # Connect buttons to functions (event listeners are registered inside the
    # Blocks context).
    upload_btn.click(upload_and_process, inputs=file_upload, outputs=output)
    ask_btn.click(ask_question, inputs=question_box, outputs=answer)

# Launch Gradio interface
interface.launch()