import os
import tempfile
import gradio as gr
from PIL import Image
from pdf2image import convert_from_path
import pytesseract
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_groq import ChatGroq
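# System dependencies (assumed): pdf2image needs the Poppler utilities on the
# PATH, and pytesseract needs a Tesseract install with the 'eng' and 'guj'
# language data packs for the OCR calls below.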
class ChatbotModel:
    def __init__(self):
        # The Groq API key must come from the environment; never hard-code
        # secrets in source. ChatGroq picks up GROQ_API_KEY automatically.
        if not os.environ.get("GROQ_API_KEY"):
            raise EnvironmentError("Set the GROQ_API_KEY environment variable before running.")

        # Initialize embeddings (CPU-friendly MiniLM sentence embeddings)
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={'device': 'cpu'},
            encode_kwargs={'normalize_embeddings': True}
        )

        # Initialize the chat model
        self.llm = ChatGroq(
            model='llama3-70b-8192',
            temperature=0.5,
            max_tokens=None,
            timeout=None,
            max_retries=2,
        )

        # Initialize memory for conversation
        self.memory = ConversationBufferMemory(memory_key="history", input_key="question")

        # Create the QA chain prompt template
        self.template = """You are an intelligent educational assistant specialized in handling queries about documents in both English and Gujarati languages. You have been provided with OCR-processed text from the uploaded document that contains important educational information.
Core Responsibilities:
1. Language Processing:
- Identify the language of the user's query (English or Gujarati)
- Respond in the same language as the query
- If the query is in Gujarati, ensure the response maintains proper Gujarati grammar and terminology
- For technical terms, provide both English and Gujarati versions when relevant
2. Document Understanding:
- Analyze the OCR-processed text from the uploaded document
- Account for potential OCR errors or misinterpretations
- Focus on extracting accurate information despite possible OCR imperfections
3. Response Guidelines:
- Provide direct, clear answers based solely on the document content
- If information is unclear due to OCR quality, mention this limitation
- For numerical data (dates, percentages, marks), double-check accuracy before responding
- If information is not found in the document, clearly state: "This information is not present in the uploaded document"
4. Educational Context:
- Maintain focus on educational queries related to the document content
- For admission-related queries, emphasize important deadlines and requirements
- For scholarship information, highlight eligibility criteria and application processes
- For course-related queries, provide detailed, accurate information from the document
5. Response Format:
- Structure responses clearly with relevant subpoints when necessary
- For complex information, break down the answer into digestible parts
- Include relevant reference points from the document when applicable
- Format numerical data and dates clearly
6. Quality Control:
- Verify that responses align with the document content
- Don't make assumptions beyond the provided information
- If multiple interpretations are possible due to OCR quality, mention all possibilities
- Maintain consistency in terminology throughout the conversation
Important Rules:
- Never make up information not present in the document
- Don't combine information from previous conversations or external knowledge
- Always indicate if certain parts of the document are unclear due to OCR quality
- Maintain professional tone while being accessible to students and parents
- If the query is out of scope of the uploaded document, politely redirect to relevant official sources
Context from uploaded document:
{context}
Chat History:
{history}
Current Question: {question}
Assistant: Let me provide a clear and accurate response based on the uploaded document content...
"""
        self.QA_CHAIN_PROMPT = PromptTemplate(
            input_variables=["history", "context", "question"],
            template=self.template
        )
        self.db1 = None
        self.qa_chain = None

    def ocr_image(self, image_path, language='eng+guj'):
        """Run Tesseract OCR on a single image in English and Gujarati."""
        img = Image.open(image_path)
        return pytesseract.image_to_string(img, lang=language)

    def ocr_pdf(self, pdf_path, language='eng+guj'):
        """Rasterize each PDF page and OCR it, joining the page texts."""
        images = convert_from_path(pdf_path)
        return "\n".join([pytesseract.image_to_string(img, lang=language) for img in images])
    def process_file(self, uploaded_file):
        """Run OCR on an uploaded PDF or image and initialize the QA chain."""
        # Depending on the Gradio version, gr.File passes either a path string
        # or a file-like object; normalize to a path on disk either way.
        if isinstance(uploaded_file, str):
            temp_path = uploaded_file
        else:
            suffix = os.path.splitext(uploaded_file.name)[1].lower()
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
                temp_file.write(uploaded_file.read())
                temp_path = temp_file.name
        file_extension = os.path.splitext(temp_path)[1].lower()

        # OCR processing based on file type
        if file_extension == '.pdf':
            raw_text = self.ocr_pdf(temp_path, language='guj+eng')
        elif file_extension in ['.jpg', '.jpeg', '.png', '.bmp']:
            raw_text = self.ocr_image(temp_path, language='guj+eng')
        else:
            return "Unsupported file format."

        # Split text into overlapping chunks for retrieval
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
        text_chunks = text_splitter.split_text(raw_text)

        # Create the vector store and initialize the QA chain.
        # split_text() returns plain strings, so from_texts (not from_documents)
        # is the correct constructor here.
        self.db1 = FAISS.from_texts(text_chunks, self.embeddings)
        self.qa_chain = RetrievalQA.from_chain_type(
            self.llm,
            retriever=self.db1.as_retriever(),
            chain_type='stuff',
            verbose=True,
            chain_type_kwargs={
                "verbose": True,
                "prompt": self.QA_CHAIN_PROMPT,
                "memory": self.memory
            }
        )
        return "File processed successfully!"

    def get_response(self, user_input):
        """Generate a response to the user's question."""
        if not self.qa_chain:
            return "Please upload and process a file before asking questions."
        response = self.qa_chain.invoke({"query": user_input})
        return response["result"]
# Initialize the chatbot
chatbot = ChatbotModel()

# Define Gradio interface functions
def upload_and_process(file):
    return chatbot.process_file(file)

def ask_question(question):
    return chatbot.get_response(question)

# Set up Gradio interface
interface = gr.Blocks()
with interface:
    gr.Markdown("# Educational Chatbot with Document Analysis")
    with gr.Row():
        file_upload = gr.File(label="Upload PDF or Image")
        upload_btn = gr.Button("Process File")
    output = gr.Textbox(label="File Processing Status")
    with gr.Row():
        question_box = gr.Textbox(label="Ask a Question")
        ask_btn = gr.Button("Submit")
    answer = gr.Textbox(label="Answer")

    # Connect buttons to functions
    upload_btn.click(upload_and_process, inputs=file_upload, outputs=output)
    ask_btn.click(ask_question, inputs=question_box, outputs=answer)

# Launch Gradio interface
interface.launch()
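# On Hugging Face Spaces the app is started automatically; to run locally,
# execute `python app.py` and open the printed URL. launch(share=True) can be
# used instead if a temporary public link is needed.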