Spaces:

gmustafa413
/

ChatBot

Sleeping

File size: 8,261 Bytes

!pip install langdetect faiss-cpu transformers gradio groq sentence-transformers pypdf2 python-pptx pandas docx2txt pymupdf python-docx openpyxl

import json
import os
import re
from concurrent.futures import ThreadPoolExecutor

import faiss
import fitz  # PyMuPDF
import gradio as gr
import numpy as np
import pandas as pd
import requests
from docx import Document
from pptx import Presentation
from sentence_transformers import SentenceTransformer

# Configuration
# The Groq API key is read from the GROQ_API_KEY environment variable so the
# secret never lives in the source file. It falls back to an empty string when
# the variable is unset.
# NOTE: a key was previously hard-coded on this line; treat it as compromised
# and rotate it.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # Sentence-embedding model name
CHUNK_SIZE = 512  # Character budget per chunk when splitting document text
MAX_TOKENS = 4096  # "max_tokens" value sent with each chat-completion request
WORKERS = 8  # Thread-pool size used for per-file text extraction

# Load the sentence-embedding model once at import time. The same instance is
# used to size the FAISS index and to embed both document chunks and questions.
embedding_model = SentenceTransformer(EMBEDDING_MODEL)

class DocumentProcessor:
    """Extract text from uploaded documents, index it, and answer questions.

    Text is pulled from PDF, DOCX, TXT, PPTX and spreadsheet files, split into
    sentence-aligned chunks, embedded with the module-level ``embedding_model``
    and stored in a FAISS ``IndexFlatIP``. ``query`` retrieves the closest
    chunks and sends them, together with the question, to the Groq
    chat-completions endpoint.
    """

    # Chat model requested from the Groq API.
    # NOTE(review): confirm this model id is still served by Groq.
    LLM_MODEL = "mixtral-8x7b-32768"
    API_URL = "https://api.groq.com/openai/v1/chat/completions"

    # Splits on whitespace that follows "." or "?", while refusing to split
    # after patterns such as "e.g." or a capitalised two-letter abbreviation
    # like "Mr.".
    _SENTENCE_SPLIT = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s')

    def __init__(self):
        """Create an empty index sized to the embedding model and a worker pool."""
        self.index = faiss.IndexFlatIP(embedding_model.get_sentence_embedding_dimension())
        self.chunks = []
        self.processor_pool = ThreadPoolExecutor(max_workers=WORKERS)

    def extract_text_from_pptx(self, file_path):
        """Return the text of every shape on every slide, or "" on failure."""
        try:
            prs = Presentation(file_path)
            return " ".join(
                shape.text
                for slide in prs.slides
                for shape in slide.shapes
                if hasattr(shape, "text")
            )
        except Exception as e:
            print(f"PPTX Error: {str(e)}")
            return ""

    def extract_text_from_xls_csv(self, file_path):
        """Return all cell values of a spreadsheet/CSV as one string, or "" on failure."""
        try:
            # Compare case-insensitively so ".XLSX" is not misread as CSV.
            if str(file_path).lower().endswith(('.xls', '.xlsx')):
                df = pd.read_excel(file_path)
            else:
                df = pd.read_csv(file_path)
            return " ".join(df.astype(str).values.flatten())
        except Exception as e:
            print(f"Spreadsheet Error: {str(e)}")
            return ""

    def extract_text_from_pdf(self, file_path):
        """Return the text of every page of a PDF, or "" on failure."""
        try:
            # The context manager closes the document handle even if a page
            # fails to extract; the join runs while the document is still open.
            with fitz.open(file_path) as doc:
                return " ".join(
                    page.get_text("text", flags=fitz.TEXT_PRESERVE_WHITESPACE)
                    for page in doc
                )
        except Exception as e:
            print(f"PDF Error: {str(e)}")
            return ""

    def extract_text_from_docx(self, file_path):
        """Return the paragraph text of a DOCX file joined by spaces."""
        return " ".join(p.text for p in Document(file_path).paragraphs)

    def process_file(self, file):
        """Extract and whitespace-normalise the text of one uploaded file.

        Args:
            file: An object exposing the path as ``.name`` or a plain path
                string.

        Returns:
            The cleaned text, or "" for unsupported types and on any error.
        """
        try:
            file_path = getattr(file, "name", file)
            print(f"Processing: {file_path}")

            # Dispatch on a lower-cased copy so ".PDF" / ".Docx" are accepted.
            lowered = str(file_path).lower()

            if lowered.endswith('.pdf'):
                text = self.extract_text_from_pdf(file_path)
            elif lowered.endswith('.docx'):
                text = self.extract_text_from_docx(file_path)
            elif lowered.endswith('.txt'):
                # Replace undecodable bytes instead of discarding the file.
                with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                    text = f.read()
            elif lowered.endswith('.pptx'):
                text = self.extract_text_from_pptx(file_path)
            elif lowered.endswith(('.xls', '.xlsx', '.csv')):
                text = self.extract_text_from_xls_csv(file_path)
            else:
                return ""

            clean_text = re.sub(r'\s+', ' ', text).strip()
            print(f"Extracted {len(clean_text)} characters from {file_path}")
            return clean_text
        except Exception as e:
            print(f"Processing Error: {str(e)}")
            return ""

    def semantic_chunking(self, text, chunk_size=None, max_chunks=1000):
        """Group sentences of *text* into chunks of roughly *chunk_size* characters.

        Args:
            text: Text to split.
            chunk_size: Character budget per chunk; defaults to the module
                constant ``CHUNK_SIZE``.
            max_chunks: Upper bound on the number of chunks returned.

        Returns:
            A list of non-empty, stripped chunks. Empty or whitespace-only
            input yields an empty list. A single sentence longer than the
            budget is kept whole as its own chunk.
        """
        if not text or not text.strip():
            return []

        limit = CHUNK_SIZE if chunk_size is None else chunk_size
        chunks = []
        current = ""

        for sentence in self._SENTENCE_SPLIT.split(text):
            if len(current) + len(sentence) < limit:
                current += " " + sentence
                continue
            # Budget exceeded: flush what has been accumulated (if anything
            # other than whitespace) and start a new chunk with this sentence.
            flushed = current.strip()
            if flushed:
                chunks.append(flushed)
            current = sentence

        tail = current.strip()
        if tail:
            chunks.append(tail)

        return chunks[:max_chunks]

    def process_documents(self, files):
        """Extract, chunk, embed and index the uploaded *files*.

        Any previously indexed content is discarded first.

        Returns:
            A human-readable status string describing the outcome.
        """
        # Drop earlier state up front so chunks and index never disagree.
        self.chunks = []
        self.index.reset()
        if not files:
            return "No files uploaded!"

        print("\n" + "="*40 + " PROCESSING DOCUMENTS " + "="*40)
        texts = list(self.processor_pool.map(self.process_file, files))

        all_chunks = [
            chunk for text in texts for chunk in self.semantic_chunking(text)
        ]
        print(f"Total chunks generated: {len(all_chunks)}")

        if not all_chunks:
            return "Error: No chunks generated from documents"

        try:
            # Unit-length vectors make the inner-product index rank by cosine
            # similarity regardless of which embedding model is configured.
            embeddings = embedding_model.encode(
                all_chunks,
                batch_size=32,
                convert_to_tensor=True,
                show_progress_bar=False,
                normalize_embeddings=True
            ).cpu().numpy().astype('float32')

            self.index.add(embeddings)
            self.chunks = all_chunks
            return f"Processed {len(all_chunks)} chunks from {len(files)} files"
        except Exception as e:
            print(f"Embedding Error: {str(e)}")
            self.index.reset()
            return f"Error: {str(e)}"

    def _build_context(self, indices):
        """Join the chunks addressed by *indices*, skipping invalid positions.

        Negative positions are ignored explicitly, because a negative index
        would otherwise silently address a chunk from the end of the list.
        """
        total = len(self.chunks)
        picked = [self.chunks[int(i)] for i in indices if 0 <= int(i) < total]
        return "\n".join(picked)

    def query(self, question, top_k=3):
        """Answer *question* using the most similar indexed chunks as context.

        Args:
            question: The user's question.
            top_k: Number of chunks to retrieve (clamped to what is indexed).

        Returns:
            A ``(text, success)`` tuple. ``text`` is the model's answer on
            success, otherwise an error or guidance message.
        """
        if not self.chunks:
            return "Please process documents first", False
        if not GROQ_API_KEY:
            return "Error: GROQ_API_KEY is not configured", False

        try:
            print("\n" + "="*40 + " QUERY PROCESSING " + "="*40)
            print(f"Question: {question}")

            question_embedding = embedding_model.encode(
                [question],
                convert_to_tensor=True,
                normalize_embeddings=True
            ).cpu().numpy().astype('float32')

            # Never ask for more neighbours than there are indexed chunks.
            k = max(1, min(top_k, len(self.chunks)))
            _, indices = self.index.search(question_embedding, k)
            print(f"Top indices: {indices}")

            context = self._build_context(indices[0])
            print(f"Context length: {len(context)} characters")

            headers = {
                "Authorization": f"Bearer {GROQ_API_KEY}",
                "Content-Type": "application/json"
            }

            payload = {
                "messages": [{
                    "role": "user",
                    "content": f"Answer concisely based on the context: {question}\nContext: {context}"
                }],
                "model": self.LLM_MODEL,
                "temperature": 0.3,
                "max_tokens": MAX_TOKENS,
                "stream": False
            }

            response = requests.post(
                self.API_URL,
                headers=headers,
                json=payload,
                timeout=20
            )

            print(f"API Status Code: {response.status_code}")

            if response.status_code != 200:
                return f"API Error: {response.text}", False

            data = response.json()
            # Tolerate a missing or empty "choices" list and a null content.
            choices = data.get("choices") or [{}]
            final_answer = choices[0].get("message", {}).get("content", "") or ""
            print(f"Final Answer: {final_answer}")
            return final_answer, True

        except Exception as e:
            print(f"Query Error: {str(e)}")
            return f"Error: {str(e)}", False

# Single shared processor instance used by ask_question and the UI callbacks.
processor = DocumentProcessor()

def ask_question(question, chat_history):
    """Append the answer to *question* to the chat history.

    Args:
        question: Text typed by the user; may be empty or ``None``.
        chat_history: Existing list of ``(user, bot)`` pairs; ``None`` is
            treated as an empty history.

    Returns:
        A new history list with one extra ``(question, answer)`` pair, or a
        prompt to enter a valid question when the input is blank.
    """
    history = list(chat_history or [])
    if not question or not question.strip():
        return history + [("", "Please enter a valid question")]

    # query() also returns a success flag; the answer text already carries any
    # error message, so only the text is shown in the chat.
    answer, _ = processor.query(question)
    return history + [(question, answer)]

# Gradio UI: an upload row, a status box, the chat window and a question row.
# Components are laid out in the order they are created below.
with gr.Blocks(title="Document ChatBot") as app:
    gr.Markdown("## 🚀 Multi-Format Document ChatBot")
    # Upload widget and the button that triggers document processing.
    with gr.Row():
        files = gr.File(
            file_count="multiple",
            file_types=[".pdf", ".docx", ".txt", ".pptx", ".xls", ".xlsx", ".csv"],
            label="Upload Documents"
        )
        process_btn = gr.Button("Process Documents", variant="primary")
    # Read-only box showing the string returned by process_documents.
    status = gr.Textbox(label="Processing Status", interactive=False)
    chatbot = gr.Chatbot(height=500, label="Chat History")
    # Question input and its submit button.
    with gr.Row():
        question = gr.Textbox(
            label="Your Query",
            placeholder="Enter your question about the documents...",
            max_lines=3
        )
        ask_btn = gr.Button("Ask", variant="primary")
    clear_btn = gr.Button("Clear Chat")

    # Process the uploaded files and display the resulting status message.
    process_btn.click(
        fn=processor.process_documents,
        inputs=files,
        outputs=status
    )

    # Append the answer to the chat, then clear the question box.
    ask_btn.click(
        fn=ask_question,
        inputs=[question, chatbot],
        outputs=chatbot
    ).then(lambda: "", None, question)

    # Reset the chat history to an empty list.
    clear_btn.click(
        fn=lambda: [],
        inputs=None,
        outputs=chatbot
    )

# Start the Gradio server when the module is executed.
app.launch()