!pip install langdetect faiss-cpu transformers gradio groq sentence-transformers pymupdf python-docx pypdf2 python-pptx pandas docx2txt

import gradio as gr
import fitz  # PyMuPDF (installed as pymupdf)
import numpy as np
import requests
import faiss
import re
import json
import pandas as pd
from docx import Document
from pptx import Presentation
from sentence_transformers import SentenceTransformer
from concurrent.futures import ThreadPoolExecutor

# Configuration
GROQ_API_KEY = "YOUR_GROQ_API_KEY"  # Replace with your actual key
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # Proper embedding model
CHUNK_SIZE = 512
MAX_TOKENS = 4096
WORKERS = 8

# Initialize the embedding model
embedding_model = SentenceTransformer(EMBEDDING_MODEL)


class DocumentProcessor:
    def __init__(self):
        # Inner-product index sized to the embedding model's output dimension
        self.index = faiss.IndexFlatIP(embedding_model.get_sentence_embedding_dimension())
        self.chunks = []
        self.processor_pool = ThreadPoolExecutor(max_workers=WORKERS)

    def extract_text_from_pptx(self, file_path):
        try:
            prs = Presentation(file_path)
            return " ".join(shape.text for slide in prs.slides
                            for shape in slide.shapes if hasattr(shape, "text"))
        except Exception as e:
            print(f"PPTX Error: {str(e)}")
            return ""

    def extract_text_from_xls_csv(self, file_path):
        try:
            if file_path.endswith(('.xls', '.xlsx')):
                df = pd.read_excel(file_path)
            else:
                df = pd.read_csv(file_path)
            return " ".join(df.astype(str).values.flatten())
        except Exception as e:
            print(f"Spreadsheet Error: {str(e)}")
            return ""

    def extract_text_from_pdf(self, file_path):
        try:
            doc = fitz.open(file_path)
            return " ".join(page.get_text("text", flags=fitz.TEXT_PRESERVE_WHITESPACE)
                            for page in doc)
        except Exception as e:
            print(f"PDF Error: {str(e)}")
            return ""

    def process_file(self, file):
        try:
            file_path = file.name
            print(f"Processing: {file_path}")
            if file_path.endswith('.pdf'):
                text = self.extract_text_from_pdf(file_path)
            elif file_path.endswith('.docx'):
                text = " ".join(p.text for p in Document(file_path).paragraphs)
            elif file_path.endswith('.txt'):
                with open(file_path, 'r', encoding='utf-8') as f:
                    text = f.read()
            elif file_path.endswith('.pptx'):
                text = self.extract_text_from_pptx(file_path)
            elif file_path.endswith(('.xls', '.xlsx', '.csv')):
                text = self.extract_text_from_xls_csv(file_path)
            else:
                return ""
            clean_text = re.sub(r'\s+', ' ', text).strip()
            print(f"Extracted {len(clean_text)} characters from {file_path}")
            return clean_text
        except Exception as e:
            print(f"Processing Error: {str(e)}")
            return ""

    def semantic_chunking(self, text):
        # The source was truncated mid-regex here; the body below is a minimal
        # reconstruction, assuming sentence-boundary splitting with sentences
        # packed into chunks of roughly CHUNK_SIZE characters.
        sentences = re.split(r'(?<=[.!?])\s+', text)
        chunks, current = [], ""
        for sentence in sentences:
            if current and len(current) + len(sentence) + 1 > CHUNK_SIZE:
                chunks.append(current)
                current = sentence
            else:
                current = f"{current} {sentence}".strip()
        if current:
            chunks.append(current)
        return chunks
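
# --- Usage sketch (added for illustration; not part of the original notebook) ---
# Shows how the pieces above fit together: extract text from a file, chunk it,
# embed the chunks, and add the vectors to the FAISS index. "sample.pdf" is a
# hypothetical placeholder. Embeddings are L2-normalized so the inner-product
# index (IndexFlatIP) behaves as cosine similarity.
processor = DocumentProcessor()
text = processor.extract_text_from_pdf("sample.pdf")  # hypothetical input file
chunks = processor.semantic_chunking(text)
if chunks:
    vectors = embedding_model.encode(chunks, normalize_embeddings=True)
    processor.index.add(np.asarray(vectors, dtype="float32"))
    processor.chunks.extend(chunks)
    print(f"Indexed {processor.index.ntotal} chunks")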