# Spaces: Sleeping
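# Install dependencies (shell/notebook command, not Python):
#   pip install langdetect faiss-cpu transformers gradio groq sentence-transformers pypdf2 python-pptx pandas docx2txt
# NOTE: the imports below use `fitz` (PyMuPDF) and `docx` (python-docx), which this list does not install.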
import json
import os
import re
import textwrap
from concurrent.futures import ThreadPoolExecutor

import faiss
import fitz  # PyMuPDF
import gradio as gr
import numpy as np
import pandas as pd
import requests
from docx import Document
from pptx import Presentation
from sentence_transformers import SentenceTransformer
# Configuration
# The Groq API key is a secret: it is read from the environment rather than
# being committed to source control. Any key that was ever committed here
# should be treated as compromised and rotated.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # sentence-embedding model
CHUNK_SIZE = 512  # chunk length budget, measured in characters
MAX_TOKENS = 4096  # `max_tokens` sent with each chat-completion request
WORKERS = 8  # thread-pool size used while processing uploads

# Initialize the embedding model once at import time; it is shared by
# indexing and querying.
embedding_model = SentenceTransformer(EMBEDDING_MODEL)
class DocumentProcessor:
    """Extract text from uploaded documents, index it, and answer questions.

    Text is extracted from PDF, DOCX, TXT, PPTX, XLS/XLSX and CSV files,
    split into chunks, embedded with the module-level ``embedding_model``
    and stored in a FAISS inner-product index.  ``query`` looks up the
    closest chunks and sends them with the question to the Groq
    chat-completions endpoint.
    """

    # Split on whitespace that follows '.', '?' or '!', except after
    # patterns such as "e.g." or "Mr." that do not end a sentence.
    _SENTENCE_BOUNDARY = re.compile(
        r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=[.?!])\s'
    )

    def __init__(self):
        """Create an empty index sized to the embedding model's output."""
        self.index = faiss.IndexFlatIP(
            embedding_model.get_sentence_embedding_dimension()
        )
        self.chunks = []
        self.processor_pool = ThreadPoolExecutor(max_workers=WORKERS)

    def extract_text_from_pptx(self, file_path):
        """Return the text of every shape on every slide, space-joined.

        Returns an empty string if the presentation cannot be read.
        """
        try:
            prs = Presentation(file_path)
            return " ".join(
                shape.text
                for slide in prs.slides
                for shape in slide.shapes
                if hasattr(shape, "text")
            )
        except Exception as e:
            print(f"PPTX Error: {str(e)}")
            return ""

    def extract_text_from_xls_csv(self, file_path):
        """Return all cell values of a spreadsheet or CSV file, space-joined.

        Files ending in .xls/.xlsx (any letter case) are read with
        ``pandas.read_excel``; everything else is read as CSV.  Returns an
        empty string if the file cannot be read.
        """
        try:
            if str(file_path).lower().endswith(('.xls', '.xlsx')):
                df = pd.read_excel(file_path)
            else:
                df = pd.read_csv(file_path)
            return " ".join(df.astype(str).values.flatten())
        except Exception as e:
            print(f"Spreadsheet Error: {str(e)}")
            return ""

    def extract_text_from_pdf(self, file_path):
        """Return the text of every page of a PDF, space-joined.

        Returns an empty string if the PDF cannot be read.
        """
        try:
            # The context manager closes the document even if a page fails.
            with fitz.open(file_path) as doc:
                return " ".join(
                    page.get_text("text", flags=fitz.TEXT_PRESERVE_WHITESPACE)
                    for page in doc
                )
        except Exception as e:
            print(f"PDF Error: {str(e)}")
            return ""

    def process_file(self, file):
        """Extract whitespace-normalised text from one uploaded file.

        Args:
            file: Either a path string or an object whose ``name`` attribute
                holds the path (both forms are accepted so the method works
                regardless of how the upload widget hands files over).

        Returns:
            The extracted text with runs of whitespace collapsed to single
            spaces, or "" for unsupported types and on any error.
        """
        try:
            file_path = file if isinstance(file, str) else file.name
            print(f"Processing: {file_path}")
            # Match extensions case-insensitively so "REPORT.PDF" is handled.
            lowered = file_path.lower()
            if lowered.endswith('.pdf'):
                text = self.extract_text_from_pdf(file_path)
            elif lowered.endswith('.docx'):
                text = " ".join(p.text for p in Document(file_path).paragraphs)
            elif lowered.endswith('.txt'):
                # Undecodable bytes are replaced rather than discarding the
                # whole file.
                with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                    text = f.read()
            elif lowered.endswith('.pptx'):
                text = self.extract_text_from_pptx(file_path)
            elif lowered.endswith(('.xls', '.xlsx', '.csv')):
                text = self.extract_text_from_xls_csv(file_path)
            else:
                return ""
            clean_text = re.sub(r'\s+', ' ', text).strip()
            print(f"Extracted {len(clean_text)} characters from {file_path}")
            return clean_text
        except Exception as e:
            print(f"Processing Error: {str(e)}")
            return ""

    def semantic_chunking(self, text, chunk_size=None, max_chunks=1000):
        """Split text into sentence-aligned chunks of bounded length.

        Sentences are packed greedily into chunks of at most ``chunk_size``
        characters.  A single sentence longer than the limit (typical for
        spreadsheet dumps, which contain no sentence punctuation) is wrapped
        at word boundaries so that no chunk exceeds the limit.

        Args:
            text: The text to split.
            chunk_size: Maximum chunk length in characters; defaults to the
                module-level ``CHUNK_SIZE``.
            max_chunks: Upper bound on the number of chunks returned.

        Returns:
            A list of non-empty chunk strings (empty for empty input).

        Raises:
            ValueError: If ``chunk_size`` is not positive.
        """
        limit = CHUNK_SIZE if chunk_size is None else chunk_size
        if limit <= 0:
            raise ValueError("chunk_size must be positive")

        chunks = []
        current = ""
        for sentence in self._SENTENCE_BOUNDARY.split(text or ""):
            sentence = sentence.strip()
            if not sentence:
                # Skip empties so blank input never yields an empty chunk.
                continue
            if len(sentence) <= limit:
                pieces = [sentence]
            else:
                pieces = textwrap.wrap(
                    sentence,
                    width=limit,
                    break_long_words=True,
                    break_on_hyphens=False,
                )
            for piece in pieces:
                if not current:
                    current = piece
                elif len(current) + 1 + len(piece) <= limit:
                    current = f"{current} {piece}"
                else:
                    chunks.append(current)
                    current = piece
        if current:
            chunks.append(current)
        return chunks[:max_chunks]

    def process_documents(self, files):
        """Extract, chunk, embed and index the given files.

        Replaces any previously indexed content.

        Args:
            files: The uploaded files (see ``process_file`` for accepted
                element types).

        Returns:
            A human-readable status message.
        """
        self.chunks = []
        if not files:
            return "No files uploaded!"
        print("\n" + "="*40 + " PROCESSING DOCUMENTS " + "="*40)
        # File extraction is I/O-heavy, so it runs on the thread pool.
        texts = list(self.processor_pool.map(self.process_file, files))
        # Chunking is pure-Python CPU work; threads would add no speed-up.
        all_chunks = [
            chunk for text in texts for chunk in self.semantic_chunking(text)
        ]
        print(f"Total chunks generated: {len(all_chunks)}")
        if not all_chunks:
            return "Error: No chunks generated from documents"
        try:
            embeddings = embedding_model.encode(
                all_chunks,
                batch_size=32,
                convert_to_tensor=True,
                show_progress_bar=False
            ).cpu().numpy().astype('float32')
            self.index.reset()
            self.index.add(embeddings)
            self.chunks = all_chunks
            return f"Processed {len(all_chunks)} chunks from {len(files)} files"
        except Exception as e:
            print(f"Embedding Error: {str(e)}")
            return f"Error: {str(e)}"

    def _gather_context(self, indices):
        """Join the chunks at the valid positions in *indices* with newlines.

        FAISS pads its result with -1 when fewer hits than requested exist;
        those (and any out-of-range value) are ignored instead of being
        treated as Python negative indices.
        """
        total = len(self.chunks)
        return "\n".join(
            self.chunks[int(i)] for i in indices if 0 <= i < total
        )

    def query(self, question, top_k=3):
        """Answer a question from the indexed documents.

        Args:
            question: The user's question.
            top_k: Number of closest chunks to pass to the model as context.

        Returns:
            A ``(text, success)`` tuple: the answer and True on success, or
            an error/explanatory message and False otherwise.
        """
        if not self.chunks:
            return "Please process documents first", False
        if not GROQ_API_KEY:
            return "Error: GROQ_API_KEY is not configured", False
        try:
            print("\n" + "="*40 + " QUERY PROCESSING " + "="*40)
            print(f"Question: {question}")
            question_embedding = embedding_model.encode(
                [question], convert_to_tensor=True
            ).cpu().numpy().astype('float32')
            # Never ask for more neighbours than there are chunks.
            k = max(1, min(top_k, len(self.chunks)))
            _, indices = self.index.search(question_embedding, k)
            print(f"Top indices: {indices}")
            context = self._gather_context(indices[0])
            print(f"Context length: {len(context)} characters")
            headers = {
                "Authorization": f"Bearer {GROQ_API_KEY}",
                "Content-Type": "application/json"
            }
            payload = {
                "messages": [{
                    "role": "user",
                    "content": f"Answer concisely based on the context: {question}\nContext: {context}"
                }],
                # NOTE(review): confirm this model id is still served by Groq.
                "model": "mixtral-8x7b-32768",
                "temperature": 0.3,
                "max_tokens": MAX_TOKENS,
                "stream": False  # non-streaming keeps response handling simple
            }
            response = requests.post(
                "https://api.groq.com/openai/v1/chat/completions",
                headers=headers,
                json=payload,
                timeout=20
            )
            print(f"API Status Code: {response.status_code}")
            if response.status_code != 200:
                return f"API Error: {response.text}", False
            data = response.json()
            # An absent or empty "choices" list yields an empty answer
            # instead of raising IndexError.
            choices = data.get("choices") or [{}]
            final_answer = choices[0].get("message", {}).get("content", "")
            print(f"Final Answer: {final_answer}")
            return final_answer, True
        except Exception as e:
            print(f"Query Error: {str(e)}")
            return f"Error: {str(e)}", False
# Single shared processor instance used by the upload and chat callbacks.
processor = DocumentProcessor()
def ask_question(question, chat_history):
    """Append the answer to *question* to the chat history.

    Args:
        question: The text the user entered; None or blank input is
            rejected with a prompt to enter a valid question.
        chat_history: The existing list of (question, answer) pairs, or
            None when no conversation exists yet.

    Returns:
        A new history list with one more (question, answer) pair; the list
        passed in is not modified.
    """
    history = list(chat_history or [])
    if not (question or "").strip():
        return history + [("", "Please enter a valid question")]
    # The success flag is not needed here: on failure the returned text
    # already carries the error message shown to the user.
    answer, _success = processor.query(question)
    return history + [(question, answer)]
# Gradio UI: upload + process row, status box, chat window, question row.
# NOTE(review): the layout nesting below was reconstructed from a source
# whose indentation was lost; confirm which widgets belong in each Row.
with gr.Blocks(title="Document ChatBot") as app:
    # The heading glyph was mis-decoded ("π") in the source; restored as an
    # emoji. NOTE(review): confirm the intended icon.
    gr.Markdown("## 📚 Multi-Format Document ChatBot")

    with gr.Row():
        files = gr.File(
            file_count="multiple",
            file_types=[".pdf", ".docx", ".txt", ".pptx", ".xls", ".xlsx", ".csv"],
            label="Upload Documents"
        )
        process_btn = gr.Button("Process Documents", variant="primary")

    status = gr.Textbox(label="Processing Status", interactive=False)
    chatbot = gr.Chatbot(height=500, label="Chat History")

    with gr.Row():
        question = gr.Textbox(
            label="Your Query",
            placeholder="Enter your question about the documents...",
            max_lines=3
        )
        ask_btn = gr.Button("Ask", variant="primary")
        clear_btn = gr.Button("Clear Chat")

    # Index the uploaded files and report the outcome in the status box.
    process_btn.click(
        fn=processor.process_documents,
        inputs=files,
        outputs=status
    )

    # Answer the question, then clear the input box for the next one.
    ask_btn.click(
        fn=ask_question,
        inputs=[question, chatbot],
        outputs=chatbot
    ).then(lambda: "", None, question)

    # Reset the conversation to an empty history.
    clear_btn.click(
        fn=lambda: [],
        inputs=None,
        outputs=chatbot
    )

app.launch()