import gradio as gr import PyPDF2 from transformers import AutoTokenizer, AutoModel import torch import weaviate import cohere auth_config = weaviate.AuthApiKey(api_key="TWEMtQTPxPhVp4Jwv9hWCFiut6u46sfjioGE") client = weaviate.Client( "https://l5zyhqlrsjw4bowu3pg9g.c0.us-west3.gcp.weaviate.cloud", auth_client_secret=auth_config ) cohere_client = cohere.Client("LEvCVeZkqZMW1aLYjxDqlstCzWi4Cvlt9PiysqT8") def load_pdf(file): reader = PyPDF2.PdfReader(file) text = '' for page in range(len(reader.pages)): text += reader.pages[page].extract_text() return text tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2') model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2') def get_embeddings(text): inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True) with torch.no_grad(): embeddings = model(**inputs).last_hidden_state.mean(dim=1).squeeze().cpu().numpy() return embeddings def upload_document_chunks(chunks): for idx, chunk in enumerate(chunks): embedding = get_embeddings(chunk) client.data_object.create( {"content": chunk}, "Document", vector=embedding.tolist() ) def query_answer(query): query_embedding = get_embeddings(query) result = client.query.get("Document", ["content"])\ .with_near_vector({"vector": query_embedding.tolist()})\ .with_limit(3)\ .do() return result def generate_response(context, query): response = cohere_client.generate( model='command', prompt=f"Context: {context}\n\nQuestion: {query}?\nAnswer:", max_tokens=100 ) return response.generations[0].text.strip() def qa_pipeline(pdf_file, query): document_text = load_pdf(pdf_file) document_chunks = [document_text[i:i+500] for i in range(0, len(document_text), 500)] upload_document_chunks(document_chunks) response = query_answer(query) context = ' '.join([doc['content'] for doc in response['data']['Get']['Document']]) answer = generate_response(context, query) return context, answer with gr.Blocks(theme="compact") as demo: gr.Markdown( """
Upload a PDF document, ask questions, and receive answers based on the document content.