import os
import logging
from collections import Counter

import gradio as gr
import numpy as np
import faiss
import PyPDF2
from groq import Groq
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

logging.basicConfig(filename='query_logs.log', level=logging.INFO,
                    format='%(asctime)s:%(levelname)s:%(message)s')

# Read the Groq API key from the environment; never hardcode credentials.
groq_api_key = os.environ.get("GROQ_API_KEY")
if not groq_api_key:
    raise SystemExit("Set the GROQ_API_KEY environment variable before running.")

client = Groq(api_key=groq_api_key)

book_path = 'Generative_AI_Foundations_in_Python_Discover_key_techniques_and.pdf'

# Simple in-memory cache mapping each query string to its generated answer.
cache = {}

if os.path.exists(book_path):
    print(f"Book found at: {book_path}")
else:
    raise SystemExit("Book not found!")


def read_pdf(file_path):
    """Return the concatenated text of every page in the PDF."""
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page in reader.pages:
            # extract_text() can return None on image-only pages.
            text += page.extract_text() or ""
    return text


book_text = read_pdf(book_path)
print(book_text[:1000])
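
# NOTE: PyPDF2 is no longer maintained; its successor, pypdf, exposes the same
# PdfReader API, so the import above can be swapped for pypdf with minimal
# changes if you run into extraction issues.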

# Load the embedding model once at startup so individual queries and
# aggregation calls don't reload the weights.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')


def vectorize_text(text):
    """Embed the book line by line and index the embeddings in FAISS."""
    try:
        # Split on newlines and drop blanks so we don't embed empty strings.
        sentences = [s for s in text.split('\n') if s.strip()]
        embeddings = embedding_model.encode(sentences, show_progress_bar=True)

        # IndexFlatL2 performs exact nearest-neighbour search on L2 distance.
        index = faiss.IndexFlatL2(embeddings.shape[1])
        index.add(np.array(embeddings))
        print(f"Added {len(sentences)} sentences to the vector store.")

        return index, sentences
    except Exception as e:
        print(f"Error during vectorization: {str(e)}")
        return None, None


vector_index, sentences = vectorize_text(book_text)

if vector_index is not None:
    print("Vectorization complete.")
else:
    print("Vectorization failed.")


def generate_query_embedding(query, sentence_transformer_model):
    return sentence_transformer_model.encode([query])


def check_relevancy(D, threshold=0.4):
    # D holds squared L2 distances from FAISS, so smaller means more similar;
    # treat anything beyond the threshold as irrelevant.
    if D[0][0] > threshold:
        return False
    return True
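
# NOTE: check_relevancy() is never called below. One way to wire it into
# generate_answer_with_groq (an assumption, not required) is to replace the
# hardcoded `D[0][0] > 1.0` check with `not check_relevancy(D)`.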


def generate_diverse_responses(client, prompt, n=3):
    """Sample n completions across a spread of decoding settings."""
    responses = []
    for i in range(n):
        # Sweep from conservative to adventurous sampling; for the default n=3
        # this yields (temperature, top_p) = (0.5, 0.9), (0.7, 0.7), (0.9, 0.5).
        temperature = 0.5 + (i * 0.2)
        top_p = 0.9 - (i * 0.2)
        try:
            chat_completion = client.chat.completions.create(
                messages=[{
                    "role": "user",
                    "content": prompt,
                }],
                model="llama3-8b-8192",
                temperature=temperature,
                top_p=top_p,
            )
            responses.append(chat_completion.choices[0].message.content)
        except Exception as e:
            logging.error(f"Error generating response: {str(e)}")
            responses.append("Sorry, an error occurred while generating this response.")
    return responses


def aggregate_responses(responses):
    """Choose one answer from several sampled responses."""
    # If any response repeats verbatim, treat it as a consensus answer.
    most_common_response, count = Counter(responses).most_common(1)[0]
    if count > 1:
        return most_common_response

    # Otherwise pick the medoid: the response with the highest mean cosine
    # similarity to the whole set. (Comparing the set only against the first
    # response would always select the first, since its self-similarity is 1.)
    embeddings = embedding_model.encode(responses)
    similarities = cosine_similarity(embeddings)
    top_response_index = int(np.argmax(similarities.mean(axis=1)))
    return responses[top_response_index]
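
# Example of the medoid rule with hypothetical responses: two paraphrases and
# one outlier, so one of the agreeing pair wins.
#   aggregate_responses(["Paris is the capital of France.",
#                        "The capital of France is Paris.",
#                        "I am not sure."])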


def generate_answer_with_groq(query, vector_index, sentences, sentence_transformer_model):
    """Retrieve the most relevant passages for the query and answer with Groq."""
    if query in cache:
        logging.info(f"Cache hit for query: {query}")
        return cache[query]

    try:
        query_embedding = generate_query_embedding(query, sentence_transformer_model)

        # D: squared L2 distances; I: indices of the 5 nearest sentences.
        D, I = vector_index.search(np.array(query_embedding), k=5)

        # Fall back to a general answer when nothing in the book is close enough.
        if len(I[0]) == 0 or D[0][0] > 1.0:
            fallback_prompt = (
                "Nothing relevant was found in the document, so give a "
                f"general answer to this query: {query}"
            )
            chat_completion = client.chat.completions.create(
                messages=[{
                    "role": "user",
                    "content": fallback_prompt,
                }],
                model="llama3-8b-8192",
            )
            cache[query] = chat_completion.choices[0].message.content
            return cache[query]

        relevant_sentences = [sentences[i] for i in I[0]]
        combined_text = " ".join(relevant_sentences)

        final_prompt = f"**Relevant Information:**\n\n '{combined_text}'\n\n **Answer:** {query}"

        # Sample several candidates, then keep the most representative one.
        responses = generate_diverse_responses(client, final_prompt)
        final_response = aggregate_responses(responses)

        cache[query] = final_response
        return final_response

    except Exception as e:
        logging.error(f"Error during answer generation with Groq API: {str(e)}")
        return f"Error during answer generation: {str(e)}"


def gradio_interface(query):
    global vector_index, sentences

    if vector_index is None or sentences is None:
        return "Vector index or sentences not initialized properly."

    answer = generate_answer_with_groq(query, vector_index, sentences, embedding_model)

    logging.info(f"Query: {query}, Answer: {answer}")

    return f"### Here's your response:\n\n{answer}"


iface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Textbox(label="Enter your query"),
    outputs="markdown",
    title="Generative AI Foundations in Python: PDF-based Query Answering",
    description="Ask any question about the content of the loaded PDF and receive diverse, reliable answers."
)


if __name__ == "__main__":
    iface.launch()
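
# To run locally (assuming this file is saved as app.py):
#   GROQ_API_KEY="your-key" python app.py
# Gradio serves the UI at http://127.0.0.1:7860 by default.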