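"""BISE Buddy: a small RAG application.

Downloads PDFs from predefined Google Drive links, extracts and chunks their text,
embeds the chunks with a sentence-transformers model into a FAISS index, and answers
user queries through a Gradio UI backed by a Groq-hosted Llama 3 model.
"""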
import os
import requests
from io import BytesIO
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from groq import Groq
import gradio as gr
# Initialize the Groq client (the API key is read from the GROQ_API_KEY environment
# variable, e.g. set as a Space secret, rather than hard-coded in the source)
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
# Predefined list of Google Drive links
drive_links = [
    "https://drive.google.com/file/d/1x83IIMfuFPFuCzZiRJfT0obBf9PUWHA2/view",
    # Add more links here as needed
]
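# Note: the links above are assumed to be shared as "anyone with the link can view";
# the direct-download endpoint used below does not work for private files, and very
# large files may require an extra confirmation step that this code does not handle.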
# Function to download a PDF from Google Drive
def download_pdf_from_drive(drive_link):
    file_id = drive_link.split('/d/')[1].split('/')[0]
    download_url = f"https://drive.google.com/uc?id={file_id}&export=download"
    response = requests.get(download_url)
    if response.status_code == 200:
        return BytesIO(response.content)
    else:
        raise Exception("Failed to download the PDF file from Google Drive.")
# Function to extract text from a PDF
def extract_text_from_pdf(pdf_stream):
    pdf_reader = PdfReader(pdf_stream)
    text = ""
    for page in pdf_reader.pages:
        # extract_text() can return None for pages with no extractable text
        text += page.extract_text() or ""
    return text
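# Note: PyPDF2 only extracts text that is embedded in the PDF; scanned, image-only
# pages come back empty, so such documents would need OCR before this step.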
# Function to split text into chunks
def chunk_text(text, chunk_size=500, chunk_overlap=50):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return text_splitter.split_text(text)
# Function to create embeddings and store them in FAISS
def create_embeddings_and_store(chunks):
    # all-MiniLM-L6-v2 is a small sentence-transformers model (downloaded from the
    # Hugging Face Hub on first use) that maps each chunk to a dense vector
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vector_db = FAISS.from_texts(chunks, embedding=embeddings)
    return vector_db
# Function to query the vector database and interact with Groq
def query_vector_db(query, vector_db):
    # Retrieve the most relevant chunks for the query
    docs = vector_db.similarity_search(query, k=3)
    context = "\n".join([doc.page_content for doc in docs])
    # Interact with the Groq API, passing the retrieved context in the system message
    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": f"Use the following context:\n{context}"},
            {"role": "user", "content": query},
        ],
        model="llama3-8b-8192",
    )
    return chat_completion.choices[0].message.content
# Process the predefined Google Drive links and build the vector store
def process_drive_links():
    all_chunks = []
    for link in drive_links:
        try:
            # Download PDF
            pdf_stream = download_pdf_from_drive(link)
            # Extract text
            text = extract_text_from_pdf(pdf_stream)
            # Chunk text
            chunks = chunk_text(text)
            all_chunks.extend(chunks)
        except Exception as e:
            # Skip links that fail instead of returning an error string, so the
            # `vector_db is None` check below stays meaningful
            print(f"Error processing link {link}: {e}")
    if all_chunks:
        # Generate embeddings and store in FAISS
        vector_db = create_embeddings_and_store(all_chunks)
        return vector_db
    return None
# Build the FAISS index once at startup from the predefined Drive links
vector_db = process_drive_links()

# Gradio interface
def gradio_query_interface(user_query):
    if vector_db is None:
        return "Error: Could not process Google Drive links."
    if not user_query:
        return "Please enter a query."
    response = query_vector_db(user_query, vector_db)
    return response
iface = gr.Interface(
    fn=gradio_query_interface,
    inputs=gr.Textbox(label="Enter your query:"),
    outputs=gr.Textbox(label="Response from LLM:"),
    title="BISE Buddy - A RAG-Based Application with Google Drive Support",
    description="This application processes predefined Google Drive links, extracts text, and uses embeddings for querying.",
)
iface.launch()
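# Optional smoke test (a minimal sketch, not part of the app): comment out
# iface.launch() above and run the lines below instead to exercise retrieval
# and generation directly, assuming the index built successfully.
# if vector_db is not None:
#     print(query_vector_db("Summarize the document.", vector_db))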