File size: 3,735 Bytes
82b49ce
 
 
 
 
 
 
 
 
a7d22b6
 
82b49ce
 
a7d22b6
82b49ce
 
39a94dc
82b49ce
 
a7d22b6
82b49ce
 
 
 
 
 
 
 
 
a7d22b6
82b49ce
 
 
 
 
 
 
a7d22b6
82b49ce
 
 
 
 
 
a7d22b6
82b49ce
 
 
 
 
a7d22b6
82b49ce
 
 
 
 
a7d22b6
82b49ce
 
 
 
 
 
 
 
 
a7d22b6
82b49ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a7d22b6
82b49ce
 
 
 
 
 
 
 
 
 
a7d22b6
82b49ce
 
 
 
d384d9d
82b49ce
 
a7d22b6
82b49ce
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import os
import requests
from io import BytesIO
from PyPDF2 import PdfReader
from tempfile import NamedTemporaryFile
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from groq import Groq
import gradio as gr

# Initialize Groq client.
# The API key is read from the GROQ_API_KEY environment variable instead of
# being embedded in the source, so the credential never ends up in version
# control. Any key that was previously committed in this file should be
# treated as leaked and rotated.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# Predefined list of Google Drive links
drive_links = [
    "https://drive.google.com/file/d/1x83IIMfuFPFuCzZiRJfT0obBf9PUWHA2/view",
    # Add more links here as needed
]

# Function to download PDF from Google Drive
def download_pdf_from_drive(drive_link, timeout=30):
    """Download a PDF shared through a Google Drive link into memory.

    Args:
        drive_link: Share URL containing a ``/d/<file_id>`` segment, e.g.
            ``https://drive.google.com/file/d/<file_id>/view``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A ``BytesIO`` wrapping the downloaded bytes.

    Raises:
        ValueError: If no file ID can be extracted from ``drive_link``.
        Exception: If the response status is not 200, or the body does not
            look like a PDF.
    """
    _, marker, remainder = drive_link.partition('/d/')
    # The ID ends at the next path separator, query string, or fragment, so
    # links such as ".../d/<id>?usp=sharing" resolve to the bare ID.
    file_id = remainder.split('/', 1)[0].split('?', 1)[0].split('#', 1)[0]
    if not marker or not file_id:
        raise ValueError(
            f"Could not find a Google Drive file ID in link: {drive_link!r}"
        )

    # Let requests build and escape the query string; a timeout keeps a
    # stalled connection from hanging application start-up indefinitely.
    response = requests.get(
        "https://drive.google.com/uc",
        params={"id": file_id, "export": "download"},
        timeout=timeout,
    )
    if response.status_code != 200:
        raise Exception("Failed to download the PDF file from Google Drive.")

    # A 200 response is not necessarily the file itself (it may be an HTML
    # page instead), so check for the PDF header near the start of the body.
    if b"%PDF" not in response.content[:1024]:
        raise Exception(
            "Google Drive did not return a PDF file; "
            "check the link's sharing settings."
        )
    return BytesIO(response.content)

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_stream):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_stream: Binary file-like object (or anything ``PdfReader``
            accepts) holding the PDF.

    Returns:
        The extracted page texts joined with newlines. Pages that yield no
        text contribute an empty string.
    """
    pdf_reader = PdfReader(pdf_stream)
    # ``or ""`` guards against a page yielding None instead of a string.
    # Joining with a newline keeps the last word of one page from fusing
    # with the first word of the next.
    return "\n".join(page.extract_text() or "" for page in pdf_reader.pages)

# Function to split text into chunks
def chunk_text(text, chunk_size=500, chunk_overlap=50):
    """Split ``text`` into overlapping chunks.

    Args:
        text: The string to split.
        chunk_size: Passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``.
        chunk_overlap: Passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``.

    Returns:
        Whatever ``split_text`` returns for ``text``.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    pieces = splitter.split_text(text)
    return pieces

# Function to create embeddings and store them in FAISS
def create_embeddings_and_store(chunks):
    """Embed ``chunks`` and index them in a FAISS vector store.

    Args:
        chunks: Text chunks to embed.

    Returns:
        The FAISS store built from ``chunks`` with the
        ``sentence-transformers/all-MiniLM-L6-v2`` embedding model.
    """
    model_id = "sentence-transformers/all-MiniLM-L6-v2"
    embedder = HuggingFaceEmbeddings(model_name=model_id)
    return FAISS.from_texts(chunks, embedding=embedder)

# Function to query the vector database and interact with Groq
def query_vector_db(query, vector_db, k=3, model="llama3-8b-8192"):
    """Answer ``query`` with the Groq chat model, grounded in retrieved chunks.

    Args:
        query: The user's question.
        vector_db: Vector store exposing ``similarity_search(query, k=...)``.
        k: Number of chunks to retrieve as context (default 3).
        model: Groq model identifier used for the completion.

    Returns:
        The message content of the first completion choice.
    """
    # Retrieve the chunks most similar to the query.
    docs = vector_db.similarity_search(query, k=k)
    context = "\n".join(doc.page_content for doc in docs)

    # The retrieved context goes in the system message; the question itself
    # is sent unchanged as the user message.
    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": f"Use the following context:\n{context}"},
            {"role": "user", "content": query},
        ],
        model=model,
    )
    return chat_completion.choices[0].message.content

# Process the predefined Google Drive links
def process_drive_links():
    """Build a vector store from every PDF listed in ``drive_links``.

    Each link is downloaded, its text extracted and chunked. A link that
    fails is reported on stdout and skipped, so one bad link does not discard
    the others.

    Returns:
        The vector store built from all collected chunks, or ``None`` when
        no chunks could be produced. An error string is never returned,
        because the caller treats any non-``None`` result as a usable store.
    """
    all_chunks = []
    for link in drive_links:
        try:
            pdf_stream = download_pdf_from_drive(link)
            text = extract_text_from_pdf(pdf_stream)
            all_chunks.extend(chunk_text(text))
        except Exception as e:
            # Deliberately broad: this is the start-up boundary, and any
            # failure for one link should be reported and skipped.
            print(f"Error processing link {link}: {e}")

    if not all_chunks:
        return None

    # Generate embeddings and store them in FAISS.
    return create_embeddings_and_store(all_chunks)

# Build the vector store once at import time; the Gradio callback reuses it.
vector_db = process_drive_links()


def gradio_query_interface(user_query):
    """Gradio callback: answer ``user_query`` from the module-level store.

    Returns an error message when the store is ``None``, a prompt to enter a
    query when ``user_query`` is empty, and otherwise the result of
    ``query_vector_db``.
    """
    if vector_db is None:
        return "Error: Could not process Google Drive links."
    if user_query:
        return query_vector_db(user_query, vector_db)
    return "Please enter a query."

# One textbox in, one textbox out; created in the same order as before.
_query_box = gr.Textbox(label="Enter your query:")
_answer_box = gr.Textbox(label="Response from LLM:")

iface = gr.Interface(
    fn=gradio_query_interface,
    inputs=_query_box,
    outputs=_answer_box,
    title="BISE Buddy - A RAG-Based Application with Google Drive Support",
    description="This application processes predefined Google Drive links, extracts text, and uses embeddings for querying.",
)

# Start serving the interface.
iface.launch()