import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import gradio as gr
import json
import faiss
import numpy as np
import spaces
# Use the GPU if one is available, otherwise fall back to CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the CSV file with embeddings
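# The CSV is expected to contain an 'embedding' column (JSON-encoded list per row)
# and an 'Abstract' column holding the document text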
df = pd.read_csv('RBDx10kstats.csv')
df['embedding'] = df['embedding'].apply(json.loads)  # Convert JSON string back to list

# Convert embeddings to a numpy array
embeddings = np.array(df['embedding'].tolist(), dtype='float32')

# Setup FAISS
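# IndexFlatL2 does exact (brute-force) L2-distance search over all stored vectors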
index = faiss.IndexFlatL2(embeddings.shape[1])  # dimension should match the embedding size
index.add(embeddings)

# Load the Sentence Transformer model
sentence_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=device)

# Load the causal language model for response generation (GPT-2 here, despite the variable names)
llama_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
llama_model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2").to(device)

# Load the summarization model
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=0 if device == 'cuda' else -1)

# Define the function to find the most relevant document using FAISS
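# @spaces.GPU requests a ZeroGPU device on Hugging Face Spaces for up to `duration` seconds per call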
@spaces.GPU(duration=120)
def retrieve_relevant_doc(query):
    query_embedding = sentence_model.encode(query, convert_to_tensor=False)
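    # index.search returns (distances, indices) of the k nearest stored vectors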
    _, indices = index.search(np.array([query_embedding]), k=1)
    best_match_idx = indices[0][0]
    return df.iloc[best_match_idx]['Abstract']

# Define the function to generate a response
@spaces.GPU(duration=120)
def generate_response(query):
    relevant_doc = retrieve_relevant_doc(query)
    if len(relevant_doc) > 512:  # Summarize long documents before prompting
        # max_length is the summary length in tokens; keep it well below BART's 1024-token limit
        relevant_doc = summarizer(relevant_doc, max_length=150, min_length=50, do_sample=False)[0]['summary_text']
    
    input_text = f"Document: {relevant_doc}\n\nQuestion: {query}\n\nAnswer:"
    # Truncate the prompt so prompt + generated tokens fit within GPT-2's 1024-token context window
    inputs = llama_tokenizer(input_text, return_tensors="pt", truncation=True, max_length=768).to(device)
    
    # Set pad_token_id to eos_token_id to avoid the warning
    pad_token_id = llama_tokenizer.eos_token_id
    outputs = llama_model.generate(
        inputs["input_ids"], 
        attention_mask=inputs["attention_mask"], 
        max_new_tokens=256,  # generate up to 256 tokens beyond the prompt
        pad_token_id=pad_token_id
    )
    
    # Decode only the newly generated tokens so the prompt is not echoed back in the answer
    response = llama_tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
    return response

# Create a Gradio interface
iface = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(lines=2, placeholder="Enter your query here..."),
    outputs="text",
    title="RAG Chatbot",
    description="This chatbot retrieves relevant documents based on your query and generates responses using LLaMA."
)

# Launch the Gradio interface
iface.launch()