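# RAG chatbot: retrieve Wikipedia passages with a FAISS index and answer questions
# with Mixtral-8x7B-Instruct, served through a simple Gradio interface.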
import os
import torch  # used below for bfloat16 dtypes
import faiss
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
import gradio as gr
from accelerate import Accelerator

# Load the Hugging Face API key from an environment variable
hf_api_key = os.getenv('HF_API_KEY')

# λͺ¨λΈ ID 및 ν† ν¬λ‚˜μ΄μ € μ„€μ •
model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_api_key)
accelerator = Accelerator()  # create an Accelerator instance

# λͺ¨λΈ λ‘œλ”©
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=hf_api_key,
    torch_dtype=torch.bfloat16,  # set the weight dtype via torch
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )
)
model = accelerator.prepare(model)  # prepare the model with Accelerator

# Load the dataset and build a FAISS index over the precomputed embeddings
ST = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
dataset = load_dataset("not-lain/wikipedia", revision="embedded")
data = dataset["train"]
data = data.add_faiss_index("embeddings")

# Search and response generation functions
def search(query: str, k: int = 3):
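    """Embed the query and return the k nearest Wikipedia passages from the FAISS index."""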
    embedded_query = ST.encode(query)
    scores, retrieved_examples = data.get_nearest_examples("embeddings", embedded_query, k=k)
    return scores, retrieved_examples

# The rest of the code is kept the same as before


def format_prompt(prompt, retrieved_documents, k):
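    """Concatenate the question with the text of the top-k retrieved documents."""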
    PROMPT = f"Question:{prompt}\nContext:"
    for idx in range(k):
        PROMPT += f"{retrieved_documents['text'][idx]}\n"
    return PROMPT

# System instructions prepended to every prompt
SYS_PROMPT = (
    "You are an assistant for answering questions. You are given the extracted parts "
    "of a long document and a question. Provide a conversational answer. If you don't "
    "know the answer, just say 'I do not know.' Don't make up an answer."
)

def generate(formatted_prompt):
    formatted_prompt = formatted_prompt[:2000]  # truncate to stay within GPU memory limits
    # Mixtral's chat template only accepts alternating user/assistant turns,
    # so the system instructions are folded into the user message
    messages = [{"role": "user", "content": SYS_PROMPT + "\n\n" + formatted_prompt}]
    input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt").to(accelerator.device)
    outputs = model.generate(
        input_ids,
        max_new_tokens=1024,
        eos_token_id=tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.6,
        top_p=0.9
    )
    response = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)
    return response

def rag_chatbot_interface(prompt: str, k: int = 2):
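    """Full RAG pipeline: retrieve documents, build the prompt, and generate an answer."""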
    scores, retrieved_documents = search(prompt, k)
    formatted_prompt = format_prompt(prompt, retrieved_documents, k)
    return generate(formatted_prompt)


# Gradio UI: a question text box in, the generated answer out
iface = gr.Interface(
    fn=rag_chatbot_interface,
    inputs=gr.Textbox(label="Enter your question"),
    outputs=gr.Textbox(label="Answer"),
    title="Retrieval-Augmented Generation Chatbot",
    description="This chatbot uses a retrieval-augmented generation approach to provide more accurate answers. It first searches for relevant documents and then generates a response based on the prompt and the retrieved documents."
)

iface.launch()
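# When running on a remote machine or in a notebook, a temporary public link
# can be requested instead with: iface.launch(share=True)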