"""Gradio question-answering demo backed by a retrieval-augmented pipeline.

Queries are embedded with a sentence-transformers model, matched against a
Pinecone index, and answered by a hosted Zephyr-7B text-generation endpoint.
"""

import gradio as gr
import os
import pinecone
import time
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import torch
import sentence_transformers
from langchain.vectorstores import Pinecone
from langchain.llms.huggingface_text_gen_inference import HuggingFaceTextGenInference
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

# --- Static configuration -------------------------------------------------
embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'
index_name = 'llama-rag'
text_field = 'text'  # metadata key under which each vector's text is stored

# --- Embedding model ------------------------------------------------------
# No device is passed explicitly; the GPU-selection code below is disabled.
# device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
#     model_kwargs={'device': device},
#     encode_kwargs={'device': device, 'batch_size': 32}
embed_model = HuggingFaceEmbeddings(model_name=embed_model_id)

# --- Pinecone client ------------------------------------------------------
# The API key comes from app.pinecone.io, the environment from its console;
# both are read from the process environment.
pinecone.init(
    api_key=os.getenv('PINECONE_API_KEY'),
    environment=os.getenv('PINECONE_ENVIRONMENT'),
)

# Two throwaway sentences are embedded; the result is only referenced by the
# disabled index-creation code below (to derive the vector dimension).
docs = ["this is one document", "and another document"]
embeddings = embed_model.embed_documents(docs)

# Index creation is disabled: this script only connects to the named index.
# if index_name not in pinecone.list_indexes():
#     pinecone.create_index(
#         index_name,
#         dimension=len(embeddings[0]),
#         metric='cosine'
#     )
#     # wait for index to finish initialization
#     while not pinecone.describe_index(index_name).status['ready']:
#         time.sleep(1)

index = pinecone.Index(index_name)
index.describe_index_stats()  # return value is discarded

# --- LangChain vector store wrapper ---------------------------------------
vectorstore = Pinecone(index, embed_model.embed_query, text_field)


# Hosted inference endpoint for the Zephyr-7B model.
API_URL = "https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta"

# Build the Authorization header only when a token is configured; formatting
# a missing variable would otherwise produce the literal "Bearer None".
_hf_token = os.environ.get('API_KEY')
headers = {"Authorization": f"Bearer {_hf_token}"} if _hf_token else {}

llm = HuggingFaceTextGenInference(
    inference_server_url=API_URL,
    max_new_tokens=1024,
    top_k=10,
    top_p=0.95,
    typical_p=0.95,
    temperature=0.01,
    repetition_penalty=1.03,
    # `headers` used to be built but never handed to the client, so the token
    # was never sent. server_kwargs is forwarded to the underlying
    # text-generation client, which accepts a `headers` mapping.
    server_kwargs={"headers": headers} if headers else {},
)

# 'stuff' chain: retrieved documents are placed into a single prompt.
rag_pipeline = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=vectorstore.as_retriever(),
)

def question(question):
    """Answer a query with the module-level retrieval QA chain.

    Args:
        question: The query, passed unchanged to ``rag_pipeline``.

    Returns:
        The value stored under the ``'result'`` key of the chain output.

    Side effects:
        Rebinds the module-level name ``chatbot`` to the full chain output.
    """
    global chatbot
    # Bind the global and a local to the same response object; the local is
    # what gets returned from, so the return never re-reads the global.
    chatbot = response = rag_pipeline(question)
    return response['result']

# One text input and one text output, wired to the `question` handler.
demo = gr.Interface(
    fn=question,
    inputs="text",
    outputs="text",
)

# Launch the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()