File size: 2,973 Bytes
3c4a7fb
bb2f2e7
3c4a7fb
 
 
6ecdc04
 
 
3c4a7fb
bb2f2e7
3c4a7fb
 
 
bb2f2e7
3c4a7fb
 
 
 
 
 
 
 
 
 
 
6ecdc04
3c4a7fb
bdeb96a
 
 
 
3c4a7fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bb2f2e7
 
 
3c4a7fb
bb2f2e7
bdeb96a
 
 
2714936
bdeb96a
3c4a7fb
 
bb2f2e7
 
 
 
 
3c4a7fb
 
bb2f2e7
 
 
3c4a7fb
bb2f2e7
 
 
3c4a7fb
bb2f2e7
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import json
import streamlit as st
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
#from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
import torch

# Placeholder rows: three empty answer/distance records.
dataList = [{"Answer": "", "Distance": 0} for _ in range(3)]
def list_to_numpy(obj):
    """Return *obj* as a NumPy array if it is a ``list``; otherwise return it unchanged."""
    return np.array(obj) if isinstance(obj, list) else obj

def load_documents_from_jsonl(embeddings_model, jsonl_path, createEmbeddings=False):
    """Read a JSON Lines file into a DataFrame and normalise its column names.

    A column whose name contains ``'Question'`` is renamed to exactly
    ``'Question'``; failing that, a name containing ``'Answer'`` becomes
    ``'Answer'``; every other name is kept as is.

    Args:
        embeddings_model: Accepted but not used by this function.
        jsonl_path: Passed to ``pandas.read_json`` with ``lines=True``.
        createEmbeddings: Accepted but not used by this function.

    Returns:
        The loaded DataFrame with the renamed columns.
    """
    def _canonical_name(col):
        # 'Question' takes precedence over 'Answer' when both substrings occur.
        if 'Question' in col:
            return 'Question'
        if 'Answer' in col:
            return 'Answer'
        return col

    # tqdm.pandas() is what makes `progress_apply` available below.
    tqdm.pandas(desc="Loading Data")
    raw_frame = pd.read_json(jsonl_path, lines=True)
    # Identity function: the data is passed through unchanged, only the
    # progress bar is driven by this call.
    frame = raw_frame.progress_apply(lambda column: column)
    frame.columns = [_canonical_name(col) for col in frame.columns]
    return frame
       
def generate_embeddings(tokenizer, model, text):
    """Encode *text* with ``model.encode`` and return the result as a NumPy array.

    Args:
        tokenizer: Accepted but not used by this function.
        model: Object exposing ``encode(text, convert_to_tensor=True)`` whose
            result supports ``.cpu().numpy()``.
        text: Input forwarded unchanged to ``model.encode``.

    Returns:
        The encoded result moved to the CPU and converted with ``.numpy()``.
    """
    # Encoding is inference only, so gradient tracking is switched off.
    with torch.no_grad():
        encoded = model.encode(text, convert_to_tensor=True)
    on_cpu = encoded.cpu()
    return on_cpu.numpy()

def save_to_faiss(df):
    """Build a flat L2 FAISS index from ``df['Embeddings']`` and write it to disk.

    The index dimension is the length of the first entry of the
    ``'Embeddings'`` column; all entries are stacked into a float32 array,
    added to the index, and the index is written to the file ``"faiss_index"``.

    NOTE(review): this relies on a module-level ``faiss`` name, which the
    visible import block of this file does not provide - confirm it is
    imported.

    Args:
        df: DataFrame with an ``'Embeddings'`` column.
    """
    embedding_column = df['Embeddings']
    vector_size = len(embedding_column.iloc[0])
    index = faiss.IndexFlatL2(vector_size)
    vectors = np.array(embedding_column.tolist()).astype('float32')
    index.add(vectors)
    faiss.write_index(index, "faiss_index")

def search_in_faiss(query_vector, df, k=5):
    """Return up to *k* nearest answers for *query_vector* from the saved index.

    The index is read from the file ``"faiss_index"``. The query is cast to
    float32 and reshaped to a single row before searching.

    Args:
        query_vector: Query embedding; converted with ``np.array`` and
            reshaped to ``(1, -1)``.
        df: DataFrame with an ``'Answer'`` column, looked up positionally with
            the row numbers returned by the index. Presumably the same frame
            that was used to build the index - verify against the caller.
        k: Maximum number of neighbours to request.

    Returns:
        A list of ``{"Answer": ..., "Distance": ...}`` dicts, one per
        neighbour found (possibly fewer than *k*). ``Distance`` is the square
        root of the value reported by the index.
    """
    db = faiss.read_index("faiss_index")
    query = np.array(query_vector).astype('float32').reshape(1, -1)
    distances, indices = db.search(query, k)

    results = []
    for idx, dist in zip(indices[0], distances[0]):
        # FAISS fills missing neighbours with label -1 when the index holds
        # fewer than k vectors; df.iloc[-1] would silently return the last
        # row as a bogus hit, so such slots are skipped.
        if idx < 0:
            continue
        results.append({
            "Answer": df.iloc[idx]['Answer'],
            # Clamp at zero so a marginally negative value (floating-point
            # round-off) does not turn into NaN under the square root.
            "Distance": np.sqrt(max(dist, 0.0)),
        })

    return results

def main():
    """Render the Streamlit demo page.

    Loads the sentence-embedding model and the question/answer data, writes
    the FAISS index, and shows an input box, an "Answer" button, a table of
    the closest answers, and an (empty) text area for the generated answer.
    """
    # Application title
    st.title("Demo for LLAMA-2 RAG with CPU only")

    # NOTE(review): the `SentenceTransformer` import is commented out at the
    # top of this file; it has to be in scope for this call to succeed.
    model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

    # NOTE(review): save_to_faiss reads df['Embeddings']; presumably the JSONL
    # export already carries that column - confirm.
    df_qa = load_documents_from_jsonl(model, 'ExportForAI1.jsonl', False)
    save_to_faiss(df_qa)

    # Text field for entering the question
    input_text = st.text_input("Input", "")

    # Start from the module-level placeholder rows so the table can be drawn
    # before any search has run. A separate local name is used on purpose:
    # assigning to `dataList` inside this function would make it local and
    # raise UnboundLocalError whenever the button has not been clicked.
    results = dataList

    # "Answer" button
    if st.button("Answer"):
        query_vector = model.encode(input_text.lower())
        # Look answers up in the same frame that was indexed above.
        results = search_in_faiss(query_vector, df_qa, k=3)

    # Table with the retrieved answers
    st.write("Most relevants answers")
    st.table(results)

    # Text area for the generated text
    st.write("LLAMA generated answer:")
    st.text_area("", "")

# Run the application's main entry point
if __name__ == "__main__":
    main()