import json
import faiss
import streamlit as st
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
import torch
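
# Streamlit demo: answer retrieval over a question/answer JSONL export using
# multilingual sentence embeddings and a FAISS index (CPU only).
# Assumed launch command (the script name may differ):
#   streamlit run app.py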

def list_to_numpy(obj):
    # Convert embeddings stored as JSON lists back into numpy arrays.
    if isinstance(obj, list):
        return np.array(obj)
    return obj

def load_documents_from_jsonl(embeddings_model, jsonl_path, createEmbeddings=False):
    tqdm.pandas(desc="Loading Data")
    df = pd.read_json(jsonl_path, lines=True)
    # Normalize column names so downstream code can rely on 'Question' / 'Answer'.
    df.columns = ['Question' if 'Question' in col else 'Answer' if 'Answer' in col else col for col in df.columns]
    if createEmbeddings:
        # Encode the questions on the fly with the provided sentence-transformer model.
        df['Embeddings'] = df['Question'].progress_apply(lambda text: embeddings_model.encode(str(text).lower()))
    elif 'Embeddings' in df.columns:
        df['Embeddings'] = df['Embeddings'].apply(list_to_numpy)
    return df

def generate_embeddings(model, text):
    # SentenceTransformer handles tokenization internally, so no separate tokenizer is needed.
    with torch.no_grad():
        embeddings = model.encode(text, convert_to_tensor=True)
    return embeddings.cpu().numpy()

def save_to_faiss(df):
    # Build a flat (exact) L2 index over the precomputed embeddings and persist it to disk.
    dimension = len(df['Embeddings'].iloc[0])
    db = faiss.IndexFlatL2(dimension)
    embeddings = np.array(df['Embeddings'].tolist()).astype('float32')
    db.add(embeddings)
    faiss.write_index(db, "faiss_index")
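
# Note: search_in_faiss below maps FAISS result indices back to dataframe rows by position,
# so df must keep the same row order that was used when the index was built.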

def search_in_faiss(query_vector, df, k=5):
    db = faiss.read_index("faiss_index")
    query_vector = np.array(query_vector).astype('float32').reshape(1, -1)
    distances, indices = db.search(query_vector, k)

    results = []
    for idx, dist in zip(indices[0], distances[0]):
        answer_text = df.iloc[idx]['Answer']
        # IndexFlatL2 returns squared L2 distances; take the square root to report true distances.
        dist = np.sqrt(dist)
        results.append({"Answer": answer_text, "Distance": dist})

    return results
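
# Example (non-Streamlit) usage sketch, assuming the JSONL export already contains an
# 'Embeddings' column compatible with the query model:
#   model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
#   df = load_documents_from_jsonl(model, 'ExportForAI1.jsonl')
#   save_to_faiss(df)
#   print(search_in_faiss(model.encode("your question"), df, k=3))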

def main():
    # Application title
    st.title("Demo for LLAMA-2 RAG with CPU only")

    model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

    # The JSONL export is assumed to already contain an 'Embeddings' column
    # (otherwise pass createEmbeddings=True to encode the questions here).
    df_qa = load_documents_from_jsonl(model, 'ExportForAI1.jsonl', False)
    save_to_faiss(df_qa)

    # Text input for the user's question
    input_text = st.text_input("Input", "")

    dataList = [
        {"Answer": "", "Distance": 0},
        {"Answer": "", "Distance": 0},
        {"Answer": "", "Distance": 0},
    ]

    # "Answer" button: embed the query and retrieve the nearest answers
    if st.button("Answer"):
        query_vector = model.encode(input_text.lower())
        dataList = search_in_faiss(query_vector, df_qa, k=3)

    # Table with the retrieved answers
    st.write("Most relevant answers")
    st.table(dataList)

    # Output area for the generated answer (generation is not wired up in this demo)
    st.write("LLAMA generated answer:")
    text_output = st.text_area("LLAMA output", "")


# Run the application
if __name__ == "__main__":
    main()