# Streamlit app: recommends dependencies (agencies) for information requests
# via semantic search over precomputed sentence embeddings.
# (Scrape artifacts removed: file-size header, commit hash, line-number gutter.)
import streamlit as st
import pandas as pd
from datasets import Dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import semantic_search
import torch

# GTR-T5 sentence encoder. NOTE(review): presumably the same model that
# produced the precomputed embeddings below — confirm, otherwise the
# similarity scores are meaningless.
model = SentenceTransformer("sentence-transformers/gtr-t5-large")


#  Read files
# Precomputed corpus embeddings (CSV hosted on a gist); first column is the index.
url = "https://gist.githubusercontent.com/fer-aguirre/b6bdcf59ecae41f84765f72114de9fd1/raw/b4e029fe236c1f38275621686429b2c7aaa3d18b/embeddings.csv"

df_emb = pd.read_csv(url, index_col=0)

# Source documents. Assumes row order lines up with df_emb so that the
# corpus ids returned by semantic_search index into df — TODO confirm.
df = pd.read_csv('./foia_sample.csv')

dataset = Dataset.from_pandas(df_emb)

# Corpus embeddings as a float tensor for semantic_search.
# NOTE(review): this round-trips through a HF Dataset before converting;
# verify whether torch.from_numpy(df_emb.to_numpy()) would be equivalent —
# Dataset.from_pandas can treat a non-range index specially.
dataset_embeddings = torch.from_numpy(dataset.to_pandas().to_numpy()).to(torch.float)

def _recommend_dependencies(query, top_k=3):
    """Return the unique dependency names for the *top_k* semantic matches of *query*.

    Encodes *query* with the module-level SentenceTransformer, searches the
    precomputed ``dataset_embeddings``, and maps each hit's corpus id back to
    a row of ``df``.  The dependency name is the text before the first "/"
    in the row's first column.

    Fixes vs. the previous inline code: the encode/search/dedupe logic was
    duplicated in both UI branches, and deduplication compared the *full*
    split list, so the same displayed name could still appear twice; we now
    dedupe on the name that is actually shown.
    """
    query_embeddings = torch.FloatTensor(model.encode(query))
    hits = semantic_search(query_embeddings, dataset_embeddings, top_k=top_k)
    recommendations = []
    for hit in hits[0]:
        # df.iloc[row] is the row as a Series; .str.split splits each string
        # cell on "/", [0] takes the first column's parts, and the final [0]
        # is the leading segment (the dependency name).
        dependency = df.iloc[hit['corpus_id']].str.split(pat="/")[0][0]
        if dependency not in recommendations:
            recommendations.append(dependency)
    return recommendations


st.markdown("**Inserta una solicitud de información para generar recomendaciones de dependencias**")

if request := st.text_area("", value=""):
    st.markdown('Recomendaciones:')
    for rec in _recommend_dependencies(request):
        st.markdown(f':green[{rec}]')

st.markdown("""---""")

if st.button('Genera un ejemplo random'):
    # Pick one random request from the corpus and show the recommendations
    # next to the dependency the request actually belongs to.
    test_example = df['combined'].sample(n=1)
    idx = test_example.index[0]

    # Use .loc: ``idx`` is an index *label* from sample(); the previous .iloc
    # lookup only coincides with labels for the default RangeIndex.
    original = df.loc[idx].str.split(pat="/")[0]

    request = test_example.to_string(index=False)
    st.text(f'{idx}, {request}')

    st.markdown('Recomendaciones:')
    for rec in _recommend_dependencies(request):
        st.markdown(f':green[{rec}]')
    st.markdown('Dependencia original:')
    st.markdown(f':red[{original[0]}]')