File size: 2,132 Bytes
f3a1940
 
f17e764
e4ce8fe
f3a1940
e4ce8fe
f17e764
e4ce8fe
 
 
 
f17e764
b8028b3
 
f17e764
 
 
 
 
5ca912a
127b334
 
5ca912a
f17e764
2a6aea4
 
f17e764
dda6e4a
 
f17e764
 
 
 
 
 
 
 
 
 
 
 
fa25391
 
5ca912a
e0bcbc8
 
f17e764
5ca912a
f17e764
d7f99ce
5ca912a
f17e764
6cad12f
f3a1940
6cad12f
f3a1940
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix, lil_matrix
import streamlit as st

# multi_project_matching
def calc_matches(filtered_df: pd.DataFrame, project_df: pd.DataFrame,
                 similarity_matrix, top_x: int):
    """Return the ``top_x`` most similar (filtered_df row, project_df row) pairs.

    The index labels of both dataframes are used as positional row/column
    indices into ``similarity_matrix`` to cut out the relevant submatrix.
    Only entries that are explicitly stored in the sparse submatrix are
    candidates, so fewer than ``top_x`` pairs are returned when the submatrix
    stores fewer values.

    Args:
        filtered_df: Dataframe whose index labels select rows of
            ``similarity_matrix``.
        project_df: Dataframe whose index labels select columns of
            ``similarity_matrix``.
        similarity_matrix: 2-D similarity matrix; converted to
            ``csr_matrix`` when it is not one already.
        top_x: Maximum number of pairs to return. Values <= 0 yield empty
            results.

    Returns:
        A tuple ``(p1_df, p2_df)``. Row ``k`` of ``p1_df`` (taken from
        ``filtered_df``) is matched with row ``k`` of ``p2_df`` (taken from
        ``project_df``); both carry the pair's score in a ``similarity``
        column. Pairs are ordered by ascending similarity, i.e. the best
        match is last.
    """
    st.write(filtered_df.shape)
    st.write(project_df.shape)
    st.write(similarity_matrix.shape)

    # Ensure the matrix is in a suitable format for manipulation
    if not isinstance(similarity_matrix, csr_matrix):
        similarity_matrix = csr_matrix(similarity_matrix)

    # Index labels double as positions in the similarity matrix
    filtered_df_indices = filtered_df.index.to_list()
    project_df_indices = project_df.index.to_list()

    # Select submatrix based on indices from both dataframes
    match_matrix = similarity_matrix[np.ix_(filtered_df_indices, project_df_indices)]

    st.write(match_matrix.shape)

    # COO exposes aligned (row, col, data) triplets. Positions obtained from
    # sorting ``data`` index the stored entries, NOT the flattened dense
    # matrix, so they must be resolved through ``row``/``col`` rather than
    # through ``np.unravel_index``.
    coo = match_matrix.tocoo()

    # Clamp the request to what is available. An explicit start offset avoids
    # the ``[-0:]`` pitfall, which would select every entry for top_x == 0.
    n_top = min(max(int(top_x), 0), coo.nnz)
    top_positions = np.argsort(coo.data, kind="stable")[coo.nnz - n_top:]

    top_rows = coo.row[top_positions]
    top_cols = coo.col[top_positions]

    # Get the corresponding similarity values
    top_values = coo.data[top_positions]

    # Translate submatrix positions back to dataframe index labels
    top_filtered_df_indices = [filtered_df_indices[i] for i in top_rows]

    st.write(top_filtered_df_indices)

    # Create resulting dataframes with top matches and their similarity scores.
    # Submatrix row/column i corresponds to the i-th row of the respective
    # dataframe, so positional selection returns exactly one row per match
    # even if index labels repeat.
    p1_df = filtered_df.iloc[top_rows].copy()
    p1_df['similarity'] = top_values

    p2_df = project_df.iloc[top_cols].copy()
    p2_df['similarity'] = top_values
    print("finished calc matches")

    return p1_df, p2_df