File size: 3,434 Bytes
d16656d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import gradio as gr

# Step 1: Load the dataset. The path is relative to the working directory, so
# the CSV file must be uploaded alongside the app (e.g. to Hugging Face).
df = pd.read_csv('./all_combine_main.csv')

# Step 2: Choose the column whose text is embedded and coerce every value to
# a plain string so the encoder always receives text.
text_column = 'detail_โครงการ'
texts = list(df[text_column].astype(str))

# Keep each complete row as a dict so that a search hit can be mapped back to
# all of its columns, not only the embedded text.
rows = df.to_dict(orient='records')

# Step 3: Load the pre-trained multilingual sentence-embedding model.
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# Step 4: Embed every text once at start-up; searches reuse this result.
embeddings = model.encode(texts, show_progress_bar=True)

# Optional: to avoid recomputing on every start, the embeddings can be cached
# on disk with np.save('embeddings.npy', embeddings) and restored later with
# np.load('embeddings.npy') instead of calling model.encode again.

# Step 5: Define the semantic search function
def semantic_search(query, embeddings, texts, rows, top_n=5):
    """Return the rows whose embeddings are most similar to ``query``.

    Args:
        query: Search text. It is encoded with the module-level ``model``.
        embeddings: Pre-computed embeddings that the query is compared
            against; results index ``rows`` by position in this collection.
        texts: Not used by the search itself; kept so existing callers that
            pass it positionally keep working.
        rows: Records indexed by the same positions as ``embeddings``.
        top_n: Maximum number of results. ``None`` returns every row; zero or
            a negative value returns no results.

    Returns:
        A list of ``(row, similarity)`` tuples ordered from most to least
        similar. The list is empty when there is nothing to search or when
        ``top_n`` is not positive.
    """
    # A negative top_n would otherwise slice "all but the last |top_n|" hits,
    # which is never what a caller means; treat non-positive values as "none".
    if top_n is not None and top_n <= 0:
        return []

    # cosine_similarity raises on an empty corpus, so answer directly.
    if len(embeddings) == 0:
        return []

    # Generate embedding for the query
    query_embedding = model.encode([query])

    # Compute cosine similarities between the query and every stored embedding
    similarities = cosine_similarity(query_embedding, embeddings)[0]

    # Indices sorted by descending similarity, truncated to top_n
    top_indices = np.argsort(similarities)[::-1][:top_n]

    # Pair each selected row with its similarity score
    return [(rows[idx], similarities[idx]) for idx in top_indices]

# Step 6: Create the Gradio interface
def _format_result(row, score, columns_to_display):
    """Render one search hit (row dict plus similarity score) as Markdown."""
    parts = [f"**Score:** {score:.4f}\n\n"]

    # Only show the specified columns, in the row's own order, and skip NaNs
    for key, value in row.items():
        if key in columns_to_display and not pd.isna(value):
            parts.append(f"**{key}:** {value}\n\n")

    # When the project name or project budget is missing, fall back to
    # showing the raw detail text so the hit is still understandable.
    if pd.isna(row.get('ชื่อโครงการ')) or pd.isna(row.get('งบประมาณ_โครงการ')):
        parts.append(f"**detail_โครงการ:** {row.get('detail_โครงการ')}\n\n")

    parts.append("---\n\n")
    return "".join(parts)


def search_interface(query):
    """Run a semantic search for ``query`` and format the hits as Markdown.

    Args:
        query: Text entered by the user.

    Returns:
        A Markdown string with one section per hit (score, the selected
        columns, and a separator line), or a short prompt message when the
        query is empty or only whitespace.
    """
    # Embedding a blank query only yields arbitrary matches; ask for input.
    if query is None or not str(query).strip():
        return "Please enter a search query."

    # Perform the search against the module-level corpus
    results = semantic_search(query, embeddings, texts, rows)

    # Specify the columns to display
    columns_to_display = ['ชื่อกระทรวง', 'งบประมาณปี68', 'ชื่อสำนักงาน', 'งบประมาณปี68_สำนักงาน', 'ชื่อโครงการ', 'งบประมาณ_โครงการ']

    # Build the output in one pass instead of repeated string concatenation
    return "".join(
        _format_result(row, score, columns_to_display) for row, score in results
    )

# Two-line free-text box for the user's query.
_query_box = gr.Textbox(lines=2, placeholder='Enter your search query here...')

# Wire the search callback into a text-in / Markdown-out Gradio app.
iface = gr.Interface(
    title="Semantic Search Application",
    description="Enter a search query to find the most relevant entries from the dataset.",
    fn=search_interface,
    inputs=_query_box,
    outputs="markdown",
)

# Start the web UI only when executed as a script; share=True is passed so
# Gradio also requests a shareable link.
if __name__ == "__main__":
    iface.launch(share=True)