File size: 1,551 Bytes
d82542d
45d6b11
d82542d
2014880
d82542d
3a29d4a
2014880
45d6b11
2014880
3a29d4a
 
45d6b11
4829b97
45d6b11
2014880
d82542d
45d6b11
2014880
 
 
d69bc63
45d6b11
 
2014880
 
 
 
 
 
 
 
 
45d6b11
 
d82542d
 
45d6b11
 
 
2014880
 
d82542d
 
 
d69bc63
d82542d
 
 
 
 
 
 
 
 
 
2014880
 
d82542d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import argparse
import logging

import datasets
import gradio as gr
import sentence_transformers

# Suppress every log record at CRITICAL level and below (i.e. all standard
# levels) so library chatter does not clutter the app output.
logging.disable(logging.CRITICAL)

# Resources used by the search backend.
MODEL_NAME = "dangvantuan/sentence-camembert-large"
DATASET_FILES = ["./dataset.json"]
INDEX_NAME = "embeddings"
INDEX_FILE = "index.faiss"

# Sentence encoder used to embed queries; pinned to CPU.
model = sentence_transformers.SentenceTransformer(MODEL_NAME, device="cpu")

# Load the JSON records as a single "train" split, then attach the FAISS index
# stored on disk under the index name that `search` queries.
dataset = datasets.load_dataset(
    "json",
    data_files=DATASET_FILES,
    split="train",
)
dataset.load_faiss_index(INDEX_NAME, INDEX_FILE)

def search(query, k):
    """Return the ``k`` dataset entries nearest to ``query``.

    The query is embedded with the module-level ``model`` and looked up in the
    "embeddings" index of the module-level ``dataset``.

    Args:
        query: Text to search for; passed as-is to ``model.encode``.
        k: Number of results to retrieve. Coerced with ``int`` before being
            handed to the index lookup, so numeric values such as ``3.0``
            are accepted.

    Returns:
        A list with one dict per retrieved example, each holding the keys
        ``"title"``, ``"transcript"`` (the text prefixed with its
        ``[start ====> end]`` span) and ``"link"``.
    """
    query_embedding = model.encode(query)
    # The first element of the returned pair is discarded; only the retrieved
    # examples (a mapping of column name -> list of values) are used.
    _, retrieved_examples = dataset.get_nearest_examples(
        "embeddings",
        query_embedding,
        k=int(k),
    )
    # Walk the column lists in lockstep to rebuild one record per hit.
    return [
        {
            "title": title,
            "transcript": f"[{start} ====> {end}] {text}",
            "link": url,
        }
        for text, start, end, title, url in zip(
            retrieved_examples["text"],
            retrieved_examples["start"],
            retrieved_examples["end"],
            retrieved_examples["title"],
            retrieved_examples["url"],
        )
    ]

# Web UI: a text query plus the number of results to fetch.
#
# `search` returns a single list of result dicts (one per hit), so the
# interface exposes exactly one JSON output. Declaring three separate
# Textbox outputs would make Gradio spread the returned list positionally
# across them, which only lines up when K happens to be 3 and even then
# shows whole records under the wrong labels.
iface = gr.Interface(
    search,
    inputs=[
        gr.inputs.Textbox(label="Query"),
        gr.inputs.Number(label="K", default=3),
    ],
    outputs=gr.outputs.JSON(label="Results"),
    title="Camembert and Faiss-powered Search Engine",
    description="Search through a dataset using Camembert and Faiss",
    theme="light",
    layout="vertical",
)

# Start the Gradio server.
iface.launch()