import json
from typing import List, Tuple

from langchain_community.retrievers import BM25Retriever
from langchain_core.documents import Document
import streamlit as st
from sudachipy import dictionary, tokenizer
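
# NOTE: BM25Retriever relies on the rank_bm25 package, and dict="full" below
# assumes the sudachidict_full dictionary is installed alongside sudachipy.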


def generate_word_ngrams(
    text: str, min_len: int, max_len: int, binary: bool = False
) -> List[Tuple[str, ...]]:
    """
    Tokenize the input text into words and generate n-grams of specified lengths.

    Args:
        text (str): The input string.
        min_len (int): The minimum length of the n-grams.
        max_len (int): The maximum length of the n-grams.
        binary (bool, optional): If True, remove duplicates. Defaults to False.

    Returns:
        List[Tuple[str, ...]]: A list of n-grams as tuples of words.
    """
    # Tokenize with Sudachi (full dictionary); SplitMode.A yields the shortest word units.
    tokenizer_obj = dictionary.Dictionary(dict="full").create()
    mode = tokenizer.Tokenizer.SplitMode.A
    tokens = tokenizer_obj.tokenize(text, mode)
    words = [token.surface() for token in tokens]

    ngrams: List[Tuple[str, ...]] = []

    for n in range(min_len, max_len + 1):
        for k in range(len(words) - n + 1):
            ngram = tuple(words[k:k + n])
            ngrams.append(ngram)

    if binary:
        ngrams = list(set(ngrams))  # Remove duplicates

    return ngrams
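

# Illustrative example (the exact tokenization depends on the installed
# Sudachi dictionary, so treat the tokens below as an assumption):
#   generate_word_ngrams("自然言語処理", 1, 2)
#   -> [('自然',), ('言語',), ('処理',), ('自然', '言語'), ('言語', '処理')]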


def preprocess_func(text: str) -> List[str]:
    ngrams = generate_word_ngrams(text, 1, 1, True)
    return [' '.join(ngram) for ngram in ngrams]
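

# Illustrative example of the BM25 preprocessing step (output order is not
# guaranteed because binary=True deduplicates via a set):
#   preprocess_func("自然言語処理") -> ['自然', '言語', '処理']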


def load_docs_from_json(json_path: str) -> List[Document]:
    """Load paper records from a JSON file and wrap them as LangChain Documents."""
    with open(json_path, encoding="utf-8") as f:
        papers = json.load(f)

    docs = []
    for paper in papers:
        page_content = f"Title: {paper['ptitle']}\n\nAbstract: {paper['abstract']}"
        doc = Document(
            page_content=page_content,
            metadata={
                'session_id': paper['session_id'],
                'session_title': paper['session_title'],
                'session_info': paper['session_info'],
                'id': paper['pid'],
                'title': paper['ptitle'],
                'pdf_link': paper['pdf_link'],
                'authors': paper['pauthors'],
            },
        )
        docs.append(doc)
    
    return docs


# init: load the papers and build a BM25 retriever over them
json_path = "nlp2024_papers.json"
docs = load_docs_from_json(json_path)
retriever = BM25Retriever.from_documents(docs, preprocess_func=preprocess_func)
retriever.k = 10  # number of documents to return per query
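
# Illustrative usage: retriever.invoke("synthetic data generation with LLMs")
# returns a list of up to retriever.k Documents ranked by BM25 relevance.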

# streamlit
st.title("NLP2024 Papers Search")
st.markdown(f"Search papers from [NLP2024](https://www.anlp.jp/nlp2024/).")
st.markdown(f"Nmber of documents: `{len(docs)}`.")
st.markdown("This app uses [BM25](https://en.wikipedia.org/wiki/Okapi_BM25), allowing you to search not only with keywords like \"machine learning\" \n  but also with documents like \"How to generate synthetic data using LLM.\"")

prompt = st.chat_input("Search anything...")

if prompt:
    results = retriever.invoke(prompt)
    
    st.markdown(f"Top `{len(results)}` related papers")

    for result in results:
        with st.expander(label=result.metadata['title'], expanded=False):
            for k in result.metadata:
                st.write(f"{k}: {result.metadata[k]}")
            st.divider()
            st.markdown(result.page_content)