Kato-DB / app.py
konbraphat51
base
eb24142
raw
history blame
3.94 kB
import streamlit as st
import pandas as pd
import pathlib
import whoosh
import whoosh.index
import whoosh.query
import os
from datetime import date as Date
import re
# Root data directory, resolved relative to the directory containing this file.
DATA_FOLDER = pathlib.Path(__file__).parent / "Data"
# Directory listed by Searcher.get_content when looking for "<index>-<number>.csv" files.
RAW_FOLDER = DATA_FOLDER / "Transcription_raw"
# Directory whose entries starting with "sub" are opened as Whoosh indexes.
INDEX_FOLDER = DATA_FOLDER / "Transcription_index"
class Searcher:
    """Search the transcription indexes and load raw transcriptions.

    Attributes set in ``__init__``:
        ix: ``MultiIndexSearcher`` over every ``sub*`` index found under
            ``INDEX_FOLDER``.
        df_video_links: table read from ``video_links.csv`` (first CSV
            column used as the DataFrame index).
    """

    def __init__(self):
        self.ix = self.make_total_ix()
        self.df_video_links = self.get_video_links()

    def make_total_ix(self):
        """Open every ``sub*`` directory in ``INDEX_FOLDER`` as a Whoosh index.

        Returns:
            A ``MultiIndexSearcher`` wrapping the opened indexes, in the
            order ``os.listdir`` reported them.
        """
        sub_indexes = [
            whoosh.index.open_dir(INDEX_FOLDER / name)
            for name in os.listdir(INDEX_FOLDER)
            if name.startswith("sub")
        ]
        return MultiIndexSearcher(sub_indexes)

    def search(self, date_start, date_end, **kwargs):
        """Return hits whose date lies within ``[date_start, date_end]``.

        Args:
            date_start: Earliest ``datetime.date`` to keep (inclusive).
            date_end: Latest ``datetime.date`` to keep (inclusive).
            **kwargs: Forwarded unchanged to ``self.ix.search``.

        Returns:
            A list of ``(index, date, title)`` tuples ordered by parsed
            date (ties broken by the remaining tuple fields). ``date`` is
            the original string taken from ``df_video_links``.
        """
        rows = []
        for hit_title in self.ix.search(**kwargs):
            # The text before the first "m" of a hit title is used as the
            # positional row number into the video-links table.
            index = int(hit_title.split("m")[0])
            row = self.df_video_links.iloc[index]
            date = row["date"]
            # The date string is split on "/" and its integer parts are
            # passed positionally to datetime.date.
            date_datetime = Date(*map(int, date.split("/")))
            if date_start <= date_datetime <= date_end:
                # Parsed date goes first so that sorting orders by date.
                rows.append((date_datetime, index, date, row["title"]))
        rows.sort()
        # Drop the parsed date that was only needed as the sort key.
        return [(index, date, title) for _, index, date, title in rows]

    def get_video_links(self):
        """Read ``video_links.csv`` from ``DATA_FOLDER`` into a DataFrame."""
        return pd.read_csv(DATA_FOLDER / "video_links.csv", index_col=0)

    @staticmethod
    def _select_transcription_file(names, index):
        """Choose the ``<index>-<number>.csv`` name with the largest number.

        Args:
            names: Iterable of candidate file names.
            index: Management number the file name must start with.

        Returns:
            The matching name whose ``<number>`` is numerically largest,
            or ``None`` when nothing matches.
        """
        # Escape the index and the dot, and require a full match, so that
        # names such as "1-5xcsv" or "1-7.csv.bak" are rejected.
        pattern = re.compile(r"{}-(\d+)\.csv".format(re.escape(str(index))))
        candidates = []
        for name in names:
            matched = pattern.fullmatch(name)
            if matched:
                candidates.append((int(matched.group(1)), name))
        if not candidates:
            return None
        # Compare numerically: a plain string sort would rank "1-9.csv"
        # above "1-10.csv".
        return max(candidates)[1]

    def get_content(self, index):
        """Load the transcription CSV for ``index`` from ``RAW_FOLDER``.

        Files are named ``<index>-<number>.csv``; when several exist for
        the same index, the one with the largest number is read.

        Args:
            index: Management number of the video.

        Returns:
            The selected CSV as a ``pandas.DataFrame``.

        Raises:
            FileNotFoundError: If no file for ``index`` exists.
        """
        best = self._select_transcription_file(os.listdir(RAW_FOLDER), index)
        if best is None:
            raise FileNotFoundError(
                "no transcription file for index {} in {}".format(index, RAW_FOLDER)
            )
        return pd.read_csv(RAW_FOLDER / best)
class MultiIndexSearcher:
    """Run one query against several indexes and merge the hit titles."""

    def __init__(self, ixes):
        # Index objects; each must provide searcher() usable as a context manager.
        self.ixes = ixes

    def search(self, **kwargs):
        """Search every index and collect the ``"title"`` field of each hit.

        Args:
            **kwargs: Passed through to each sub-searcher's ``search``;
                ``limit=None`` is always added by this method.

        Returns:
            A list of titles, grouped by index in the order of ``self.ixes``.
        """
        collected = []
        for sub_index in self.ixes:
            with sub_index.searcher() as handle:
                # Read the hits while the searcher is still open.
                collected.extend(
                    hit["title"] for hit in handle.search(**kwargs, limit=None)
                )
        return collected
# Module-level Searcher used by main(). Constructing it opens the "sub*"
# indexes and reads video_links.csv (see Searcher.__init__).
searcher = Searcher()
def main():
    """Render the search page: inputs, result table, and transcription view.

    Reads a keyword string and a start/end date from Streamlit widgets,
    queries the module-level ``searcher``, shows the matching rows, and
    displays the transcription of the row selected by management number.
    """
    st.title("KATO DB")

    keyword = st.text_input(
        "検索したいキーワードを入力して、Enterを押してください\n"
        "空欄だと全文書表示します。"
    )

    date_start = st.date_input(
        "検索したい開始日付を入力してください",
        Date(2009, 1, 1)
    )

    date_end = st.date_input(
        "検索したい終了日付を入力してください",
        Date(2050, 12, 31)
    )

    # Build the query. split() without arguments discards all whitespace,
    # so input consisting only of spaces yields an empty list; treat that
    # the same as a blank field instead of building an empty AND query.
    keyword_list = keyword.split()
    if keyword_list:
        # AND search: every word must appear in the "content" field.
        query = whoosh.query.And(
            [whoosh.query.Term("content", word) for word in keyword_list]
        )
    else:
        query = whoosh.query.Every()

    contents = searcher.search(q=query, date_start=date_start, date_end=date_end)

    st.write("該当件数:{}件".format(len(contents)))

    results = pd.DataFrame(contents, columns=["管理番号", "放送日", "動画タイトル"])
    st.dataframe(results, hide_index=True)

    selected_index = st.selectbox("管理番号を選択して書き起こしを表示", results["管理番号"])

    # Guard against a None selection before loading a transcription.
    if selected_index is not None:
        df_transcription = searcher.get_content(selected_index)
        st.dataframe(df_transcription, width=1000)
# Entry point: call main() only when this file is run as the main module.
if __name__ == "__main__":
    main()