import streamlit as st
import pandas as pd
from pathlib import Path
import requests
import base64
from requests.auth import HTTPBasicAuth
import torch
st.set_page_config(layout="wide")
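
# Load the tokenizer and encoder-decoder model once; st.cache keeps them across
# Streamlit reruns, and allow_output_mutation skips re-hashing the mutable model.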
@st.cache(allow_output_mutation=True)
def load_model():
    from transformers import (
        EncoderDecoderModel,
        AutoTokenizer,
    )
    PRETRAINED = "raynardj/wenyanwen-ancient-translate-to-modern"
    tokenizer = AutoTokenizer.from_pretrained(PRETRAINED)
    model = EncoderDecoderModel.from_pretrained(PRETRAINED)
    return tokenizer, model
tokenizer, model = load_model()
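
# Translate one classical-Chinese passage into modern Chinese via beam search.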
def inference(text):
    print(f"from: {text}")
    tk_kwargs = dict(
        truncation=True,
        max_length=168,
        padding="max_length",
        return_tensors="pt",
    )
    inputs = tokenizer([text], **tk_kwargs)
    with torch.no_grad():
        new = tokenizer.batch_decode(
            model.generate(
                inputs.input_ids,
                attention_mask=inputs.attention_mask,
                num_beams=3,
                max_length=256,
                bos_token_id=101,  # [CLS] id in the BERT vocabulary
                eos_token_id=tokenizer.sep_token_id,
                pad_token_id=tokenizer.pad_token_id,
            ),
            skip_special_tokens=True,
        )[0].replace(" ", "")  # drop the spaces the tokenizer puts between characters
    print(f"to: {new}")
    return new
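
# meta.csv indexes the corpus mirrored from github.com/garychowcmu/daizhigev20:
# one row per file, with its path, git blob hash (obj_hash) and size (fsize).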
@st.cache
def get_file_df():
    file_df = pd.read_csv("meta.csv")
    return file_df
file_df = get_file_df()
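
# Sidebar: the app title 【隨無涯】 and links to the self-trained 🤗 model, its
# training code, and the Daizhige corpus the texts are drawn from.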
st.sidebar.title("【隨無涯】")
st.sidebar.markdown("""
* 朕自庖[🤗 模型](https://huggingface.co/raynardj/wenyanwen-ancient-translate-to-modern), [⭐️ 訓習處](https://github.com/raynardj/yuan)
* 📚 充棟汗牛,取自[殆知閣](http://www.daizhige.org/),[github api](https://github.com/garychowcmu/daizhigev20)
""")
c2 = st.container()
c2.write("The entirety of ancient Chinese literature, with a modern translator at your side.")
st.markdown("""---""")
c = st.container()
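
# GitHub credentials kept in Streamlit secrets; they authenticate the blob API
# requests in fetch_file below.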
USER_ID = st.secrets["USER_ID"]
SECRET = st.secrets["SECRET"]
@st.cache
def get_maps():
    file_obj_hash_map = dict(file_df[["filepath", "obj_hash"]].values)
    file_size_map = dict(file_df[["filepath", "fsize"]].values)
    return file_obj_hash_map, file_size_map
file_obj_hash_map, file_size_map = get_maps()
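
# Format a byte count as B/KB/MB for the loading message.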
def show_file_size(size: int):
    if size < 1024:
        return f"{size} B"
    elif size < 1024 * 1024:
        return f"{size // 1024} KB"
    else:
        return f"{size // (1024 * 1024)} MB"
@st.cache(max_entries=100, allow_output_mutation=True)
def fetch_file(path):
    # reading from the local path first
    if (Path("data") / path).exists():
        with open(Path("data") / path, "r") as f:
            return f.read()
    # otherwise read from the github api
    obj_hash = file_obj_hash_map[path]
    auth = HTTPBasicAuth(USER_ID, SECRET)
    url = f"https://api.github.com/repos/garychowcmu/daizhigev20/git/blobs/{obj_hash}"
    print(f"requesting {url}")
    r = requests.get(url, auth=auth)
    if r.status_code == 200:
        data = r.json()
        # the blob API returns the file content base64-encoded
        content = base64.b64decode(data["content"]).decode("utf-8")
        return content
    else:
        r.raise_for_status()
@st.cache(allow_output_mutation=True, max_entries=100)
def fetch_from_df(sub_paths=()):
    # sub_paths is the sequence of folder names chosen so far; meta.csv keeps
    # each path component in its own column (col_0, col_1, ...)
    sub_df = file_df.copy()
    for idx, step in enumerate(sub_paths):
        sub_df.query(f"col_{idx} == '{step}'", inplace=True)
    if len(sub_df) == 0:
        return None
    return list(sub_df[f"col_{len(sub_paths)}"].unique())
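
# Show a fetched text in the main container as a scrollable block.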
def show_filepath(filepath: str):
    text = fetch_file(filepath)
    c.markdown(
        f"""<pre style='white-space:pre-wrap;max-height:300px;overflow-y:auto'>{text}</pre>""",
        unsafe_allow_html=True,
    )
if st.sidebar.selectbox(
    label="何以尋跡 How to search",
    options=["以類尋書 category", "書名求書 search"],
) == "以類尋書 category":
    if "pathway" not in st.session_state:
        st.session_state.pathway = []

    # breadcrumb of the current position in the folder tree
    path_text = st.sidebar.text("/".join(st.session_state.pathway))

    def reset_path():
        st.session_state.pathway = []
        path_text.text(st.session_state.pathway)

    if st.sidebar.button("還至初錄(back to root)"):
        reset_path()

    def display_tree():
        # offer the entries one level below the current path
        sublist = fetch_from_df(st.session_state.pathway)
        dropdown = st.sidebar.selectbox("【擇書 choose】", options=sublist)
        with st.spinner("書非借不能讀也..."):
            st.session_state.pathway.append(dropdown)
            if dropdown.endswith(".txt"):
                filepath = "/".join(st.session_state.pathway)
                file_size = file_size_map[filepath]
                with st.spinner(f"Load 載文:{filepath},({show_file_size(file_size)})"):
                    # if the file is too large, link to the source site instead of loading it
                    if file_size > 3 * 1024 * 1024:
                        print(f"skip {filepath}")
                        urlpath = filepath.replace(".txt", ".html")
                        dzg = f"http://www.daizhige.org/{urlpath}"
                        st.markdown(
                            f"File too big 其文碩而難載,不能為之,[往 殆知閣]({dzg}), 或擇他書")
                        reset_path()
                        return None
                    path_text.text(filepath)
                    print(f"read {filepath}")
                    text = fetch_file(filepath)
                    # render the text in a scrollable block with a capped height
                    c.markdown(
                        f"""<pre style='white-space:pre-wrap;max-height:300px;overflow-y:auto'>{text}</pre>""",
                        unsafe_allow_html=True,
                    )
                    reset_path()
            else:
                # descend into the chosen folder and recurse
                path_text.text("/".join(st.session_state.pathway))
                display_tree()

    display_tree()
else:
    def search_kw():
        # substring match of the keyword against the corpus file paths
        result = file_df[file_df.filepath.str.contains(
            st.session_state.kw)].reset_index(drop=True)
        if len(result) == 0:
            st.sidebar.write(f"尋之不得:{st.session_state.kw}")
        else:
            filepath = st.sidebar.selectbox(
                "選一書名", options=list(result.head(15).filepath))
            show_filepath(filepath)

    def loading_with_search():
        kw = st.sidebar.text_input("書名求書 Search", value="楞伽经")
        st.session_state.kw = kw
        search_kw()

    loading_with_search()
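
# The translate button lives in the top container (c2); inputs longer than 168
# characters are rejected because the tokenizer truncates at that length.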
def translate_text():
    if c2.button("【曉文達義 Translate】"):
        if cc:
            if len(cc) > 168:
                c2.write(
                    "句甚長 不得過百又六十八字 Sentence too long, should be less than 168 characters")
            else:
                c2.markdown(f"""```{inference(cc)}```""")
        else:
            c2.write("【入難曉之文字 Please input sentence for translating】")


cc = c2.text_area("【入難曉之文字 Input sentence】", height=150)
translate_text()