Spaces:
Sleeping
Sleeping
File size: 1,452 Bytes
28621b1 4ce10ee 28621b1 7b04950 28621b1 ccf7fbf 28621b1 7b04950 28621b1 6415b25 28621b1 1093e36 28621b1 3897f24 6415b25 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
import streamlit as st
import bm25s
from operator import itemgetter
import os
import re
import pandas as pd
@st.cache_resource
def load_data():
    """Build the BM25 retriever over the documents in ``cleaned_list.csv``.

    The CSV is read without a header row; its column is named ``document``
    and used as the corpus. The corpus is passed to ``bm25s.BM25`` and the
    index is built from its tokenized form.

    The result is cached with ``st.cache_resource`` rather than
    ``st.cache_data`` because the return value is a retriever object, not
    plain data: it should be built once and reused instead of being
    serialized and copied on each call.

    Returns:
        The indexed ``bm25s.BM25`` retriever.
    """
    df = pd.read_csv("cleaned_list.csv", header=None)
    df.columns = ['document']
    corpus = df['document'].to_list()
    retriever = bm25s.BM25(corpus=corpus)
    retriever.index(bm25s.tokenize(corpus))
    return retriever
def extract_hscode(text):
    """Return the digits following the first ``hs_code:`` marker in *text*.

    Optional whitespace between the colon and the digits is allowed.
    Returns ``None`` when no such marker followed by digits is present.
    """
    found = re.search(r'hs_code:\s*(\d+)', text)
    return found.group(1) if found else None
@st.cache_data
def _load_hscode_table():
    """Read ``hscode_main.csv`` and return it with ``hs_code`` as strings.

    Every code is converted to ``str``; a code that is exactly 5 characters
    long gets a single leading ``'0'``. Codes of any other length are left
    unchanged.
    NOTE(review): presumably this restores a leading zero lost when the
    column was parsed as a number -- confirm against the CSV.
    """
    table = pd.read_csv("hscode_main.csv")
    table['hs_code'] = [
        '0' + code if len(code) == 5 else code
        for code in map(str, table['hs_code'].to_list())
    ]
    return table


# Streamlit re-executes the script on every interaction; the cached loader
# avoids re-reading and re-normalizing the CSV each time.
df2 = _load_hscode_table()
# Build the retriever once per session: a missing key and a stored ``None``
# are both treated as "not loaded yet".
if 'retriever' not in st.session_state or st.session_state.retriever is None:
    st.session_state.retriever = load_data()
# Prompt for a free-text description and show the five best-matching HS codes.
sentence = st.text_input("please enter description:")
# Skip empty and whitespace-only input: there is nothing to search for.
if sentence.strip():
    results, _ = st.session_state.retriever.retrieve(
        bm25s.tokenize(sentence), k=5
    )
    # A single query was submitted, so only the first row of results is used.
    hscodes = [extract_hscode(item) for item in results[0]]
    for code in hscodes:
        if code is None:
            # The retrieved document carried no "hs_code:" marker.
            continue
        descriptions = df2.loc[df2['hs_code'] == code, 'full_description']
        if descriptions.empty:
            # Guard against codes absent from the lookup table; indexing
            # with .iloc[0] would raise IndexError here.
            st.warning(f"No description found for HS code {code}.")
            continue
        answer = descriptions.iloc[0]
        st.write("Hscode:", code)
        st.write("answer:", answer.lower())