HAOUARI Noureddine
committed on
Commit
·
d0341c5
1
Parent(s):
11fb0d6
better version 02
Browse files
app.py
CHANGED
@@ -11,8 +11,6 @@ encoding_anthropic = client.get_tokenizer()
|
|
11 |
|
12 |
# Model choice and max tokens input
|
13 |
model_choice = st.sidebar.selectbox("Choose a Model", ["OpenAI", "Anthropic"])
|
14 |
-
max_tokens = st.sidebar.number_input(
|
15 |
-
"Max number of tokens per chunk", min_value=100, value=8000)
|
16 |
|
17 |
|
18 |
def clean_text_content(text):
|
@@ -61,30 +59,60 @@ def pdf_to_text(pdf_files_data, file_names):
|
|
61 |
return results
|
62 |
|
63 |
|
64 |
-
st.title("PDF
|
65 |
-
st.markdown(
|
66 |
-
"Upload PDF files and get their content in text format splitted based on the max tokens.")
|
67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
|
69 |
-
uploaded_files = st.sidebar.file_uploader(
|
70 |
-
"Upload PDF files", type="pdf", accept_multiple_files=True)
|
71 |
-
|
72 |
-
clean_text = st.sidebar.checkbox("Clean text before encoding and splitting?")
|
73 |
-
|
74 |
-
# Check if the text is not already in session_state
|
75 |
if "text_content" not in st.session_state:
|
76 |
st.session_state.text_content = ""
|
77 |
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
if st.
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
|
89 |
if clean_text:
|
90 |
st.session_state.text_content = clean_text_content(
|
@@ -95,16 +123,8 @@ if uploaded_files:
|
|
95 |
chunks = [encoding_openAI.decode(chunk_tokens) if model_choice == "OpenAI" else encoding_anthropic.decode(
|
96 |
chunk_tokens) for chunk_tokens in chunks_generator]
|
97 |
|
98 |
-
# Display each chunk in a separate text area
|
99 |
for i, chunk in enumerate(chunks, 1):
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
if model_choice == "OpenAI":
|
105 |
-
num_tokens = len(encoding_openAI.encode(st.session_state.text_content))
|
106 |
-
st.write(f"Total number of tokens (OpenAI): {num_tokens}")
|
107 |
-
else:
|
108 |
-
tokens_count = len(encoding_anthropic.encode(
|
109 |
-
st.session_state.text_content))
|
110 |
-
st.write(f"Total number of tokens (Anthropic): {tokens_count}")
|
|
|
11 |
|
12 |
# Model choice and max tokens input
|
13 |
model_choice = st.sidebar.selectbox("Choose a Model", ["OpenAI", "Anthropic"])
|
|
|
|
|
14 |
|
15 |
|
16 |
def clean_text_content(text):
|
|
|
59 |
return results
|
60 |
|
61 |
|
62 |
+
st.title("PDF Utility")
|
|
|
|
|
63 |
|
64 |
+
# Create tabs
|
65 |
+
step01 = "Step 01: Upload Files"
|
66 |
+
step02 = "Step 02: Edit Knowledge Base"
|
67 |
+
step03 = "Step 03: Split text"
|
68 |
+
tabs = [step01, step02, step03]
|
69 |
+
selected_tab = st.sidebar.radio("Choose a tab", tabs)
|
70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
if "text_content" not in st.session_state:
|
72 |
st.session_state.text_content = ""
|
73 |
|
74 |
+
# Define content for each tab
|
75 |
+
if selected_tab == step02:
|
76 |
+
st.subheader("Knowledge Base Text Area")
|
77 |
+
st.session_state.text_content = st.text_area(
|
78 |
+
"Knowledge Text Area", st.session_state.text_content, height=400)
|
79 |
+
if st.button("Compute Tokens"):
|
80 |
+
if model_choice == "OpenAI":
|
81 |
+
num_tokens = len(encoding_openAI.encode(
|
82 |
+
st.session_state.text_content))
|
83 |
+
st.write(f"Total number of tokens (OpenAI): {num_tokens}")
|
84 |
+
else:
|
85 |
+
tokens_count = len(encoding_anthropic.encode(
|
86 |
+
st.session_state.text_content))
|
87 |
+
st.write(f"Total number of tokens (Anthropic): {tokens_count}")
|
88 |
+
elif selected_tab == step01:
|
89 |
+
st.subheader("Upload PDFs to Append to Knowledge Base")
|
90 |
+
|
91 |
+
uploaded_files = st.file_uploader(
|
92 |
+
"Upload PDF files", type="pdf", accept_multiple_files=True)
|
93 |
+
if uploaded_files:
|
94 |
+
pdf_files_data = [io.BytesIO(uploaded_file.read())
|
95 |
+
for uploaded_file in uploaded_files]
|
96 |
+
file_names = [uploaded_file.name for uploaded_file in uploaded_files]
|
97 |
+
|
98 |
+
if st.button('Convert and add to knowledge database'):
|
99 |
+
converting_message = st.text("Converting PDFs...")
|
100 |
+
converted_text = "\n".join(pdf_to_text(pdf_files_data, file_names))
|
101 |
+
st.session_state.text_content += converted_text
|
102 |
+
converting_message.empty()
|
103 |
+
|
104 |
+
elif selected_tab == step03:
|
105 |
+
st.subheader("Splitting Options")
|
106 |
+
|
107 |
+
model_choice = st.selectbox(
|
108 |
+
"Choose a Model", ["OpenAI", "Anthropic"], key="model_choice_selectbox")
|
109 |
+
max_tokens = st.number_input(
|
110 |
+
"Max number of tokens per chunk", min_value=100, value=8000, key="max_tokens_input")
|
111 |
+
clean_text = st.checkbox("Clean text before encoding and splitting?")
|
112 |
+
|
113 |
+
# Add prefix and postfix input options
|
114 |
+
prefix = st.text_area("Prefix for each chunk:", "")
|
115 |
+
postfix = st.text_area("Postfix for each chunk:", "")
|
116 |
|
117 |
if clean_text:
|
118 |
st.session_state.text_content = clean_text_content(
|
|
|
123 |
chunks = [encoding_openAI.decode(chunk_tokens) if model_choice == "OpenAI" else encoding_anthropic.decode(
|
124 |
chunk_tokens) for chunk_tokens in chunks_generator]
|
125 |
|
|
|
126 |
for i, chunk in enumerate(chunks, 1):
|
127 |
+
# Add prefix and postfix to each chunk
|
128 |
+
chunk_with_affixes = f"{prefix}{chunk}{postfix}"
|
129 |
+
chunk_content = st.text_area(
|
130 |
+
f"Chunk {i} content:", chunk_with_affixes, height=200)
|
|
|
|
|
|
|
|
|
|
|
|
|
|