HAOUARI Noureddine committed on
Commit
d0341c5
·
1 Parent(s): 11fb0d6

better version 02

Browse files
Files changed (1) hide show
  1. app.py +53 -33
app.py CHANGED
@@ -11,8 +11,6 @@ encoding_anthropic = client.get_tokenizer()
11
 
12
  # Model choice and max tokens input
13
  model_choice = st.sidebar.selectbox("Choose a Model", ["OpenAI", "Anthropic"])
14
- max_tokens = st.sidebar.number_input(
15
- "Max number of tokens per chunk", min_value=100, value=8000)
16
 
17
 
18
  def clean_text_content(text):
@@ -61,30 +59,60 @@ def pdf_to_text(pdf_files_data, file_names):
61
  return results
62
 
63
 
64
- st.title("PDF splitter")
65
- st.markdown(
66
- "Upload PDF files and get their content in text format splitted based on the max tokens.")
67
 
 
 
 
 
 
 
68
 
69
- uploaded_files = st.sidebar.file_uploader(
70
- "Upload PDF files", type="pdf", accept_multiple_files=True)
71
-
72
- clean_text = st.sidebar.checkbox("Clean text before encoding and splitting?")
73
-
74
- # Check if the text is not already in session_state
75
  if "text_content" not in st.session_state:
76
  st.session_state.text_content = ""
77
 
78
- if uploaded_files:
79
- pdf_files_data = [io.BytesIO(uploaded_file.read())
80
- for uploaded_file in uploaded_files]
81
- file_names = [uploaded_file.name for uploaded_file in uploaded_files]
82
-
83
- if st.sidebar.button('Convert'):
84
- converting_message = st.sidebar.text("Converting PDFs...")
85
- converted_text = "\n".join(pdf_to_text(pdf_files_data, file_names))
86
- st.session_state.text_content += converted_text
87
- converting_message.empty()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
  if clean_text:
90
  st.session_state.text_content = clean_text_content(
@@ -95,16 +123,8 @@ if uploaded_files:
95
  chunks = [encoding_openAI.decode(chunk_tokens) if model_choice == "OpenAI" else encoding_anthropic.decode(
96
  chunk_tokens) for chunk_tokens in chunks_generator]
97
 
98
- # Display each chunk in a separate text area
99
  for i, chunk in enumerate(chunks, 1):
100
- chunk_content = st.text_area(f"Chunk {i} content:", chunk, height=200)
101
-
102
- # Button to compute tokens of the text area content
103
- if st.button("Compute Tokens"):
104
- if model_choice == "OpenAI":
105
- num_tokens = len(encoding_openAI.encode(st.session_state.text_content))
106
- st.write(f"Total number of tokens (OpenAI): {num_tokens}")
107
- else:
108
- tokens_count = len(encoding_anthropic.encode(
109
- st.session_state.text_content))
110
- st.write(f"Total number of tokens (Anthropic): {tokens_count}")
 
11
 
12
  # Model choice and max tokens input
13
  model_choice = st.sidebar.selectbox("Choose a Model", ["OpenAI", "Anthropic"])
 
 
14
 
15
 
16
  def clean_text_content(text):
 
59
  return results
60
 
61
 
62
+ st.title("PDF Utility")
 
 
63
 
64
+ # Create tabs
65
+ step01 = "Step 01: Upload Files"
66
+ step02 = "Step 02: Edit Knowledge Base"
67
+ step03 = "Step 03: Split text"
68
+ tabs = [step01, step02, step03]
69
+ selected_tab = st.sidebar.radio("Choose a tab", tabs)
70
 
 
 
 
 
 
 
71
  if "text_content" not in st.session_state:
72
  st.session_state.text_content = ""
73
 
74
+ # Define content for each tab
75
+ if selected_tab == step02:
76
+ st.subheader("Knowledge Base Text Area")
77
+ st.session_state.text_content = st.text_area(
78
+ "Knowledge Text Area", st.session_state.text_content, height=400)
79
+ if st.button("Compute Tokens"):
80
+ if model_choice == "OpenAI":
81
+ num_tokens = len(encoding_openAI.encode(
82
+ st.session_state.text_content))
83
+ st.write(f"Total number of tokens (OpenAI): {num_tokens}")
84
+ else:
85
+ tokens_count = len(encoding_anthropic.encode(
86
+ st.session_state.text_content))
87
+ st.write(f"Total number of tokens (Anthropic): {tokens_count}")
88
+ elif selected_tab == step01:
89
+ st.subheader("Upload PDFs to Append to Knowledge Base")
90
+
91
+ uploaded_files = st.file_uploader(
92
+ "Upload PDF files", type="pdf", accept_multiple_files=True)
93
+ if uploaded_files:
94
+ pdf_files_data = [io.BytesIO(uploaded_file.read())
95
+ for uploaded_file in uploaded_files]
96
+ file_names = [uploaded_file.name for uploaded_file in uploaded_files]
97
+
98
+ if st.button('Convert and add to knowledge database'):
99
+ converting_message = st.text("Converting PDFs...")
100
+ converted_text = "\n".join(pdf_to_text(pdf_files_data, file_names))
101
+ st.session_state.text_content += converted_text
102
+ converting_message.empty()
103
+
104
+ elif selected_tab == step03:
105
+ st.subheader("Splitting Options")
106
+
107
+ model_choice = st.selectbox(
108
+ "Choose a Model", ["OpenAI", "Anthropic"], key="model_choice_selectbox")
109
+ max_tokens = st.number_input(
110
+ "Max number of tokens per chunk", min_value=100, value=8000, key="max_tokens_input")
111
+ clean_text = st.checkbox("Clean text before encoding and splitting?")
112
+
113
+ # Add prefix and postfix input options
114
+ prefix = st.text_area("Prefix for each chunk:", "")
115
+ postfix = st.text_area("Postfix for each chunk:", "")
116
 
117
  if clean_text:
118
  st.session_state.text_content = clean_text_content(
 
123
  chunks = [encoding_openAI.decode(chunk_tokens) if model_choice == "OpenAI" else encoding_anthropic.decode(
124
  chunk_tokens) for chunk_tokens in chunks_generator]
125
 
 
126
  for i, chunk in enumerate(chunks, 1):
127
+ # Add prefix and postfix to each chunk
128
+ chunk_with_affixes = f"{prefix}{chunk}{postfix}"
129
+ chunk_content = st.text_area(
130
+ f"Chunk {i} content:", chunk_with_affixes, height=200)