HAOUARI Noureddine committed on
Commit
11fb0d6
·
1 Parent(s): d952b7c

better version

Browse files
Files changed (2) hide show
  1. app.py +79 -14
  2. requirements.txt +0 -0
app.py CHANGED
@@ -3,43 +3,108 @@ from concurrent.futures import ThreadPoolExecutor
3
  import streamlit as st
4
  import io
5
  from anthropic import Anthropic
 
 
6
  client = Anthropic()
 
 
7
 
8
# (pre-commit version) Sidebar API-key input, removed by this commit.
st.sidebar.title("API Configuration")
# NOTE(review): label says "Open API" (sic — presumably "OpenAI"), and the
# captured value is never read anywhere visible in this file.
api_key = st.sidebar.text_input("Enter your Open API key:")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
 
12
def convert_pdf_to_text(pdf_file_data, file_name):
    """Extract all text from one PDF and wrap it in a labelled "---" block.

    pdf_file_data: binary file-like object (e.g. io.BytesIO) accepted by
        PdfReader — assumed imported at the top of the file (not visible here).
    file_name: name written into the "file name:" header of the block.
    Returns the labelled text as a single string.
    """
    text = "\n---\n"
    text += f"file name: {file_name}\n content: \n"
    pdf_reader = PdfReader(pdf_file_data)
    # Extract all text at once
    text += "".join([page.extract_text() for page in pdf_reader.pages])
    text += "\n---\n"
    return text
20
 
21
 
22
def pdf_to_text(pdf_files_data, file_names):
    """Convert several PDFs to text concurrently.

    Returns the iterator produced by Executor.map, yielding one labelled
    text block per input file, in input order.
    """
    # Create a ThreadPoolExecutor to run the conversion in parallel
    with ThreadPoolExecutor() as executor:
        # Use the executor to map the convert_pdf_to_text function over all the pdf_files_data
        results = executor.map(convert_pdf_to_text, pdf_files_data, file_names)
    # Leaving the with-block calls shutdown(wait=True), so all conversions
    # have completed before the lazy map iterator is handed back.
    return results
29
 
30
 
31
# (pre-commit version) Minimal converter UI: upload PDFs, convert on demand,
# show the combined text plus an Anthropic token count.
st.title("PDF to Text Converter")
st.markdown("Upload PDF files and get their content in text format.")

uploaded_files = st.file_uploader(
    "Upload PDF files", type="pdf", accept_multiple_files=True)

if uploaded_files:
    # Buffer each upload in memory so worker threads get independent
    # file-like objects.
    pdf_files_data = [io.BytesIO(uploaded_file.read())
                      for uploaded_file in uploaded_files]
    file_names = [uploaded_file.name for uploaded_file in uploaded_files]
    if st.button('Convert'):
        with st.spinner('Converting PDFs...'):
            text = "\n".join(pdf_to_text(pdf_files_data, file_names))
            st.text_area("Text content:", text, height=200)
            st.write(f"Number of tokens: {client.count_tokens(text)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import streamlit as st
4
  import io
5
  from anthropic import Anthropic
6
import tiktoken
import re
client = Anthropic()
# Two tokenizers: OpenAI's cl100k_base via tiktoken, and the tokenizer the
# Anthropic client exposes. The sidebar choice below selects between them.
encoding_openAI = tiktoken.get_encoding("cl100k_base")
encoding_anthropic = client.get_tokenizer()

# Model choice and max tokens input
model_choice = st.sidebar.selectbox("Choose a Model", ["OpenAI", "Anthropic"])
max_tokens = st.sidebar.number_input(
    "Max number of tokens per chunk", min_value=100, value=8000)
16
+
17
+
18
def clean_text_content(text):
    """Strip every character outside a small ASCII whitelist.

    Keeps English letters, digits, spaces, line breaks and common
    punctuation/symbols; all other characters are dropped.
    """
    disallowed = r'[^a-zA-Z0-9 \r\n.,;!?()\-\'\"&+:%$#@*]'
    return re.sub(disallowed, '', text)
22
+
23
+
24
def create_chunks(text, n, tokenizer_name):
    """Yield successive ~n-token chunks of *text*, preferring sentence ends.

    text: full text to split.
    n: target chunk size in tokens; actual chunks fall between 0.5*n and
        1.5*n tokens when a sentence boundary can be found in that window.
    tokenizer_name: "OpenAI" selects the tiktoken cl100k_base encoding,
        anything else selects the Anthropic tokenizer.
    Yields token-id sequences; decode them with the same tokenizer.
    """
    tokenizer = encoding_openAI if tokenizer_name == "OpenAI" else encoding_anthropic
    encoded = tokenizer.encode(text)

    # Check for type of token and adapt accordingly: one tokenizer returns an
    # object carrying an .ids attribute, the other a plain list of ints.
    tokens = encoded.ids if hasattr(encoded, "ids") else encoded

    i = 0
    while i < len(tokens):
        # Find the nearest end of sentence within a range of 0.5 * n and 1.5 * n tokens
        j = min(i + int(1.5 * n), len(tokens))
        while j > i + int(0.5 * n):
            # Decode the tokens and check for full stop or newline
            # NOTE(review): re-decoding on every step of this backward scan is
            # O(window^2) decode work per chunk; tolerable here, but a
            # token-level boundary search would be cheaper.
            chunk = tokenizer.decode(tokens[i:j])
            if chunk.endswith(".") or chunk.endswith("\n"):
                break
            j -= 1
        # If no end of sentence found, use n tokens as the chunk size
        # (j only equals i + int(0.5*n) when the scan above exhausted the
        # window without finding a boundary).
        if j == i + int(0.5 * n):
            j = min(i + n, len(tokens))
        yield tokens[i:j]
        i = j
47
 
48
 
49
def convert_pdf_to_text(pdf_file_data, file_name):
    """Pull all page text out of a single PDF and wrap it in a "---" block.

    pdf_file_data: binary file-like object accepted by PdfReader.
    file_name: label written into the block header.
    Returns the labelled text as one string.
    """
    reader = PdfReader(pdf_file_data)
    pages_text = "".join(page.extract_text() for page in reader.pages)
    header = f"file name: {file_name}\n content: \n"
    return "\n---\n" + header + pages_text + "\n---\n"
56
 
57
 
58
def pdf_to_text(pdf_files_data, file_names):
    """Convert many PDFs to labelled text blocks using a thread pool.

    Returns an iterator over the per-file strings, in input order.
    """
    with ThreadPoolExecutor() as pool:
        converted = pool.map(convert_pdf_to_text, pdf_files_data, file_names)
    # The pool's shutdown (end of the with-block) waits for every submitted
    # conversion, so the lazy map iterator is safe to consume after return.
    return converted
62
 
63
 
64
# --- Streamlit UI: upload PDFs, convert to text, split into token chunks ---
st.title("PDF splitter")
# Fix: user-facing grammar — "splitted" -> "split".
st.markdown(
    "Upload PDF files and get their content in text format split based on the max tokens.")


uploaded_files = st.sidebar.file_uploader(
    "Upload PDF files", type="pdf", accept_multiple_files=True)

clean_text = st.sidebar.checkbox("Clean text before encoding and splitting?")

# Converted text survives Streamlit reruns via session_state.
if "text_content" not in st.session_state:
    st.session_state.text_content = ""

if uploaded_files:
    # Buffer every upload in memory so the worker threads each get an
    # independent file-like object.
    pdf_files_data = [io.BytesIO(uploaded_file.read())
                      for uploaded_file in uploaded_files]
    file_names = [uploaded_file.name for uploaded_file in uploaded_files]

    if st.sidebar.button('Convert'):
        converting_message = st.sidebar.text("Converting PDFs...")
        converted_text = "\n".join(pdf_to_text(pdf_files_data, file_names))
        # NOTE(review): "+=" accumulates across Convert clicks — clicking
        # twice appends the same files again. Confirm this is intended.
        st.session_state.text_content += converted_text
        converting_message.empty()

if clean_text:
    # Re-cleaning already-cleaned text is a no-op, so applying this on
    # every rerun is safe.
    st.session_state.text_content = clean_text_content(
        st.session_state.text_content)

# Split the accumulated text, then decode each token chunk back to a string
# with the same tokenizer that produced it.
chunks_generator = create_chunks(
    st.session_state.text_content, max_tokens, model_choice)
chunks = [encoding_openAI.decode(chunk_tokens) if model_choice == "OpenAI"
          else encoding_anthropic.decode(chunk_tokens)
          for chunk_tokens in chunks_generator]

# Display each chunk in a separate text area.
# (Dropped the unused `chunk_content =` binding — the widget value was
# never read.)
for i, chunk in enumerate(chunks, 1):
    st.text_area(f"Chunk {i} content:", chunk, height=200)

# Button to compute tokens of the full accumulated text
if st.button("Compute Tokens"):
    if model_choice == "OpenAI":
        num_tokens = len(encoding_openAI.encode(st.session_state.text_content))
        st.write(f"Total number of tokens (OpenAI): {num_tokens}")
    else:
        tokens_count = len(encoding_anthropic.encode(
            st.session_state.text_content))
        st.write(f"Total number of tokens (Anthropic): {tokens_count}")
requirements.txt CHANGED
Binary files a/requirements.txt and b/requirements.txt differ