Spaces:
Running
Running
mvansegbroeck
committed on
Commit
•
1420dd0
1
Parent(s):
c2923d2
Update app.py
Browse files
app.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
import gradio as gr
|
2 |
import requests
|
3 |
import os
|
|
|
4 |
import markdownify
|
5 |
import fitz # PyMuPDF
|
6 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
@@ -32,6 +33,15 @@ def markdown_to_text(md_path):
|
|
32 |
with open(md_path, 'r') as file:
|
33 |
return file.read()
|
34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
# Function to split text into chunks
|
36 |
def split_text_into_chunks(text, chunk_size=25, chunk_overlap=5, min_chunk_chars=50):
|
37 |
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
|
@@ -91,9 +101,10 @@ def process_files(uploaded_files, use_example, chunk_size, chunk_overlap, min_ch
|
|
91 |
text = markdown_to_text(file_path)
|
92 |
else:
|
93 |
text = ""
|
94 |
-
|
95 |
markdown_text = markdownify.markdownify(text)
|
96 |
file_id = os.path.splitext(os.path.basename(file_path))[0]
|
|
|
97 |
markdown_path = os.path.join(output_dir, f"{file_id}.md")
|
98 |
with open(markdown_path, 'w') as file:
|
99 |
file.write(markdown_text)
|
|
|
1 |
import gradio as gr
|
2 |
import requests
|
3 |
import os
|
4 |
+
import re
|
5 |
import markdownify
|
6 |
import fitz # PyMuPDF
|
7 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
|
33 |
with open(md_path, 'r') as file:
|
34 |
return file.read()
|
35 |
|
36 |
+
def sanitize_key(filename):
    """Turn a file name into a short, ASCII-only key.

    Spaces become underscores, every character other than ASCII letters,
    digits and underscores is dropped, and the result is capped at 100
    characters. If nothing survives that filtering (for example a name
    written entirely in a non-Latin script, or an empty string), a
    deterministic ``file_<hash>`` key is returned instead of ``""``, so
    different inputs do not all collapse onto the same empty key.

    Args:
        filename: The name to sanitize.

    Returns:
        A non-empty string containing only ``[a-zA-Z0-9_]``, at most 100
        characters long.
    """
    # Local import: hashlib is only needed for the fallback path below.
    import hashlib

    # Replace spaces first so they are kept as separators, then drop every
    # remaining character outside the allowed set and truncate.
    key = re.sub(r'[^a-zA-Z0-9_]', '', filename.replace(" ", "_"))[:100]
    if key:
        return key

    # Every character was stripped: derive a stable key from the original
    # name. "surrogatepass" keeps odd OS file names from raising on encode.
    digest = hashlib.sha256(filename.encode("utf-8", "surrogatepass")).hexdigest()
    return f"file_{digest[:8]}"
|
44 |
+
|
45 |
# Function to split text into chunks
|
46 |
def split_text_into_chunks(text, chunk_size=25, chunk_overlap=5, min_chunk_chars=50):
|
47 |
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
|
|
|
101 |
text = markdown_to_text(file_path)
|
102 |
else:
|
103 |
text = ""
|
104 |
+
|
105 |
markdown_text = markdownify.markdownify(text)
|
106 |
file_id = os.path.splitext(os.path.basename(file_path))[0]
|
107 |
+
file_id = sanitize_key(file_id)
|
108 |
markdown_path = os.path.join(output_dir, f"{file_id}.md")
|
109 |
with open(markdown_path, 'w') as file:
|
110 |
file.write(markdown_text)
|