Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -92,7 +92,7 @@ def extract_text_from_pptx(pptx_data, clean=True):
|
|
92 |
text = clean_text(text)
|
93 |
return text, len(text)
|
94 |
|
95 |
-
def read_document(file_path, clean=True):
|
96 |
with open(file_path, "rb") as f:
|
97 |
file_content = f.read()
|
98 |
|
@@ -159,8 +159,8 @@ def read_document(file_path, clean=True):
|
|
159 |
soup = BeautifulSoup(file_content, 'html.parser')
|
160 |
structured_data = {
|
161 |
"Texts": extract_texts(soup),
|
162 |
-
"Links": extract_links(soup,
|
163 |
-
"Images": extract_images(soup,
|
164 |
}
|
165 |
return format_detailed_output(structured_data), 0
|
166 |
except Exception as e:
|
@@ -204,7 +204,7 @@ def download_and_process_file(url, clean=True):
|
|
204 |
if kind and kind.mime.startswith('image/'):
|
205 |
return f"![]({url})", 0 # Return markdown image syntax if it's an image
|
206 |
else:
|
207 |
-
return read_document(temp_filename, clean) # Otherwise, process as a document
|
208 |
|
209 |
except requests.exceptions.MissingSchema:
|
210 |
return "Error: Invalid URL format. Even after adding 'http://', the URL is still invalid.", 0
|
|
|
92 |
text = clean_text(text)
|
93 |
return text, len(text)
|
94 |
|
95 |
+
def read_document(file_path, clean=True, url=""):
|
96 |
with open(file_path, "rb") as f:
|
97 |
file_content = f.read()
|
98 |
|
|
|
159 |
soup = BeautifulSoup(file_content, 'html.parser')
|
160 |
structured_data = {
|
161 |
"Texts": extract_texts(soup),
|
162 |
+
"Links": extract_links(soup, url),
|
163 |
+
"Images": extract_images(soup, url)
|
164 |
}
|
165 |
return format_detailed_output(structured_data), 0
|
166 |
except Exception as e:
|
|
|
204 |
if kind and kind.mime.startswith('image/'):
|
205 |
return f"![]({url})", 0 # Return markdown image syntax if it's an image
|
206 |
else:
|
207 |
+
return read_document(temp_filename, clean, url) # Otherwise, process as a document
|
208 |
|
209 |
except requests.exceptions.MissingSchema:
|
210 |
return "Error: Invalid URL format. Even after adding 'http://', the URL is still invalid.", 0
|