# mbuali's picture
# Upload folder using huggingface_hub
# d1ceb73 verified
"""
Module that extracts attachments from notebooks into their own files
"""
# Copyright (c) Jupyter Development Team.
# Distributed under the terms of the Modified BSD License.
import os
from base64 import b64decode
from traitlets import Bool, Unicode
from .base import Preprocessor
class ExtractAttachmentsPreprocessor(Preprocessor):
    """
    Extracts attachments from all (markdown and raw) cells in a notebook.

    Every attachment is base64-decoded and stored in ``resources`` under a
    file path, and the ``attachment:<name>`` references in the cell source
    are rewritten to point at that path.

    Where the attachments go depends on ``use_separate_dir``:

    * ``False`` (default): they are placed under ``resources["output_files_dir"]``
      and stored in ``resources["outputs"]``.
    * ``True``: they are placed under a directory named after
      ``attachments_directory_template`` and stored in ``resources["attachments"]``.

    https://nbformat.readthedocs.io/en/latest/format_description.html#cell-attachments
    """

    attachments_directory_template = Unicode(
        "{notebook_name}_attachments",
        help="Directory to place attachments if use_separate_dir is True",
    ).tag(config=True)

    use_separate_dir = Bool(
        False,
        help="Whether to use output_files_dir (which ExtractOutput also uses) or "
        "create a separate directory for attachments",
    ).tag(config=True)

    def __init__(self, **kw):
        """
        Public constructor
        """
        super().__init__(**kw)

        # Directory path that is prefixed to every extracted attachment's filename.
        # It is set in self.preprocess because it needs resources.
        self.path_name = ""

        # Key in resources under which extracted attachments are stored.
        # Set here as a default, in case someone doesn't want to call preprocess.
        self.resources_item_key = "attachments"

    # Add condition and configurability here
    def preprocess(self, nb, resources):
        """
        Determine some settings and apply preprocessor to notebook

        Reads ``resources["unique_key"]`` when ``use_separate_dir`` is set,
        otherwise ``resources["output_files_dir"]``. Sets ``self.path_name``
        and ``self.resources_item_key`` and guarantees that
        ``resources[self.resources_item_key]`` is a dict before the cells
        are processed.

        Returns the ``(nb, resources)`` pair produced by the parent class.
        """
        if self.use_separate_dir:
            self.path_name = self.attachments_directory_template.format(
                notebook_name=resources["unique_key"]
            )
            # Initialize resources for attachments
            resources["attachment_files_dir"] = self.path_name
            resources["attachments"] = {}
            self.resources_item_key = "attachments"
        else:
            # Use same resources as ExtractOutput
            self.path_name = resources["output_files_dir"]
            self.resources_item_key = "outputs"

        # Make sure the key exists and holds a dict. ``.get`` is used so that a
        # mapping without the key is initialized instead of raising KeyError.
        if not isinstance(resources.get(self.resources_item_key), dict):
            resources[self.resources_item_key] = {}

        nb, resources = super().preprocess(nb, resources)
        return nb, resources

    def preprocess_cell(self, cell, resources, index):
        """
        Extract attachments to individual files and
        change references to them.

        E.g.
        '![image.png](attachment:021fdd80.png)'
        becomes
        '![image.png]({path_name}/021fdd80.png)'

        Attachments that carry no data (an empty mime bundle) are skipped and
        their references are left untouched.

        Assumes self.path_name and self.resources_item_key is set properly (usually in preprocess).

        Returns the ``(cell, resources)`` pair.
        """
        if "attachments" not in cell:
            return cell, resources

        for fname in cell.attachments:
            self.log.debug("Encountered attachment %s", fname)

            # Right now I don't know of a situation where there would be multiple
            # mime types under same filename, and I can't index into it without the mimetype.
            # So only the first one is read.
            mime_bundle = cell.attachments[fname]
            first_mimetype = next(iter(mime_bundle), None)
            if first_mimetype is None:
                # Without this guard there would be nothing to write: the decoded
                # payload would be undefined, or worse, left over from the
                # previous attachment.
                self.log.debug("Attachment %s has no data, skipping", fname)
                continue

            # convert to bytes and decode
            decoded = b64decode(mime_bundle[first_mimetype].encode("utf-8"))

            # FilesWriter wants path to be in attachment filename here
            new_filename = os.path.join(self.path_name, fname)
            resources[self.resources_item_key][new_filename] = decoded

            # Edit the reference to the attachment
            # os.path.join on windows uses "\\" separator,
            # but files like markdown still want "/"
            if os.path.sep != "/":
                new_filename = new_filename.replace(os.path.sep, "/")
            cell.source = cell.source.replace("attachment:" + fname, new_filename)

        return cell, resources