diff --git a/ebook_converter/__init__.py b/ebook_converter/__init__.py
index 74bd500..df643fa 100644
--- a/ebook_converter/__init__.py
+++ b/ebook_converter/__init__.py
@@ -9,40 +9,6 @@ from ebook_converter import constants_old
 from ebook_converter.ebooks.html_entities import html5_entities
 
 
-def sanitize_file_name(name, substitute='_'):
-    """
-    Sanitize the filename `name`. All invalid characters are replaced by
-    `substitute`. The set of invalid characters is the union of the invalid
-    characters in Windows, macOS and Linux. Also removes leading and trailing
-    whitespace.
-
-    **WARNING:** This function also replaces path separators, so only pass
-    file names and not full paths to it.
-    """
-
-    if isinstance(name, bytes):
-        name = name.decode(constants_old.filesystem_encoding, 'replace')
-    if isinstance(substitute, bytes):
-        substitute = substitute.decode(constants_old.filesystem_encoding,
-                                       'replace')
-    chars = (substitute
-             if c in set(('\\', '|', '?', '*', '<', '"', ':', '>', '+', '/') +
-                         tuple(map(chr, range(32)))) else c for c in name)
-    one = ''.join(chars)
-    one = re.sub(r'\s', ' ', one).strip()
-    bname, ext = os.path.splitext(one)
-    one = re.sub(r'^\.+$', '_', bname)
-    one = one.replace('..', substitute)
-    one += ext
-    # Windows doesn't like path components that end with a period or space
-    if one and one[-1] in ('.', ' '):
-        one = one[:-1]+'_'
-    # Names starting with a period are hidden on Unix
-    if one.startswith('.'):
-        one = '_' + one[1:]
-    return one
-
-
 def fit_image(width, height, pwidth, pheight):
     """
     Fit image in box of width pwidth and height pheight.
diff --git a/ebook_converter/ebooks/oeb/polish/replace.py b/ebook_converter/ebooks/oeb/polish/replace.py
index 3e8fdc8..5d6c47d 100644
--- a/ebook_converter/ebooks/oeb/polish/replace.py
+++ b/ebook_converter/ebooks/oeb/polish/replace.py
@@ -3,7 +3,7 @@ from functools import partial
 from collections import Counter, defaultdict
 import urllib.parse
 
-from ebook_converter import sanitize_file_name
+from ebook_converter.utils import filenames as fms
 from ebook_converter.ebooks.chardet import strip_encoding_declarations
 from ebook_converter.ebooks.oeb.base import css_text
 from ebook_converter.ebooks.oeb.polish.css import iter_declarations, remove_property_value
@@ -203,7 +203,7 @@ def rename_files(container, file_map):
 
 def replace_file(container, name, path, basename, force_mt=None):
     dirname, base = name.rpartition('/')[0::2]
-    nname = sanitize_file_name(basename)
+    nname = fms.sanitize_file_name(basename)
     if dirname:
         nname = dirname + '/' + nname
     with open(path, 'rb') as src:
diff --git a/ebook_converter/utils/filenames.py b/ebook_converter/utils/filenames.py
index cc7f73d..8294d9b 100644
--- a/ebook_converter/utils/filenames.py
+++ b/ebook_converter/utils/filenames.py
@@ -4,15 +4,56 @@ meaning as possible.
 """
 import errno
 import os
+import re
 import shutil
 from math import ceil
 
-from ebook_converter import force_unicode, sanitize_file_name
+from ebook_converter import force_unicode
 from ebook_converter.constants_old import (filesystem_encoding,
                                            preferred_encoding)
 from ebook_converter.utils.localization import get_udc
 
 
+def sanitize_file_name(name, substitute='_'):
+    """
+    Sanitize the filename `name`. All invalid characters are replaced by
+    `substitute`. The set of invalid characters is the union of the invalid
+    characters in Windows, macOS and Linux. Also removes leading and trailing
+    whitespace.
+
+    **WARNING:** This function also replaces path separators, so only pass
+    file names and not full paths to it.
+
+    :param name: file name to sanitize, str or bytes; bytes are decoded using
+                 `filesystem_encoding` with undecodable bytes replaced.
+    :param substitute: replacement for each invalid character, str or bytes.
+    :return: the sanitized file name as str.
+    """
+
+    if isinstance(name, bytes):
+        name = name.decode(filesystem_encoding, 'replace')
+    if isinstance(substitute, bytes):
+        substitute = substitute.decode(filesystem_encoding, 'replace')
+    # Build the set of invalid characters once, not once per character.
+    invalid = frozenset(('\\', '|', '?', '*', '<', '"', ':', '>', '+', '/') +
+                        tuple(map(chr, range(32))))
+    one = ''.join(substitute if c in invalid else c for c in name)
+    # Turn every remaining whitespace character into a plain space and trim.
+    one = re.sub(r'\s', ' ', one).strip()
+    bname, ext = os.path.splitext(one)
+    # A base name made up only of periods becomes a single underscore.
+    one = re.sub(r'^\.+$', '_', bname)
+    one = one.replace('..', substitute)
+    one += ext
+    # Windows doesn't like path components that end with a period or space
+    if one and one[-1] in ('.', ' '):
+        one = one[:-1]+'_'
+    # Names starting with a period are hidden on Unix
+    if one.startswith('.'):
+        one = '_' + one[1:]
+    return one
+
+
 def ascii_text(orig):
     udc = get_udc()
     try:
diff --git a/ebook_converter/utils/zipfile.py b/ebook_converter/utils/zipfile.py
index 2a780f8..e50e71c 100644
--- a/ebook_converter/utils/zipfile.py
+++ b/ebook_converter/utils/zipfile.py
@@ -7,7 +7,7 @@ import binascii
 from contextlib import closing
 from tempfile import SpooledTemporaryFile
 
-from ebook_converter import sanitize_file_name
+from ebook_converter.utils import filenames as fms
 from ebook_converter.constants_old import filesystem_encoding
 from ebook_converter.ebooks.chardet import detect
 from ebook_converter.polyglot.builtins import as_bytes
@@ -1135,7 +1135,7 @@ class ZipFile:
                     os.makedirs(upperdirs)
             except:  # Added by Kovid
                 targetpath = os.path.join(base_target,
-                        sanitize_file_name(fname))
+                        fms.sanitize_file_name(fname))
                 upperdirs = os.path.dirname(targetpath)
                 if upperdirs and not os.path.exists(upperdirs):
                     os.makedirs(upperdirs)
@@ -1156,7 +1156,7 @@ class ZipFile:
             except:
                 # Try sanitizing the file name to remove invalid characters
                 components = list(os.path.split(targetpath))
-                components[-1] = sanitize_file_name(components[-1])
+                components[-1] = fms.sanitize_file_name(components[-1])
                 targetpath = os.sep.join(components)
                 with open(targetpath, 'wb') as target:
                     shutil.copyfileobj(source, target)