Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def _safe_name(file_name, sep):
    """Build an ASCII, normalized variant of ``file_name``.

    The value is passed through ``stringify``, ``ascii_text``,
    ``category_replace`` (using ``UNICODE_CATEGORIES``) and
    ``collapse_spaces``, in that order. Every occurrence of ``WS`` in the
    result is then swapped for ``sep``.

    Returns ``None`` when ``stringify`` yields ``None`` or when the cleaned
    text ends up being ``None`` or empty.
    """
    raw = stringify(file_name)
    if raw is None:
        return None

    # Same three transformations as a single composed pipeline.
    cleaned = collapse_spaces(
        category_replace(ascii_text(raw), UNICODE_CATEGORIES)
    )
    if cleaned is None or len(cleaned) == 0:
        return None
    return cleaned.replace(WS, sep)
def normalize(text, lowercase=True, collapse=True, latinize=False, ascii=False,
              encoding_default=DEFAULT_ENCODING, encoding=None,
              replace_categories=UNICODE_CATEGORIES):
    """The main normalization function for text.

    This will take a string and apply a set of transformations to it so
    that it can be processed more easily afterwards. Arguments:

    * ``lowercase``: convert the text to lower case (enabled by default).
    * ``collapse``: replace multiple whitespace-like characters with a
      single whitespace. This is especially useful with category replacement
      which can lead to a lot of whitespace (enabled by default).
    * ``decompose``: apply a unicode normalization (NFKD) to separate
      simple characters and their diacritics.
    * ``replace_categories``: This will perform a replacement of whole
      classes of unicode characters (e.g. symbols, marks, numbers) with a
      given character. It is used to replace any non-text elements of the
      input string. Defaults to ``UNICODE_CATEGORIES``.

    NOTE(review): ``decompose`` is described above but is not a parameter
    of the current signature -- confirm whether this entry is stale.

    NOTE(review): ``latinize``, ``ascii``, ``encoding_default`` and
    ``encoding`` are accepted but not described here; presumably they
    control transliteration and how encoded input is decoded -- TODO
    confirm against the implementation before relying on this.
    """
    # NOTE(review): no statements are visible after the docstring here;
    # confirm that the function body has not been lost.
def category_replace(text, replacements=UNICODE_CATEGORIES):
    """Replace or drop characters according to their unicode category.

    ``text`` is first run through ``decompose_nfkd``. For every resulting
    character, its ``unicodedata`` category is looked up in
    ``replacements``:

    * a category mapped to a value is replaced by that value,
    * a category mapped to ``None`` causes the character to be dropped,
    * a category that is not listed leaves the character unchanged.

    This allows non-text characters (such as punctuation, whitespace,
    marks and diacritics) to be handled by class rather than individually.

    Returns the joined result, or ``None`` when ``is_text(text)`` is false.
    """
    if not is_text(text):
        return None

    def _lookup(char):
        # Characters with an unlisted category fall back to themselves.
        return replacements.get(unicodedata.category(char), char)

    mapped = (_lookup(char) for char in decompose_nfkd(text))
    # A ``None`` mapping means "remove this character".
    return u''.join(piece for piece in mapped if piece is not None)