Source code for slangmod.etl.encoding

import re
from swak.misc import ArgRepr
from ..config import config

__all__ = [
    'EncodingEnforcer',
    'enforce_encoding'
]


[docs] class EncodingEnforcer(ArgRepr): """Force a text into an encoding, replacing unrepresentable characters. Parameters ---------- encoding: str The target encoding. Chose one from `the list of built-in codecs <https://docs.python.org/3/library/codecs.html#standard-encodings>`_. repl: str, optional The string to replace unrepresentable characters with. Defaults to a single space (" "). """ pattern = re.compile(r'\\N{.+?}') def __init__(self, encoding: str, repl: str = ' ') -> None: self.encoding = encoding self.repl = repl super().__init__(encoding, repl)
[docs] def __call__(self, text: str) -> str: """Replace unrepresentable characters in the given text. Parameters ---------- text: str The text to force into the specified encoding by replacing unrepresentable characters. Returns ------- str The text in the target encoding with unrepresentable characters replaced. """ replaced = text.encode( self.encoding, 'namereplace' ).decode(self.encoding) return self.pattern.sub(self.repl, replaced)
# Provide a ready-to use instance of the EncodingEnforcer enforce_encoding = EncodingEnforcer(encoding=config.files.encoding)