Source code for slangmod.io.tokenizer

from pathlib import Path
from swak.misc import ArgRepr
from tokenizers import Tokenizer
from ..config import config
from ..ml import Algo, tokenizer

# Names exported by ``from <this module> import *``: the two wrapper classes
# and the two pre-configured instances created at the bottom of this module.
__all__ = [
    'TokenizerSaver',
    'TokenizerLoader',
    'save_tokenizer',
    'load_tokenizer'
]


class TokenizerSaver(ArgRepr):
    """Callable that forwards to the ``save`` method of a Tokenizer or Algo.

    Parameters
    ----------
    path: str
        Target path (including the file name) for the saved tokenizer. It
        may contain any number of string placeholders (i.e., pairs of curly
        brackets), which are filled in with the `parts` passed when the
        instance is called. Surrounding whitespace is stripped. Defaults to
        an empty string, which resolves to the current working directory of
        the python interpreter.
    create: bool, optional
        If ``True``, the parent directory of the target file (and any
        missing ancestors) is created before saving. If ``False``, no
        directory is created. Defaults to ``False``.

    """

    def __init__(self, path: str = '', create: bool = False) -> None:
        cleaned = str(path).strip()
        self.path = cleaned
        self.create = create
        super().__init__(cleaned, create)

    def __call__(self, algo: Tokenizer | Algo, *parts: str) -> tuple[()]:
        """Write a Tokenizer or Algo to file.

        Parameters
        ----------
        algo: Tokenizer or Algo
            The object whose ``save`` method is called with the final path.
        *parts: str, optional
            Fragments interpolated, in order, into the placeholders of the
            `path` given at instantiation. There must be at least as many
            of them as there are placeholders in `path`.

        Returns
        -------
        tuple
            An empty tuple.

        """
        # Whitespace is stripped again because interpolated parts may
        # re-introduce it at either end of the path.
        target = Path(self.path.format(*parts).strip())
        destination = target.resolve()
        if self.create:
            target.parent.mkdir(parents=True, exist_ok=True)
        algo.save(str(destination))
        return ()


[docs] class TokenizerLoader[T: (Algo, Tokenizer)](ArgRepr): """Load a previously saved Tokenizer or Algo from file. Parameters ---------- algo: Tokenizer or Algo A fresh, trained, or tainted instance of a tokenizer or an Algo. path: str, optional Full or partial path to the model to load. If not fully specified here, it can be completed on calling the instance. Defaults to the current working directory of the python interpreter. """ def __init__(self, algo: T, path: str = '') -> None: self.algo = algo self.path = str(path).strip() super().__init__(self.path)
[docs] def __call__(self, path: str = '') -> T: """Load a previously saved Tokenizer or Algo from file Parameters ---------- path: str, optional Path (including file name) to the file to load. If it starts with a backslash, it will be interpreted as absolute, if not, as relative to the `path` specified at instantiation. Defaults to an empty string, which results in an unchanged `path`. Returns ------- Tokenizer or Algo A new instance of the same type as the `algo` provided at instantiation with its internal parameters set to what was read from file. """ path = Path(self.path) / str(path).strip() file = str(path.resolve()) return self.algo.from_file(file)
# Provide ready-to-use instances of both the Saver and the Loader, each
# pointed at the tokenizer file taken from the project configuration.
save_tokenizer = TokenizerSaver(config.tokenizer_file, create=True)
load_tokenizer = TokenizerLoader(algo=tokenizer, path=config.tokenizer_file)