Source code for slangmod.io.corpus

import warnings
from collections.abc import Iterable, Callable
from itertools import chain
from pathlib import Path
from swak.misc import ArgRepr
from swak.text import NotFound, LiteralNotFound
from swak.funcflow import Filter
from ..config import config
from .files import read_column

# Names exported by ``from slangmod.io.corpus import *``. Besides the classes
# and ready-to-use instances defined in this module, the ``NotFound`` enum
# imported from ``swak.text`` is re-exported as well.
__all__ = [
    'NotFound',
    'CorpusDiscovery',
    'CorpusFilter',
    'CorpusLoader',
    'load_corpus',
    'discover_raw',
    'discover_corpus',
    'discover_encodings',
    'filter_train_files',
    'filter_test_files',
    'filter_validation_files'
]


[docs] class CorpusDiscovery(ArgRepr): """Discover files in a given directory and filter by name and suffix. Parameters ---------- folder: str, optional Parent directory to search for files. Subdirectories can be specified when calling instances. Defaults to the working directory of the current python interpreter. *file_types: str, optional File names must contain at least one of these strings. suffix: str, optional Extension glob pattern that files must match (without leading dot). Defaults to "parquet". not_found: str, optional What to do if either the directory does not exist or no matching files are found in the given directory. One of "ignore", "warn", or "raise". Use the `NotFound <https://yedivanseven.github.io/swak/text.html#swak. text.misc.NotFound>`_ enum to avoid typos. Defaults to "raise". If set otherwise, an empty tuple of file names might be returned. """ def __init__( self, folder: str = '', *file_types: str, suffix: str = 'parquet', not_found: NotFound | LiteralNotFound = NotFound.RAISE, ) -> None: self.folder = str(folder).strip() self.types = tuple( str(file_type).strip() for file_type in file_types ) if file_types else ('',) self.suffix = str(suffix).strip(' .') self.not_found = str(not_found).strip().lower() super().__init__( self.folder, *self.types, suffix=self.suffix, not_found=self.not_found )
[docs] def __call__(self, subfolder: str = '') -> list[str]: """Chose subdirectory and filter names of files found therein. Parameters ---------- subfolder: str, optional Subdirectory relative to the parent given at instantiation. Defaults to an empty string, resulting in the that parent directory to be searched. Returns ------- list Fully resolved names of files that match the given criteria from within the specified directory. Raises ------ FileNotFoundError Only if `not_found` is set to "raise", and then only if either the directory was not found or no files matching the specified criteria were found in that directory. """ path = Path(self.folder) / str(subfolder).strip() corpus = [ str(item.resolve()) for item in path.glob(f'*.{self.suffix}') if item.is_file() and any(prefix in item.name for prefix in self.types) ] if path.exists() and path.is_dir() else [] if corpus: return corpus # If no files were found, act according to the not_found flag template = 'No *.{} files with any of {} in their name in folder "{}"!' msg = template.format(self.suffix, self.types, path.resolve()) match self.not_found: case NotFound.WARN: warnings.warn(msg) case NotFound.RAISE: raise FileNotFoundError(msg) return corpus
class CorpusLoader(ArgRepr):
    """Chain the documents of several multi-document files into one stream.

    Parameters
    ----------
    reader: callable
        When called with a single file name, it must return some sort of
        iterable over the documents (=strings) read from that file.

    """

    def __init__(self, reader: Callable[[str], Iterable[str]]) -> None:
        super().__init__(reader)
        self.reader = reader

    def __call__(self, files: Iterable[str]) -> chain[str]:
        """Lazily read the given files and iterate over all their documents.

        Parameters
        ----------
        files: iterable over str
            Names of files to chain documents from.

        Returns
        -------
        Iterator
            An ``itertools.chain`` iterator over all documents from all files.

        """
        # Bind the reader now so that the lazy generator below keeps using
        # the reader that was set at the time of the call.
        reader = self.reader
        documents_per_file = (reader(file) for file in files)
        return chain.from_iterable(documents_per_file)
class CorpusFilter(ArgRepr):
    """Check whether a cached string occurs in the name of a given file.

    Parameters
    ----------
    part: str
        Part of the file name to filter for.

    """

    def __init__(self, part: str) -> None:
        self.part = part
        super().__init__(part)

    def __call__(self, file: str) -> bool:
        """Test the final component of a file path for the cached string.

        Parameters
        ----------
        file: str
            Name of the file to test. Can include parent folder(s), which
            are disregarded in the test.

        Returns
        -------
        bool
            Whether the cached `part` occurs in the file name at least once.

        """
        # Only the last path component counts, not the parent folder(s).
        file_name = Path(file).name
        return self.part in file_name
# Provide ready-to-use instances of the CorpusDiscovery discover_raw = CorpusDiscovery( config.files.raw, *config.files.types, suffix=config.files.suffix, not_found=NotFound.RAISE ) discover_corpus = CorpusDiscovery( config.corpus, *config.files.types, suffix=config.files.suffix, not_found=NotFound.RAISE ) discover_encodings = CorpusDiscovery( config.encodings, *config.files.types, suffix=config.files.suffix, not_found=NotFound.RAISE ) # Provide a ready-to-use instance of the CorpusLoader load_corpus = CorpusLoader(read_column) # Provide ready-to-use instances of the CorpusFilter ... train_files_filter = CorpusFilter(config.files.train) test_files_filter = CorpusFilter(config.files.test) validation_files_filter = CorpusFilter(config.files.validation) # ... and the actual filters filter_train_files = Filter[str, list](train_files_filter) filter_test_files = Filter[str, list](test_files_filter) filter_validation_files = Filter[str, list](validation_files_filter)