Skip to content

Util¤

util ¤

utils.py.

This file contains helper functions used by multiple modules.

Last Updated: June 24, 2025. Last Tested: June 24, 2025.

Functions:

Name Description
ensure_list

Ensure the item is wrapped in a list.

ensure_path

Ensure string is converted to a Path.

get_encoding

Use chardet to return the encoding type of a string.

get_paths

Get a list of paths in a directory.

get_token_extension_names

Get the names of token extensions from a spaCy Doc.

is_valid_colour

Check if a string is a valid colour.

load_spacy_model

Load a spaCy language model.

normalize

Normalise a string to LexosFile format.

normalize_file

Normalise a file to LexosFile format and save the file.

normalize_files

Normalise a list of files to LexosFile format and save the files.

normalize_strings

Normalise a list of strings to LexosFile format.

strip_doc

Strip leading and normalise trailing whitespace in a spaCy Doc.

to_collection

Validate and cast a value or values to a collection.

ensure_list(item: Any) -> list ¤

Ensure the item is wrapped in a list.

Parameters:

Name Type Description Default
item Any

Anything.

required

Returns:

Type Description
list

The item inside a list if it is not already a list.

Source code in lexos/util.py
def ensure_list(item: Any) -> list:
    """Ensure the item is wrapped in a list.

    Args:
        item (Any): Anything.

    Returns:
        The item inside a list if it is not already a list.
    """
    if not isinstance(item, list):
        item = [item]
    return item

ensure_path(path: Any) -> Any ¤

Ensure string is converted to a Path.

Parameters:

Name Type Description Default
path Any

Anything. If string, it's converted to Path.

required

Returns:

Type Description
Any

Path or original argument.

Source code in lexos/util.py
def ensure_path(path: Any) -> Any:
    """Ensure string is converted to a Path.

    Args:
        path (Any): Anything. If string, it's converted to Path.

    Returns:
        Path or original argument.
    """
    # Non-strings (including existing Path objects) pass through untouched.
    if not isinstance(path, str):
        return path
    # Normalise Windows-style separators before building the Path.
    return Path(path.replace("\\", "/"))

get_encoding(input_string: bytes) -> str ¤

Use chardet to return the encoding type of a string.

Parameters:

Name Type Description Default
input_string bytes

A bytestring.

required

Returns:

Type Description
str

The string's encoding type.

Source code in lexos/util.py
def get_encoding(input_string: bytes) -> str:
    """Use chardet to return the encoding type of a string.

    Args:
        input_string (bytes): A bytestring.

    Returns:
        The string's encoding type.
    """
    # Only a prefix of the bytes is needed for detection.
    sample = input_string[: constants.MIN_ENCODING_DETECT]
    detected = chardet.detect(sample)["encoding"]
    # chardet reports None when it cannot decide; fall back to UTF-8.
    if detected is None:
        return "utf-8"
    return detected

get_paths(path: Path | str) -> list ¤

Get a list of paths in a directory.

Parameters:

Name Type Description Default
path Path | str

The path to the directory.

required

Returns:

Name Type Description
list list

A list of file paths.

Source code in lexos/util.py
def get_paths(path: Path | str) -> list:
    """Get a list paths in a directory.

    Args:
        path (Path | str): The path to the directory.

    Returns:
        list: A list of file paths.
    """
    return list(Path(path).glob("**/*"))

get_token_extension_names(doc: Doc) -> list[str] ¤

Get the names of token extensions from a spaCy Doc.

Parameters:

Name Type Description Default
doc Doc

spaCy Doc to analyze.

required

Returns:

Type Description
list[str]

list[str]: a list of token extensions.

Source code in lexos/util.py
def get_token_extension_names(doc: Doc) -> list[str]:
    """Get the names of token extensions from a spaCy Doc.

    Args:
        doc: spaCy Doc to analyze.

    Returns:
        list[str]: a list of token extensions.

    Note: Reads the extensions registered on the first token, so the Doc
    must contain at least one token.
    """
    # The underscore namespace stores registered extensions in its
    # "_extensions" dict; iterating the dict yields the extension names.
    return list(doc[0]._.__dict__["_extensions"])

is_valid_colour(color: str) -> bool ¤

Check if a string is a valid colour.

Parameters:

Name Type Description Default
color str

A string representing a colour.

required

Returns:

Type Description
bool

True if the string is a valid colour, False otherwise.

Note: Implements Pydantic's Color type for validation. See https://docs.pydantic.dev/2.0/usage/types/extra_types/color_types/ for more information.

Source code in lexos/util.py
def is_valid_colour(color: str) -> bool:
    """Check if a string is a valid colour.

    Args:
        color: A string representing a colour.

    Returns:
        True if the string is a valid colour, False otherwise.

    Note: Implements Pydantic's Color type for validation.
    See https://docs.pydantic.dev/2.0/usage/types/extra_types/color_types/ for more information.
    """
    try:
        Color(color)
        return True
    except PydanticCustomError:
        return False

load_spacy_model(model: Language | str) -> Language ¤

Load a spaCy language model.

Parameters:

Name Type Description Default
model Language | str

The spaCy model to load, either as a Language object or a string representing the model name.

required

Returns:

Name Type Description
Language Language

The loaded spaCy language model.

Raises:

Type Description
LexosException

If the model cannot be loaded or if the model type is incorrect.

Source code in lexos/util.py
def load_spacy_model(model: Language | str) -> Language:
    """Load a spaCy language model.

    Args:
        model (Language | str): The spaCy model to load, either as a Language
            object or a string representing the model name.

    Returns:
        Language: The loaded spaCy language model.

    Raises:
        LexosException: If the model cannot be loaded or if the model type is incorrect.
    """
    if isinstance(model, Language):
        # Already a loaded pipeline; nothing to do.
        return model
    if not isinstance(model, str):
        raise LexosException("Model must be a string or a spaCy Language object.")
    try:
        return spacy.load(model)
    except OSError as e:
        # Chain the original OSError so the underlying cause (typically a
        # missing model package) is preserved in the traceback.
        raise LexosException(
            f"Error loading model '{model}'. Please check the name and try again. You may need to install the model on your system."
        ) from e

normalize(raw_bytes: bytes | str) -> str ¤

Normalise a string to LexosFile format.

Parameters:

Name Type Description Default
raw_bytes bytes | str

The input bytestring.

required

Returns:

Type Description
str

Normalised version of the input string.

Source code in lexos/util.py
def normalize(raw_bytes: bytes | str) -> str:
    """Normalise a string to LexosFile format.

    Args:
        raw_bytes (bytes | str): The input bytes or string.

    Returns:
        Normalised version of the input string.
    """
    # All decoding/normalisation work is delegated to the module helper.
    return _decode_bytes(raw_bytes)

normalize_file(filepath: Path | str, destination_dir: Path | str = '.') -> None ¤

Normalise a file to LexosFile format and save the file.

Parameters:

Name Type Description Default
filepath Path | str

The path to the input file.

required
destination_dir Path | str

The path to the directory where the file will be saved.

'.'
Source code in lexos/util.py
def normalize_file(filepath: Path | str, destination_dir: Path | str = ".") -> None:
    """Normalise a file to LexosFile format and save the file.

    Args:
        filepath (Path | str): The path to the input file.
        destination_dir (Path | str): The path to the directory where the file
            will be saved.
    """
    filepath = ensure_path(filepath)
    destination_dir = ensure_path(destination_dir)
    with open(filepath, "rb") as f:
        raw = f.read()
    # filepath.name keeps only the filename, so the output lands directly
    # inside destination_dir. Write as UTF-8 so the result does not depend
    # on the platform's locale encoding.
    with open(destination_dir / filepath.name, "w", encoding="utf-8") as f:
        f.write(normalize(raw))

normalize_files(filepaths: list[Path | str], destination_dir: Path | str = '.') -> None ¤

Normalise a list of files to LexosFile format and save the files.

Parameters:

Name Type Description Default
filepaths list[Path | str]

The list of paths to input files.

required
destination_dir Path | str

The path to the directory where the files will be saved.

'.'
Source code in lexos/util.py
def normalize_files(
    filepaths: list[Path | str], destination_dir: Path | str = "."
) -> None:
    """Normalise a list of files to LexosFile format and save the files.

    Args:
        filepaths (list[Path | str]): The list of paths to input files.
        destination_dir (Path | str): The path to the directory where the files
            will be saved.
    """
    # Convert once up front: a plain string (including the default ".")
    # would raise a TypeError on the `/` path-join operator below.
    destination_dir = ensure_path(destination_dir)
    for filepath in filepaths:
        filepath = ensure_path(filepath)
        with open(filepath, "rb") as f:
            raw = f.read()
        # Write as UTF-8 so the result does not depend on the platform's
        # locale encoding.
        with open(destination_dir / filepath.name, "w", encoding="utf-8") as f:
            f.write(normalize(raw))

normalize_strings(strings: list[str]) -> list[str] ¤

Normalise a list of strings to LexosFile format.

Parameters:

Name Type Description Default
strings list[str]

The list of input strings.

required

Returns:

Type Description
list[str]

A list of normalised versions of the input strings.

Source code in lexos/util.py
def normalize_strings(strings: list[str]) -> list[str]:
    """Normalise a list of strings to LexosFile format.

    Args:
        strings (list[str]): The list of input strings.

    Returns:
        A list of normalised versions of the input strings.
    """
    return [normalize(s) for s in strings]

strip_doc(doc: Doc) -> Doc ¤

Strip leading and normalise trailing whitespace in a spaCy Doc.

Parameters:

Name Type Description Default
doc Doc

spaCy Doc to analyze

required

Returns:

Name Type Description
Doc Doc

the Doc with leading and trailing whitespace removed.

Raises:

Type Description
LexosException

If the Doc is empty.

Note: If the final token has trailing whitespace, this will be preserved.

You can remove the space with:

```python
words = [t.text for t in doc]
spaces = [t.whitespace_ for t in doc]
spaces[-1] = ""
doc = Doc(doc.vocab, words=words, spaces=spaces)
```

But you will lose all entities and custom extensions. So it makes more sense to call doc.text.strip() when needed instead.

Source code in lexos/util.py
def strip_doc(doc: Doc) -> Doc:
    """Strip leading and normalise trailing whitespace in a spaCy Doc.

    Args:
        doc: spaCy Doc to analyze

    Returns:
        Doc: the Doc with leading and trailing whitespace removed.

    Raises:
        LexosException: If the Doc is empty.

    Note: If the final token has trailing whitespace, this will be preserved.
          You can remove the space with:

          ```python
          words = [t.text for t in doc]
          spaces = [t.whitespace_ for t in doc]
          spaces[-1] = ""
          doc = Doc(doc.vocab, words=words, spaces=spaces)
          ```

          But you will lose all entities and custom extensions. So it makes more
          sense to call doc.text.strip() when needed instead.
    """
    if not doc:
        raise LexosException("Document is empty.")

    # Find first non-whitespace token; stays 0 if every token is
    # whitespace, in which case the whole Doc is returned unchanged.
    start_idx = 0
    for token in doc:
        if not token.is_space:
            start_idx = token.i
            break

    # Find last non-whitespace token, scanning from the end.
    end_idx = len(doc) - 1
    for i in range(len(doc) - 1, -1, -1):
        if not doc[i].is_space:
            end_idx = i
            break

    # Slice the span and convert it back into a standalone Doc.
    return doc[start_idx : end_idx + 1].as_doc()

to_collection(val: AnyVal | Collection[AnyVal], val_type: type[Any] | tuple[type[Any], ...], col_type: type[Any]) -> Collection[AnyVal] ¤

Validate and cast a value or values to a collection.

Parameters:

Name Type Description Default
val AnyVal | Collection[AnyVal]

Value or values to validate and cast.

required
val_type type[Any] | tuple[type[Any], ...]

Type of each value in collection, e.g. int or (str, bytes).

required
col_type type[Any]

Type of collection to return, e.g. tuple or set.

required

Returns:

Type Description
Collection[AnyVal]

Collection[AnyVal]: Collection of type col_type with values all of type val_type.

Raises:

Type Description
TypeError

An invalid value was passed.

Source code in lexos/util.py
def to_collection(
    val: AnyVal | Collection[AnyVal],
    val_type: type[Any] | tuple[type[Any], ...],
    col_type: type[Any],
) -> Collection[AnyVal]:
    """Validate and cast a value or values to a collection.

    Args:
        val (AnyVal | Collection[AnyVal]): Value or values to validate and cast.
        val_type (type[Any] | tuple[type[Any], ...]): Type of each value in collection, e.g. ``int`` or ``(str, bytes)``.
        col_type (type[Any]): Type of collection to return, e.g. ``tuple`` or ``set``.

    Returns:
        Collection[AnyVal]: Collection of type ``col_type`` with values all of type ``val_type``.

    Raises:
        TypeError: An invalid value was passed.
    """
    if val is None:
        return []
    # A single value of the expected type gets wrapped in the target collection.
    if isinstance(val, val_type):
        return col_type([val])
    if not isinstance(val, (tuple, list, set, frozenset)):
        # TODO: use standard error message, maybe?
        raise TypeError(
            f"values must be {val_type} or a collection thereof, not {type(val)}"
        )
    for element in val:
        if not isinstance(element, val_type):
            raise TypeError(f"not all values are of type {val_type}")
    return col_type(val)