Skip to content

Util¤

util ¤

utils.py.

This file contains helper functions used by multiple modules.

Last Updated: June 24, 2025. Last Tested: June 24, 2025.

Functions:

Name Description
ensure_list

Ensure the item is wrapped in a list.

ensure_path

Ensure string is converted to a Path.

get_encoding

Use chardet to return the encoding type of a string.

get_paths

Get a list of paths in a directory.

get_token_extension_names

Get the names of token extensions from a spaCy Doc.

is_valid_colour

Check if a string is a valid colour.

load_spacy_model

Load a spaCy language model.

normalize

Normalise a string to LexosFile format.

normalize_file

Normalise a file to LexosFile format and save the file.

normalize_files

Normalise a list of files to LexosFile format and save the files.

normalize_strings

Normalise a list of strings to LexosFile format.

strip_doc

Strip leading and normalise trailing whitespace in a spaCy Doc.

to_collection

Validate and cast a value or values to a collection.

ensure_list(item: Any) -> list ¤

Ensure the item is wrapped in a list.

Parameters:

Name Type Description Default
item Any

Anything.

required

Returns:

Type Description
list

The item inside a list if it is not already a list.

Source code in lexos/util.py
def ensure_list(item: Any) -> list:
    """Ensure the item is wrapped in a list.

    Args:
        item (Any): Anything.

    Returns:
        The item inside a list if it is not already a list.
    """
    if not isinstance(item, list):
        item = [item]
    return item

ensure_path(path: Any) -> Any ¤

Ensure string is converted to a Path.

Parameters:

Name Type Description Default
path Any

Anything. If string, it's converted to Path.

required

Returns:

Type Description
Any

Path or original argument.

Source code in lexos/util.py
def ensure_path(path: Any) -> Any:
    """Ensure string is converted to a Path.

    Args:
        path (Any): Anything. If string, it's converted to Path.

    Returns:
        Path or original argument.
    """
    # Non-strings (including existing Path objects) pass through untouched.
    if not isinstance(path, str):
        return path
    # Normalise Windows-style separators before building the Path.
    return Path(path.replace("\\", "/"))

get_encoding(input_string: bytes) -> str ¤

Use chardet to return the encoding type of a string.

Parameters:

Name Type Description Default
input_string bytes

A bytestring.

required

Returns:

Type Description
str

The string's encoding type.

Source code in lexos/util.py
def get_encoding(input_string: bytes) -> str:
    """Use chardet to return the encoding type of a string.

    Args:
        input_string (bytes): A bytestring.

    Returns:
        The string's encoding type.
    """
    # Only a prefix of the bytes is needed for detection.
    sample = input_string[: constants.MIN_ENCODING_DETECT]
    detected = chardet.detect(sample)["encoding"]
    # chardet reports None when it cannot decide; fall back to UTF-8.
    if detected is None:
        return "utf-8"
    return detected

get_paths(path: Path | str) -> list ¤

Get a list of paths in a directory.

Parameters:

Name Type Description Default
path Path | str

The path to the directory.

required

Returns:

Name Type Description
list list

A list of file paths.

Source code in lexos/util.py
def get_paths(path: Path | str) -> list:
    """Get a list paths in a directory.

    Args:
        path (Path | str): The path to the directory.

    Returns:
        list: A list of file paths.
    """
    return list(Path(path).glob("**/*"))

get_token_extension_names(doc: Doc) -> list[str] ¤

Get the names of token extensions from a spaCy Doc.

Parameters:

Name Type Description Default
doc Doc

spaCy Doc to analyze.

required

Returns:

Type Description
list[str]

list[str]: a list of token extensions.

Source code in lexos/util.py
def get_token_extension_names(doc: Doc) -> list[str]:
    """Get the names of token extensions from a spaCy Doc.

    Args:
        doc: spaCy Doc to analyze.

    Returns:
        list[str]: a list of token extensions.

    Note: Reads the extensions registered on the first token, so the Doc
    must contain at least one token.
    """
    # The underscore namespace stores registered extensions in its
    # "_extensions" dict; iterating the dict yields the extension names.
    return list(doc[0]._.__dict__["_extensions"])

is_valid_colour(color: str) -> bool ¤

Check if a string is a valid colour.

Parameters:

Name Type Description Default
color str

A string representing a colour.

required

Returns:

Type Description
bool

True if the string is a valid colour, False otherwise.

Note: Implements Pydantic's Color type for validation. See https://docs.pydantic.dev/2.0/usage/types/extra_types/color_types/ for more information.

Source code in lexos/util.py
def is_valid_colour(color: str) -> bool:
    """Check if a string is a valid colour.

    Args:
        color: A string representing a colour.

    Returns:
        True if the string is a valid colour, False otherwise.

    Note: Implements Pydantic's Color type for validation.
    See https://docs.pydantic.dev/2.0/usage/types/extra_types/color_types/ for more information.
    """
    try:
        Color(color)
        return True
    except PydanticCustomError:
        return False

load_spacy_model(model: Language | str) -> Language ¤

Load a spaCy language model.

Parameters:

Name Type Description Default
model Language | str

The spaCy model to load, either as a Language object or a string representing the model name.

required

Returns:

Name Type Description
Language Language

The loaded spaCy language model.

Raises:

Type Description
LexosException

If the model cannot be loaded or if the model type is incorrect.

Source code in lexos/util.py
def load_spacy_model(model: Language | str) -> Language:
    """Load a spaCy language model.

    Args:
        model (Language | str): The spaCy model to load, either as a Language
            object or a string representing the model name.

    Returns:
        Language: The loaded spaCy language model.

    Raises:
        LexosException: If the model cannot be loaded or if the model type is incorrect.
    """
    if isinstance(model, Language):
        # Already a loaded pipeline; nothing to do.
        return model
    if not isinstance(model, str):
        raise LexosException("Model must be a string or a spaCy Language object.")
    try:
        return spacy.load(model)
    except OSError as e:
        # Chain the original OSError so the underlying cause (typically a
        # missing model package) is preserved in the traceback.
        raise LexosException(
            f"Error loading model '{model}'. Please check the name and try again. You may need to install the model on your system."
        ) from e

normalize(raw_bytes: bytes | str) -> str ¤

Normalise a string to LexosFile format.

Parameters:

Name Type Description Default
raw_bytes bytes | str

The input bytestring.

required

Returns:

Type Description
str

Normalised version of the input string.

Source code in lexos/util.py
def normalize(raw_bytes: bytes | str) -> str:
    """Normalise a string to LexosFile format.

    Args:
        raw_bytes (bytes | str): The input bytes or string.

    Returns:
        Normalised version of the input string.
    """
    # All decoding/normalisation work is delegated to the module helper.
    return _decode_bytes(raw_bytes)

normalize_file(filepath: Path | str, destination_dir: Path | str = '.') -> None ¤

Normalise a file to LexosFile format and save the file.

Parameters:

Name Type Description Default
filepath Path | str

The path to the input file.

required
destination_dir Path | str

The path to the directory where the file will be saved.

'.'
Source code in lexos/util.py
def normalize_file(filepath: Path | str, destination_dir: Path | str = ".") -> None:
    """Normalise a file to LexosFile format and save the file.

    Args:
        filepath (Path | str): The path to the input file.
        destination_dir (Path | str): The path to the directory where the file
            will be saved.
    """
    filepath = ensure_path(filepath)
    destination_dir = ensure_path(destination_dir)
    with open(filepath, "rb") as f:
        raw = f.read()
    # filepath.name keeps only the filename, so the output lands directly
    # inside destination_dir. Write as UTF-8 so the result does not depend
    # on the platform's locale encoding.
    with open(destination_dir / filepath.name, "w", encoding="utf-8") as f:
        f.write(normalize(raw))

normalize_files(filepaths: list[Path | str], destination_dir: Path | str = '.') -> None ¤

Normalise a list of files to LexosFile format and save the files.

Parameters:

Name Type Description Default
filepaths list[Path | str]

The list of paths to input files.

required
destination_dir Path | str

The path to the directory where the files will be saved.

'.'
Source code in lexos/util.py
def normalize_files(
    filepaths: list[Path | str], destination_dir: Path | str = "."
) -> None:
    """Normalise a list of files to LexosFile format and save the files.

    Args:
        filepaths (list[Path | str]): The list of paths to input files.
        destination_dir (Path | str): The path to the directory where the files
            will be saved.
    """
    # Convert once up front: a plain string (including the default ".")
    # would raise a TypeError on the `/` path-join operator below.
    destination_dir = ensure_path(destination_dir)
    for filepath in filepaths:
        filepath = ensure_path(filepath)
        with open(filepath, "rb") as f:
            raw = f.read()
        # Write as UTF-8 so the result does not depend on the platform's
        # locale encoding.
        with open(destination_dir / filepath.name, "w", encoding="utf-8") as f:
            f.write(normalize(raw))

normalize_strings(strings: list[str]) -> list[str] ¤

Normalise a list of strings to LexosFile format.

Parameters:

Name Type Description Default
strings list[str]

The list of input strings.

required

Returns:

Type Description
list[str]

A list of normalised versions of the input strings.

Source code in lexos/util.py
def normalize_strings(strings: list[str]) -> list[str]:
    """Normalise a list of strings to LexosFile format.

    Args:
        strings (list[str]): The list of input strings.

    Returns:
        A list of normalised versions of the input strings.
    """
    return [normalize(s) for s in strings]

strip_doc(doc: Doc) -> Doc ¤

Strip leading and normalise trailing whitespace in a spaCy Doc.

Parameters:

Name Type Description Default
doc Doc

spaCy Doc to analyze

required

Returns:

Name Type Description
Doc Doc

the Doc with leading and trailing whitespace removed.

Raises:

Type Description
LexosException

If the Doc is empty.

Note: If the final token has trailing whitespace, this will be preserved.

You can remove the space with:

```python
words = [t.text for t in doc]
spaces = [t.whitespace_ for t in doc]
spaces[-1] = ""
doc = Doc(doc.vocab, words=words, spaces=spaces)
```

But you will lose all entities and custom extensions. So it makes more sense to call doc.text.strip() when needed instead.

Source code in lexos/util.py
def strip_doc(doc: Doc) -> Doc:
    """Strip leading and normalise trailing whitespace in a spaCy Doc.

    Args:
        doc: spaCy Doc to analyze

    Returns:
        Doc: the Doc with leading and trailing whitespace removed.

    Raises:
        LexosException: If the Doc is empty.

    Note: If the final token has trailing whitespace, this will be preserved.
          You can remove the space with:

          ```python
          words = [t.text for t in doc]
          spaces = [t.whitespace_ for t in doc]
          spaces[-1] = ""
          doc = Doc(doc.vocab, words=words, spaces=spaces)
          ```

          But you will lose all entities and custom extensions. So it makes more
          sense to call doc.text.strip() when needed instead.
    """
    if not doc:
        raise LexosException("Document is empty.")

    # Find first non-whitespace token; stays 0 if every token is
    # whitespace, in which case the whole Doc is returned unchanged.
    start_idx = 0
    for token in doc:
        if not token.is_space:
            start_idx = token.i
            break

    # Find last non-whitespace token, scanning from the end.
    end_idx = len(doc) - 1
    for i in range(len(doc) - 1, -1, -1):
        if not doc[i].is_space:
            end_idx = i
            break

    # Slice the span and convert it back into a standalone Doc.
    return doc[start_idx : end_idx + 1].as_doc()

to_collection(val: AnyVal | Collection[AnyVal], val_type: type[Any] | tuple[type[Any], ...], col_type: type[Any]) -> Collection[AnyVal] ¤

Validate and cast a value or values to a collection.

Parameters:

Name Type Description Default
val AnyVal | Collection[AnyVal]

Value or values to validate and cast.

required
val_type type[Any] | tuple[type[Any], ...]

Type of each value in collection, e.g. int or (str, bytes).

required
col_type type[Any]

Type of collection to return, e.g. tuple or set.

required

Returns:

Type Description
Collection[AnyVal]

Collection[AnyVal]: Collection of type col_type with values all of type val_type.

Raises:

Type Description
TypeError

An invalid value was passed.

Source code in lexos/util.py
def to_collection(
    val: AnyVal | Collection[AnyVal],
    val_type: type[Any] | tuple[type[Any], ...],
    col_type: type[Any],
) -> Collection[AnyVal]:
    """Validate and cast a value or values to a collection.

    Args:
        val (AnyVal | Collection[AnyVal]): Value or values to validate and cast.
        val_type (type[Any] | tuple[type[Any], ...]): Type of each value in collection, e.g. ``int`` or ``(str, bytes)``.
        col_type (type[Any]): Type of collection to return, e.g. ``tuple`` or ``set``.

    Returns:
        Collection[AnyVal]: Collection of type ``col_type`` with values all of type ``val_type``.

    Raises:
        TypeError: An invalid value was passed.
    """
    if val is None:
        return []
    # A single value of the expected type gets wrapped in the target collection.
    if isinstance(val, val_type):
        return col_type([val])
    if not isinstance(val, (tuple, list, set, frozenset)):
        # TODO: use standard error message, maybe?
        raise TypeError(
            f"values must be {val_type} or a collection thereof, not {type(val)}"
        )
    for element in val:
        if not isinstance(element, val_type):
            raise TypeError(f"not all values are of type {val_type}")
    return col_type(val)