Whitespace Counter Tokenizer ¤

This class inherits from the main Tokenizer class and extends it by capturing line breaks and counting runs of spaces, storing each token's character width in a custom width token extension (token._.width).

WhitespaceCounter pydantic-model ¤

Bases: Tokenizer

Whitespace tokenizer that captures line breaks and counts runs of spaces.

Source code in lexos/tokenizer/whitespace_counter.py
class WhitespaceCounter(Tokenizer):
    """Whitespace tokenizer that captures line breaks and counts runs of spaces."""

    def _get_token_widths(self, text: str) -> tuple[list[str], list[int]]:
        """Get the widths of tokens in a doc.

        Args:
            text (str): The input text.

        Returns:
            tuple[list[str], list[int]]: A tuple containing the tokens and widths.
        """
        # Pattern: words, line breaks, or runs of spaces
        pattern = re.compile(r"([^\s\n]+)|(\n)|([ ]{2,})|([ ])")
        tokens = []
        widths = []
        for match in pattern.finditer(text):
            word, newline, multi_space, single_space = match.groups()
            if word:
                tokens.append(word)
                widths.append(len(word))  # Use number of characters in word
            elif newline:
                tokens.append("\n")
                widths.append(1)  # Use 1 to indicate a line break
            elif multi_space:
                tokens.append(" ")
                widths.append(len(multi_space))
            elif single_space:
                tokens.append(" ")
                widths.append(1)
        return tokens, widths

    @validate_call
    def make_doc(
        self, text: str, max_length: int = None, disable: list[str] = []
    ) -> Doc:
        """Return a doc from a text.

        Args:
            text (str): The text to be parsed.
            max_length (int): The maximum length of the doc.
            disable (list[str]): A list of spaCy pipeline components to disable.

        Returns:
            Doc: A spaCy doc object.
        """
        # Override instance settings with keyword arguments
        if max_length:
            self.max_length = max_length
            self.nlp.max_length = max_length
        if disable:
            self.nlp.select_pipes(disable=disable)
        tokens, widths = self._get_token_widths(text)
        if not Token.has_extension("width"):
            Token.set_extension("width", default=0)
        doc = Doc(self.nlp.vocab, words=tokens)
        for token, count in zip(doc, widths):
            token._.width = count
        # Apply pipeline components manually, skipping those in 'disable'
        for name, proc in self.nlp.pipeline:
            if name not in disable:
                doc = proc(doc)
        return doc

    @validate_call
    def make_docs(
        self,
        texts: Iterable[str],
        max_length: int = None,
        disable: Iterable[str] = [],
        chunk_size: int = 1000,
    ) -> Iterable[Doc]:
        """Return a generator of docs from an iterable of texts, processing in chunks.

        Args:
            texts (Iterable[str]): The texts to process.
            max_length (int, optional): Maximum doc length.
            disable (Iterable[str], optional): Pipeline components to disable.
            chunk_size (int, optional): Number of docs to process per chunk.

        Yields:
            Doc: spaCy Doc objects.
        """
        if max_length:
            self.max_length = max_length
            self.nlp.max_length = max_length

        if not Token.has_extension("width"):
            Token.set_extension("width", default=0)
        enabled_pipes = [
            (name, proc) for name, proc in self.nlp.pipeline if name not in disable
        ]

        def chunker(iterable, size):
            chunk = []
            for item in iterable:
                chunk.append(item)
                if len(chunk) == size:
                    yield chunk
                    chunk = []
            if chunk:
                yield chunk

        for text_chunk in chunker(texts, chunk_size):
            docs = []
            for text in text_chunk:
                tokens, widths = self._get_token_widths(text)
                doc = Doc(self.nlp.vocab, words=tokens)
                for token, count in zip(doc, widths):
                    token._.width = count
                docs.append(doc)
            for _, proc in enabled_pipes:
                docs = [proc(doc) for doc in docs]
            yield from docs
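
A minimal usage sketch (assuming the default xx_sent_ud_sm model is installed and that the class is importable from the module path shown above):

from lexos.tokenizer.whitespace_counter import WhitespaceCounter

tokenizer = WhitespaceCounter()
doc = tokenizer.make_doc("The  quick fox\njumps")
for token in doc:
    # Runs of spaces collapse to a single " " token whose width is the run length;
    # "\n" tokens always report a width of 1.
    print(repr(token.text), token._.width)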

components: list[str] property ¤

Return the spaCy pipeline components.

disable: Optional[list[str]] = [] pydantic-field ¤

A list of spaCy pipeline components to disable.

disabled: list[str] property ¤

Return the disabled spaCy pipeline components.

max_length: Optional[int] = 2000000 pydantic-field ¤

The maximum length of the doc.

model: Optional[str] = 'xx_sent_ud_sm' pydantic-field ¤

The name of the spaCy model to be used for tokenization.

pipeline: list[str] property ¤

Return the spaCy pipeline components.

stopwords: Optional[list[str] | str] = [] pydantic-field ¤

A list of stop words to apply to docs.
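
A configuration sketch; the values below are illustrative overrides of the documented defaults:

tokenizer = WhitespaceCounter(
    model="xx_sent_ud_sm",      # any installed spaCy model name
    max_length=3_000_000,       # raise the doc length ceiling for very long texts
    stopwords=["the", "and"],   # stop words to apply to docs
)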

__call__(texts: str | Iterable[str]) -> Doc | Iterable[Doc] ¤

Tokenize a string or an iterable of strings.

Parameters:

    texts (str | Iterable[str], required): The text(s) to be tokenized.

Returns:

    Doc | Iterable[Doc]: The tokenized doc(s).

Source code in lexos/tokenizer/__init__.py
@validate_call
def __call__(self, texts: str | Iterable[str]) -> Doc | Iterable[Doc]:
    """Tokenize a string or an iterable of strings.

    Args:
        texts (str | Iterable[str]): The text(s) to be tokenized.

    Returns:
        Doc | Iterable[Doc]: The tokenized doc(s).
    """
    if isinstance(texts, str):
        return self.make_doc(texts)
    elif isinstance(texts, Iterable):
        return self.make_docs(texts)
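
For example, with an instantiated tokenizer:

doc = tokenizer("A single string returns one Doc.")
docs = list(tokenizer(["First text.", "Second text."]))  # an iterable returns a lazy sequence of Docs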

__init__(**data) -> None ¤

Initialise the Tokenizer class.

Source code in lexos/tokenizer/__init__.py
def __init__(self, **data) -> None:
    """Initialise the Tokenizer class."""
    super().__init__(**data)
    try:
        self.nlp = spacy.load(self.model)
        self.nlp.max_length = self.max_length
    except OSError:
        raise LexosException(
            f"Error loading model {self.model}. Please check the name and try again. You may need to install the model on your system."
        )
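
Because the model is loaded at construction time, a missing model surfaces immediately. A sketch, with an assumed import path for LexosException:

from lexos.exceptions import LexosException  # assumed import path; adjust to where the package defines it

try:
    tokenizer = WhitespaceCounter(model="xx_sent_ud_sm")
except LexosException:
    # Install the model first, e.g. python -m spacy download xx_sent_ud_sm
    raise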

add_extension(name: str, default: str) -> None ¤

Add an extension to the spaCy Token class.

Parameters:

    name (str, required): The name of the extension.
    default (str, required): The default value of the extension.
Source code in lexos/tokenizer/__init__.py
@validate_call
def add_extension(self, name: str, default: str) -> None:
    """Add an extension to the spaCy Token class.

    Args:
        name (str): The name of the extension.
        default (str): The default value of the extension.
    """
    if not Token.has_extension(name):
        Token.set_extension(name, default=default, force=True)
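
A sketch of registering and reading a custom extension; the extension name here is made up for illustration:

tokenizer.add_extension("layer", default="body")
doc = tokenizer.make_doc("Some text")
print(doc[0]._.layer)  # "body" until you assign a different value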

add_stopwords(stopwords: str | list[str]) -> None ¤

Add stopwords to the tokenizer.

Parameters:

    stopwords (str | list[str], required): A list of stopwords to add to the model.
Source code in lexos/tokenizer/__init__.py
@validate_call
def add_stopwords(self, stopwords: str | list[str]) -> None:
    """Add stopwords to the tokenizer.

    Args:
        stopwords (str | Iterable[str]): A list of stopwords to add to the model.
    """
    stopwords = ensure_list(stopwords)
    for term in stopwords:
        self.nlp.vocab[term].is_stop = True
    self.stopwords.extend(stopwords)
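
For example:

tokenizer.add_stopwords(["the", "and"])
doc = tokenizer.make_doc("the cat and the hat")
print([token.text for token in doc if token.is_stop])  # tokens flagged as stop words by the vocab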

make_doc(text: str, max_length: int = None, disable: list[str] = []) -> Doc ¤

Return a doc from a text.

Parameters:

    text (str, required): The text to be parsed.
    max_length (int, default None): The maximum length of the doc.
    disable (list[str], default []): A list of spaCy pipeline components to disable.

Returns:

    Doc: A spaCy doc object.

Source code in lexos/tokenizer/whitespace_counter.py
@validate_call
def make_doc(
    self, text: str, max_length: int = None, disable: list[str] = []
) -> Doc:
    """Return a doc from a text.

    Args:
        text (str): The text to be parsed.
        max_length (int): The maximum length of the doc.
        disable (list[str]): A list of spaCy pipeline components to disable.

    Returns:
        Doc: A spaCy doc object.
    """
    # Override instance settings with keyword arguments
    if max_length:
        self.max_length = max_length
        self.nlp.max_length = max_length
    if disable:
        self.nlp.select_pipes(disable=disable)
    tokens, widths = self._get_token_widths(text)
    if not Token.has_extension("width"):
        Token.set_extension("width", default=0)
    doc = Doc(self.nlp.vocab, words=tokens)
    for token, count in zip(doc, widths):
        token._.width = count
    # Apply pipeline components manually, skipping those in 'disable'
    for name, proc in self.nlp.pipeline:
        if name not in disable:
            doc = proc(doc)
    return doc
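
A sketch showing the keyword overrides; pipeline component names depend on the loaded model, so "senter" below is only an example:

doc = tokenizer.make_doc(
    "Lines\nwith   uneven spacing",
    max_length=1_000_000,
    disable=["senter"],
)
print([(token.text, token._.width) for token in doc])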

make_docs(texts: Iterable[str], max_length: int = None, disable: Iterable[str] = [], chunk_size: int = 1000) -> Iterable[Doc] ¤

Return a generator of docs from an iterable of texts, processing in chunks.

Parameters:

    texts (Iterable[str], required): The texts to process.
    max_length (int, default None): Maximum doc length.
    disable (Iterable[str], default []): Pipeline components to disable.
    chunk_size (int, default 1000): Number of docs to process per chunk.

Yields:

    Doc: spaCy Doc objects.

Source code in lexos/tokenizer/whitespace_counter.py
@validate_call
def make_docs(
    self,
    texts: Iterable[str],
    max_length: int = None,
    disable: Iterable[str] = [],
    chunk_size: int = 1000,
) -> Iterable[Doc]:
    """Return a generator of docs from an iterable of texts, processing in chunks.

    Args:
        texts (Iterable[str]): The texts to process.
        max_length (int, optional): Maximum doc length.
        disable (Iterable[str], optional): Pipeline components to disable.
        chunk_size (int, optional): Number of docs to process per chunk.

    Yields:
        Doc: spaCy Doc objects.
    """
    if max_length:
        self.max_length = max_length
        self.nlp.max_length = max_length

    if not Token.has_extension("width"):
        Token.set_extension("width", default=0)
    enabled_pipes = [
        (name, proc) for name, proc in self.nlp.pipeline if name not in disable
    ]

    def chunker(iterable, size):
        chunk = []
        for item in iterable:
            chunk.append(item)
            if len(chunk) == size:
                yield chunk
                chunk = []
        if chunk:
            yield chunk

    for text_chunk in chunker(texts, chunk_size):
        docs = []
        for text in text_chunk:
            tokens, widths = self._get_token_widths(text)
            doc = Doc(self.nlp.vocab, words=tokens)
            for token, count in zip(doc, widths):
                token._.width = count
            docs.append(doc)
        for _, proc in enabled_pipes:
            docs = [proc(doc) for doc in docs]
        yield from docs
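
A sketch of chunked processing over a small corpus; any iterable of strings works, including a generator:

corpus = ["First document.", "Second  document.", "Third\ndocument."]
for doc in tokenizer.make_docs(corpus, chunk_size=2):
    print(len(doc), sum(token._.width for token in doc))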

remove_extension(name: str) -> None ¤

Remove an extension from the spaCy Token class.

Parameters:

    name (str, required): The name of the extension.
Source code in lexos/tokenizer/__init__.py
@validate_call
def remove_extension(self, name: str) -> None:
    """Remove an extension from the spaCy Token class.

    Args:
        name (str): The name of the extension.
    """
    if Token.has_extension(name):
        Token.remove_extension(name)

remove_stopwords(stopwords: str | list[str]) -> None ¤

Remove stopwords from the tokenizer.

Parameters:

    stopwords (str | list[str], required): A list of stopwords to remove from the model.
Source code in lexos/tokenizer/__init__.py
@validate_call
def remove_stopwords(self, stopwords: str | list[str]) -> None:
    """Remove stopwords from the tokenizer.

    Args:
        stopwords (str | list[str]): A list of stopwords to remove from the model.
    """
    stopwords = ensure_list(stopwords)
    for term in stopwords:
        self.nlp.vocab[term].is_stop = False
    self.stopwords = [word for word in self.stopwords if word not in stopwords]
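
For example, undoing an earlier addition:

tokenizer.remove_stopwords("and")
print("and" in tokenizer.stopwords)  # False once the word has been removed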

nlp: Optional[Language] pydantic-field ¤

The spaCy Language object loaded from the model.

model_config = ConfigDict(arbitrary_types_allowed=True, json_schema_extra=(DocJSONSchema.schema()), validate_assignment=True) class-attribute instance-attribute ¤

_get_token_widths(text: str) -> tuple[list[str], list[int]] ¤

Get the widths of tokens in a doc.

Parameters:

    text (str, required): The input text.

Returns:

    tuple[list[str], list[int]]: A tuple containing the tokens and widths.

Source code in lexos/tokenizer/whitespace_counter.py
def _get_token_widths(self, text: str) -> tuple[list[str], list[int]]:
    """Get the widths of tokens in a doc.

    Args:
        text (str): The input text.

    Returns:
        tuple[list[str], list[int]]: A tuple containing the tokens and widths.
    """
    # Pattern: words, line breaks, or runs of spaces
    pattern = re.compile(r"([^\s\n]+)|(\n)|([ ]{2,})|([ ])")
    tokens = []
    widths = []
    for match in pattern.finditer(text):
        word, newline, multi_space, single_space = match.groups()
        if word:
            tokens.append(word)
            widths.append(len(word))  # Use number of characters in word
        elif newline:
            tokens.append("\n")
            widths.append(1)  # Use 1 to indicate a line break
        elif multi_space:
            tokens.append(" ")
            widths.append(len(multi_space))
        elif single_space:
            tokens.append(" ")
            widths.append(1)
    return tokens, widths
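
Although the method is private, a sketch of the token/width pairing it produces:

tokens, widths = tokenizer._get_token_widths("a  b\nc")
print(tokens)  # ['a', ' ', 'b', '\n', 'c']
print(widths)  # [1, 2, 1, 1, 1]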
