Whitespace Counter Tokenizer ¤

This class inherits from the main Tokenizer class and extends it by capturing line breaks and counting runs of spaces, storing each token's character width in a custom width token extension (token._.width).

WhitespaceCounter pydantic-model ¤

Bases: Tokenizer

Whitespace tokenizer that captures line breaks and counts runs of spaces.

Source code in lexos/tokenizer/whitespace_counter.py
class WhitespaceCounter(Tokenizer):
    """Whitespace tokenizer that captures line breaks and counts runs of spaces."""

    def _get_token_widths(self, text: str) -> tuple[list[str], list[int]]:
        """Get the widths of tokens in a doc.

        Args:
            text (str): The input text.

        Returns:
            tuple[list[str], list[int]]: A tuple containing the tokens and widths.
        """
        # Pattern: words, line breaks, or runs of spaces
        pattern = re.compile(r"([^\s\n]+)|(\n)|([ ]{2,})|([ ])")
        tokens = []
        widths = []
        for match in pattern.finditer(text):
            word, newline, multi_space, single_space = match.groups()
            if word:
                tokens.append(word)
                widths.append(len(word))  # Use number of characters in word
            elif newline:
                tokens.append("\n")
                widths.append(1)  # Use 1 to indicate a line break
            elif multi_space:
                tokens.append(" ")
                widths.append(len(multi_space))
            elif single_space:
                tokens.append(" ")
                widths.append(1)
        return tokens, widths

    @validate_call
    def make_doc(
        self, text: str, max_length: int = None, disable: list[str] = []
    ) -> Doc:
        """Return a doc from a text.

        Args:
            text (str): The text to be parsed.
            max_length (int): The maximum length of the doc.
            disable (list[str]): A list of spaCy pipeline components to disable.

        Returns:
            Doc: A spaCy doc object.
        """
        # Override instance settings with keyword arguments
        if max_length:
            self.max_length = max_length
            self.nlp.max_length = max_length
        if disable:
            self.nlp.select_pipes(disable=disable)
        tokens, widths = self._get_token_widths(text)
        if not Token.has_extension("width"):
            Token.set_extension("width", default=0)
        doc = Doc(self.nlp.vocab, words=tokens)
        for token, count in zip(doc, widths):
            token._.width = count
        # Apply pipeline components manually, skipping those in 'disable'
        for name, proc in self.nlp.pipeline:
            if name not in disable:
                doc = proc(doc)
        return doc

    @validate_call
    def make_docs(
        self,
        texts: Iterable[str],
        max_length: int = None,
        disable: Iterable[str] = [],
        chunk_size: int = 1000,
    ) -> Iterable[Doc]:
        """Return a generator of docs from an iterable of texts, processing in chunks.

        Args:
            texts (Iterable[str]): The texts to process.
            max_length (int, optional): Maximum doc length.
            disable (Iterable[str], optional): Pipeline components to disable.
            chunk_size (int, optional): Number of docs to process per chunk.

        Yields:
            Doc: spaCy Doc objects.
        """
        if max_length:
            self.max_length = max_length
            self.nlp.max_length = max_length

        if not Token.has_extension("width"):
            Token.set_extension("width", default=0)
        enabled_pipes = [
            (name, proc) for name, proc in self.nlp.pipeline if name not in disable
        ]

        def chunker(iterable, size):
            chunk = []
            for item in iterable:
                chunk.append(item)
                if len(chunk) == size:
                    yield chunk
                    chunk = []
            if chunk:
                yield chunk

        for text_chunk in chunker(texts, chunk_size):
            docs = []
            for text in text_chunk:
                tokens, widths = self._get_token_widths(text)
                doc = Doc(self.nlp.vocab, words=tokens)
                for token, count in zip(doc, widths):
                    token._.width = count
                docs.append(doc)
            for _, proc in enabled_pipes:
                docs = [proc(doc) for doc in docs]
            yield from docs
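
A minimal usage sketch (assuming the default xx_sent_ud_sm model is installed and that the class is importable from the module path shown above):

from lexos.tokenizer.whitespace_counter import WhitespaceCounter

tokenizer = WhitespaceCounter()
doc = tokenizer.make_doc("The  quick fox\njumps")
for token in doc:
    # Runs of spaces collapse to a single " " token whose width is the run length;
    # "\n" tokens always report a width of 1.
    print(repr(token.text), token._.width)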

components: list[str] property ¤

Return the spaCy pipeline components.

disable: Optional[list[str]] = [] pydantic-field ¤

A list of spaCy pipeline components to disable.

disabled: list[str] property ¤

Return the disabled spaCy pipeline components.

max_length: Optional[int] = 2000000 pydantic-field ¤

The maximum length of the doc.

model: Optional[str] = 'xx_sent_ud_sm' pydantic-field ¤

The name of the spaCy model to be used for tokenization.

pipeline: list[str] property ¤

Return the spaCy pipeline components.

stopwords: Optional[list[str] | str] = [] pydantic-field ¤

A list of stop words to apply to docs.
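
A configuration sketch; the values below are illustrative overrides of the documented defaults:

tokenizer = WhitespaceCounter(
    model="xx_sent_ud_sm",      # any installed spaCy model name
    max_length=3_000_000,       # raise the doc length ceiling for very long texts
    stopwords=["the", "and"],   # stop words to apply to docs
)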

__call__(texts: str | Iterable[str]) -> Doc | Iterable[Doc] ¤

Tokenize a string or an iterable of strings.

Parameters:

    texts (str | Iterable[str], required): The text(s) to be tokenized.

Returns:

    Doc | Iterable[Doc]: The tokenized doc(s).

Source code in lexos/tokenizer/__init__.py
@validate_call
def __call__(self, texts: str | Iterable[str]) -> Doc | Iterable[Doc]:
    """Tokenize a string or an iterable of strings.

    Args:
        texts (str | Iterable[str]): The text(s) to be tokenized.

    Returns:
        Doc | Iterable[Doc]: The tokenized doc(s).
    """
    if isinstance(texts, str):
        return self.make_doc(texts)
    elif isinstance(texts, Iterable):
        return self.make_docs(texts)
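
For example, with an instantiated tokenizer:

doc = tokenizer("A single string returns one Doc.")
docs = list(tokenizer(["First text.", "Second text."]))  # an iterable returns a lazy sequence of Docs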

__init__(**data) -> None ¤

Initialise the Tokenizer class.

Source code in lexos/tokenizer/__init__.py
def __init__(self, **data) -> None:
    """Initialise the Tokenizer class."""
    super().__init__(**data)
    try:
        self.nlp = spacy.load(self.model)
        self.nlp.max_length = self.max_length
    except OSError:
        raise LexosException(
            f"Error loading model {self.model}. Please check the name and try again. You may need to install the model on your system."
        )
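
Because the model is loaded at construction time, a missing model surfaces immediately. A sketch, with an assumed import path for LexosException:

from lexos.exceptions import LexosException  # assumed import path; adjust to where the package defines it

try:
    tokenizer = WhitespaceCounter(model="xx_sent_ud_sm")
except LexosException:
    # Install the model first, e.g. python -m spacy download xx_sent_ud_sm
    raise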

add_extension(name: str, default: str) -> None ¤

Add an extension to the spaCy Token class.

Parameters:

    name (str, required): The name of the extension.
    default (str, required): The default value of the extension.
Source code in lexos/tokenizer/__init__.py
@validate_call
def add_extension(self, name: str, default: str) -> None:
    """Add an extension to the spaCy Token class.

    Args:
        name (str): The name of the extension.
        default (str): The default value of the extension.
    """
    if not Token.has_extension(name):
        Token.set_extension(name, default=default, force=True)
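
A sketch of registering and reading a custom extension; the extension name here is made up for illustration:

tokenizer.add_extension("layer", default="body")
doc = tokenizer.make_doc("Some text")
print(doc[0]._.layer)  # "body" until you assign a different value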

add_stopwords(stopwords: str | list[str]) -> None ¤

Add stopwords to the tokenizer.

Parameters:

    stopwords (str | list[str], required): A list of stopwords to add to the model.
Source code in lexos/tokenizer/__init__.py
@validate_call
def add_stopwords(self, stopwords: str | list[str]) -> None:
    """Add stopwords to the tokenizer.

    Args:
        stopwords (str | Iterable[str]): A list of stopwords to add to the model.
    """
    stopwords = ensure_list(stopwords)
    for term in stopwords:
        self.nlp.vocab[term].is_stop = True
    self.stopwords.extend(stopwords)
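
For example:

tokenizer.add_stopwords(["the", "and"])
doc = tokenizer.make_doc("the cat and the hat")
print([token.text for token in doc if token.is_stop])  # tokens flagged as stop words by the vocab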

make_doc(text: str, max_length: int = None, disable: list[str] = []) -> Doc ¤

Return a doc from a text.

Parameters:

    text (str, required): The text to be parsed.
    max_length (int, default None): The maximum length of the doc.
    disable (list[str], default []): A list of spaCy pipeline components to disable.

Returns:

    Doc: A spaCy doc object.

Source code in lexos/tokenizer/whitespace_counter.py
@validate_call
def make_doc(
    self, text: str, max_length: int = None, disable: list[str] = []
) -> Doc:
    """Return a doc from a text.

    Args:
        text (str): The text to be parsed.
        max_length (int): The maximum length of the doc.
        disable (list[str]): A list of spaCy pipeline components to disable.

    Returns:
        Doc: A spaCy doc object.
    """
    # Override instance settings with keyword arguments
    if max_length:
        self.max_length = max_length
        self.nlp.max_length = max_length
    if disable:
        self.nlp.select_pipes(disable=disable)
    tokens, widths = self._get_token_widths(text)
    if not Token.has_extension("width"):
        Token.set_extension("width", default=0)
    doc = Doc(self.nlp.vocab, words=tokens)
    for token, count in zip(doc, widths):
        token._.width = count
    # Apply pipeline components manually, skipping those in 'disable'
    for name, proc in self.nlp.pipeline:
        if name not in disable:
            doc = proc(doc)
    return doc
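
A sketch showing the keyword overrides; pipeline component names depend on the loaded model, so "senter" below is only an example:

doc = tokenizer.make_doc(
    "Lines\nwith   uneven spacing",
    max_length=1_000_000,
    disable=["senter"],
)
print([(token.text, token._.width) for token in doc])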

make_docs(texts: Iterable[str], max_length: int = None, disable: Iterable[str] = [], chunk_size: int = 1000) -> Iterable[Doc] ¤

Return a generator of docs from an iterable of texts, processing in chunks.

Parameters:

    texts (Iterable[str], required): The texts to process.
    max_length (int, default None): Maximum doc length.
    disable (Iterable[str], default []): Pipeline components to disable.
    chunk_size (int, default 1000): Number of docs to process per chunk.

Yields:

    Doc: spaCy Doc objects.

Source code in lexos/tokenizer/whitespace_counter.py
@validate_call
def make_docs(
    self,
    texts: Iterable[str],
    max_length: int = None,
    disable: Iterable[str] = [],
    chunk_size: int = 1000,
) -> Iterable[Doc]:
    """Return a generator of docs from an iterable of texts, processing in chunks.

    Args:
        texts (Iterable[str]): The texts to process.
        max_length (int, optional): Maximum doc length.
        disable (Iterable[str], optional): Pipeline components to disable.
        chunk_size (int, optional): Number of docs to process per chunk.

    Yields:
        Doc: spaCy Doc objects.
    """
    if max_length:
        self.max_length = max_length
        self.nlp.max_length = max_length

    if not Token.has_extension("width"):
        Token.set_extension("width", default=0)
    enabled_pipes = [
        (name, proc) for name, proc in self.nlp.pipeline if name not in disable
    ]

    def chunker(iterable, size):
        chunk = []
        for item in iterable:
            chunk.append(item)
            if len(chunk) == size:
                yield chunk
                chunk = []
        if chunk:
            yield chunk

    for text_chunk in chunker(texts, chunk_size):
        docs = []
        for text in text_chunk:
            tokens, widths = self._get_token_widths(text)
            doc = Doc(self.nlp.vocab, words=tokens)
            for token, count in zip(doc, widths):
                token._.width = count
            docs.append(doc)
        for _, proc in enabled_pipes:
            docs = [proc(doc) for doc in docs]
        yield from docs
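
A sketch of chunked processing over a small corpus; any iterable of strings works, including a generator:

corpus = ["First document.", "Second  document.", "Third\ndocument."]
for doc in tokenizer.make_docs(corpus, chunk_size=2):
    print(len(doc), sum(token._.width for token in doc))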

remove_extension(name: str) -> None ¤

Remove an extension from the spaCy Token class.

Parameters:

    name (str, required): The name of the extension.
Source code in lexos/tokenizer/__init__.py
@validate_call
def remove_extension(self, name: str) -> None:
    """Remove an extension from the spaCy Token class.

    Args:
        name (str): The name of the extension.
    """
    if Token.has_extension(name):
        Token.remove_extension(name)

remove_stopwords(stopwords: str | list[str]) -> None ¤

Remove stopwords from the tokenizer.

Parameters:

    stopwords (str | list[str], required): A list of stopwords to remove from the model.
Source code in lexos/tokenizer/__init__.py
@validate_call
def remove_stopwords(self, stopwords: str | list[str]) -> None:
    """Remove stopwords from the tokenizer.

    Args:
        stopwords (str | list[str]): A list of stopwords to remove from the model.
    """
    stopwords = ensure_list(stopwords)
    for term in stopwords:
        self.nlp.vocab[term].is_stop = False
    self.stopwords = [word for word in self.stopwords if word not in stopwords]
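
For example, undoing an earlier addition:

tokenizer.remove_stopwords("and")
print("and" in tokenizer.stopwords)  # False once the word has been removed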

nlp: Optional[Language] pydantic-field ¤

The spaCy Language object loaded from the model.

model_config = ConfigDict(arbitrary_types_allowed=True, json_schema_extra=(DocJSONSchema.schema()), validate_assignment=True) class-attribute instance-attribute ¤

_get_token_widths(text: str) -> tuple[list[str], list[int]] ¤

Get the widths of tokens in a doc.

Parameters:

    text (str, required): The input text.

Returns:

    tuple[list[str], list[int]]: A tuple containing the tokens and widths.

Source code in lexos/tokenizer/whitespace_counter.py
def _get_token_widths(self, text: str) -> tuple[list[str], list[int]]:
    """Get the widths of tokens in a doc.

    Args:
        text (str): The input text.

    Returns:
        tuple[list[str], list[int]]: A tuple containing the tokens and widths.
    """
    # Pattern: words, line breaks, or runs of spaces
    pattern = re.compile(r"([^\s\n]+)|(\n)|([ ]{2,})|([ ])")
    tokens = []
    widths = []
    for match in pattern.finditer(text):
        word, newline, multi_space, single_space = match.groups()
        if word:
            tokens.append(word)
            widths.append(len(word))  # Use number of characters in word
        elif newline:
            tokens.append("\n")
            widths.append(1)  # Use 1 to indicate a line break
        elif multi_space:
            tokens.append(" ")
            widths.append(len(multi_space))
        elif single_space:
            tokens.append(" ")
            widths.append(1)
    return tokens, widths
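
Although the method is private, a sketch of the token/width pairing it produces:

tokens, widths = tokenizer._get_token_widths("a  b\nc")
print(tokens)  # ['a', ' ', 'b', '\n', 'c']
print(widths)  # [1, 2, 1, 1, 1]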
