Tokenizer

The Tokenizer uses spaCy to convert texts into spaCy docs using one of the functions below. If no language model is specified, spaCy's multilingual xx_sent_ud_sm model is used.
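
For example, a minimal usage sketch (assuming the lexos package and the xx_sent_ud_sm model are installed; the sample texts are illustrative):

from lexos import tokenizer

# Tokenise a single text with the default multilingual model
doc = tokenizer.make_doc("The end is nigh.")

# Tokenise several texts at once
docs = tokenizer.make_docs(["The end is nigh.", "The end is not nigh."])

# Each result is a spaCy Doc, so tokens can be inspected directly
print([token.text for token in doc])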

lexos.tokenizer._add_remove_stopwords(nlp, add_stopwords, remove_stopwords)

Add and remove stopwords from the model.

Parameters:

    nlp (spacy.Language): The language model to add stopwords to. Required.
    add_stopwords (Union[List[str], str]): A list of stopwords to add to the model. Required.
    remove_stopwords (Union[bool, List[str], str]): A list of stopwords to remove from the model, or True to remove all stopwords. Required.

Returns:

    spacy.Language: The model with stopwords added and removed.

Source code in lexos/tokenizer/__init__.py
def _add_remove_stopwords(
    nlp: spacy.Language,
    add_stopwords: Union[List[str], str],
    remove_stopwords: Union[bool, List[str], str],
) -> spacy.Language:
    """Add and remove stopwords from the model.

    Args:
        nlp (spacy.Language): The language model to add stopwords to.
        add_stopwords (Union[List[str], str]): A list of stopwords to add to the model.
        remove_stopwords (Union[bool, List[str], str]): A list of stopwords to remove from the
                                                        model, or `True` to remove all stopwords.

    Returns:
        spacy.Language: The model with stopwords added and removed.
    """
    if add_stopwords:
        if not isinstance(add_stopwords, list):
            add_stopwords = [add_stopwords]
        for term in add_stopwords:
            nlp.vocab[term].is_stop = True
    if remove_stopwords:
        if remove_stopwords is True:
            for term in nlp.vocab:
                if term.is_stop:
                    term.is_stop = False
        else:
            if not isinstance(remove_stopwords, list):
                remove_stopwords = [remove_stopwords]
            for term in remove_stopwords:
                nlp.vocab[term].is_stop = False
    return nlp
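
Although this is a private helper, a brief sketch of its effect may help (assuming the xx_sent_ud_sm model is installed; the words are illustrative):

import spacy

nlp = spacy.load("xx_sent_ud_sm")
nlp = _add_remove_stopwords(nlp, add_stopwords=["nigh"], remove_stopwords=["the"])

print(nlp.vocab["nigh"].is_stop)  # True: now flagged as a stop word
print(nlp.vocab["the"].is_stop)   # False: no longer flagged as a stop word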

lexos.tokenizer._get_disabled_components(disable=None, pipeline_components=None)

Get a list of components to disable in the pipeline.

Source code in lexos/tokenizer/__init__.py
def _get_disabled_components(
    disable: List[str] = None, pipeline_components: dict = None
) -> List[str]:
    """Get a list of components to disable in the pipeline."""
    if disable is None:
        disable = []
    custom_disable = []
    if pipeline_components and "disable" in pipeline_components:
        for component in pipeline_components["disable"]:
            custom_disable.append(component)
    disable.extend(custom_disable)
    return list(set(disable))

lexos.tokenizer._get_excluded_components(exclude=None, pipeline_components=None)

Get a list of components to exclude from the pipeline.

Source code in lexos/tokenizer/__init__.py
def _get_excluded_components(
    exclude: List[str] = None, pipeline_components: dict = None
) -> List[str]:
    """Get a list of components to exclude from the pipeline."""
    if exclude is None:
        exclude = []
    custom_exclude = []
    if pipeline_components and "exclude" in pipeline_components:
        for component in pipeline_components["exclude"]:
            custom_exclude.append(component)
    exclude.extend(custom_exclude)
    return list(set(exclude))
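
A small sketch of how these two helpers merge the keyword arguments with the "disable" and "exclude" entries of a pipeline_components dict (the dict is illustrative):

pipeline_components = {"disable": ["ner"], "exclude": ["parser"]}

print(_get_disabled_components(["tagger"], pipeline_components))
# e.g. ['tagger', 'ner'] (order may vary because duplicates are removed with a set)

print(_get_excluded_components(None, pipeline_components))
# e.g. ['parser']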

lexos.tokenizer._load_model(model, disable=None, exclude=None)

Load a model from a file.

Parameters:

    model (str): The path to the model. Required.
    disable (List[str]): A list of spaCy pipeline components to disable. Default: None.
    exclude (List[str]): A list of spaCy pipeline components to exclude. Default: None.

Returns:

    object: The loaded model.

Note

Attempts to disable or exclude components not found in the pipeline are ignored without raising an error.

Source code in lexos/tokenizer/__init__.py
def _load_model(
    model: str, disable: List[str] = None, exclude: List[str] = None
) -> object:
    """Load a model from a file.

    Args:
        model (str): The path to the model.
        disable (List[str]): A list of spaCy pipeline components to disable.
        exclude (List[str]): A list of spaCy pipeline components to exclude.

    Returns:
        object: The loaded model.

    Note:
        Attempts to disable or exclude components not found in the pipeline are
        ignored without raising an error.
    """
    try:
        return spacy.load(model, disable=disable, exclude=exclude)
    except Exception:
        raise LexosException(
            f"Error loading model {model}. Please check the name and try again. You may need to install the model on your system."
        )
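
For instance, a hedged sketch of loading the default model with a component switched off (the component name is illustrative and depends on the model's pipeline):

nlp = _load_model("xx_sent_ud_sm", disable=["senter"])
print(nlp.pipe_names)  # disabled components no longer appear among the active pipes

# A model that is not installed raises a LexosException:
# _load_model("no_such_model")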

lexos.tokenizer._validate_input(input)

Ensure that input is a string, Doc, or bytes.

Parameters:

    input (Any): The input to be tested. Required.

Returns:

    bool: True if the input is valid.

Raises:

    LexosException: If the input is not valid.

Source code in lexos/tokenizer/__init__.py
def _validate_input(input: Any) -> bool:
    """Ensure that input is a string, Doc, or bytes.

    Args:
        input (Any): The input to be tested.

    Returns:
        bool: True if the input is valid.

    Raises:
        LexosException (Exception): Raise an error if the input is not valid.
    """
    if not isinstance(input, list):
        input = [input]
    for item in input:
        if not isinstance(item, (str, spacy.tokens.doc.Doc, bytes)):
            message = f"Error reading {item}. {LANG['format_error']} {str(type(item))}"
            raise LexosException(message)
    return True
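
A quick sketch of the validation behaviour:

_validate_input("A plain string passes.")                 # returns True
_validate_input([b"so do bytes", "and lists of either"])  # returns True

try:
    _validate_input(42)  # an int is not a str, Doc, or bytes
except Exception as err:  # raises LexosException
    print(err)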

lexos.tokenizer.doc_from_ngrams(ngrams, model='xx_sent_ud_sm', strict=False, disable=[], exclude=[])

Generate a spaCy doc from a list of ngrams.

Parameters:

    ngrams (list): A list of ngrams. Required.
    model (object): The language model to use for tokenisation. Default: 'xx_sent_ud_sm'.
    strict (bool): Whether to preserve the token divisions, including any whitespace, of the source list. Default: False.
    disable (List[str]): A list of spaCy pipeline components to disable. Default: [].
    exclude (List[str]): A list of spaCy pipeline components to exclude. Default: [].

Returns:

    object: A spaCy doc.

Notes

The strict=False setting will allow spaCy's language model to remove whitespace from ngrams and split punctuation into separate tokens. strict=True will preserve the sequences in the source list.

Source code in lexos/tokenizer/__init__.py
def doc_from_ngrams(
    ngrams: list,
    model="xx_sent_ud_sm",
    strict: bool = False,
    disable: List[str] = [],
    exclude: List[str] = [],
) -> object:
    """Generate a spaCy doc from a list of ngrams.

    Args:
        ngrams (list): A list of ngrams.
        model (object): The language model to use for tokenisation.
        strict (bool): Whether to preserve the token divisions, including any whitespace, of the source list.
        disable (List[str]): A list of spaCy pipeline components to disable.
        exclude (List[str]): A list of spaCy pipeline components to exclude.

    Returns:
        object: A spaCy doc

    Notes:
        The `strict=False` setting will allow spaCy's language model to remove whitespace from
        ngrams and split punctuation into separate tokens. `strict=True` will preserve the
        sequences in the source list.
    """
    nlp = _load_model(model, disable=disable, exclude=exclude)
    if strict:
        words = [token for token in ngrams if token != ""]
        spaces = [False] * len(words)
        doc = spacy.tokens.doc.Doc(nlp.vocab, words=words, spaces=spaces)
        # Run the standard pipeline against the doc
        for _, proc in nlp.pipeline:
            doc = proc(doc)
        return doc
    else:
        text = " ".join([x.replace(" ", "") for x in ngrams])
        return nlp(text)
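
For example, a sketch contrasting the two modes (assuming the default model is installed; the ngrams are illustrative):

ngrams = ["the end", "end is", "is nigh"]

# strict=False: the ngrams are joined, internal whitespace is stripped, and the
# resulting text is retokenised by the language model
loose_doc = doc_from_ngrams(ngrams, strict=False)
print([token.text for token in loose_doc])

# strict=True: each ngram is kept as a single token
strict_doc = doc_from_ngrams(ngrams, strict=True)
print([token.text for token in strict_doc])  # e.g. ['the end', 'end is', 'is nigh']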

lexos.tokenizer.docs_from_ngrams(ngrams, model='xx_sent_ud_sm', strict=False, disable=[], exclude=[])

Generate a list of spaCy docs from a list of ngram lists.

Parameters:

    ngrams (List[list]): A list of ngram lists. Required.
    model (object): The language model to use for tokenisation. Default: 'xx_sent_ud_sm'.
    strict (bool): Whether to preserve the token divisions, including any whitespace, of the source lists. Default: False.
    disable (List[str]): A list of spaCy pipeline components to disable. Default: [].
    exclude (List[str]): A list of spaCy pipeline components to exclude. Default: [].

Returns:

    List[object]: A list of spaCy docs.

Source code in lexos/tokenizer/__init__.py
def docs_from_ngrams(
    ngrams: List[list],
    model="xx_sent_ud_sm",
    strict=False,
    disable: List[str] = [],
    exclude: List[str] = [],
) -> List[object]:
    """Generate a list of spaCy docs from a list of ngram lists.

    Args:
        ngrams (List[list]): A list of ngram lists.
        model (object): The language model to use for tokenisation.
        strict (bool): Whether to preserve the token divisions, including any whitespace, of the source lists.
        disable (List[str]): A list of spaCy pipeline components to disable.
        exclude (List[str]): A list of spaCy pipeline components to exclude.

    Returns:
        List[object]: A list of spaCy docs
    """
    docs = []
    for ngram_list in ngrams:
        doc = doc_from_ngrams(
            ngram_list, model, strict, disable=disable, exclude=exclude
        )
        docs.append(doc)
    return docs
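
A brief usage sketch (the ngram lists are illustrative):

ngram_lists = [["the end", "end is"], ["is nigh", "nigh again"]]
docs = docs_from_ngrams(ngram_lists, strict=True)
print(len(docs))  # 2: one doc per ngram list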

lexos.tokenizer.generate_character_ngrams(text, size=1, drop_whitespace=True)

Generate character n-grams from raw text.

Parameters:

    text (str): The source text. Required.
    size (int): The size of the ngram. Default: 1.
    drop_whitespace (bool): Whether to drop whitespace from the ngram list. Default: True.

Returns:

    List[str]: A list of ngrams.

Source code in lexos/tokenizer/__init__.py
def generate_character_ngrams(
    text: str, size: int = 1, drop_whitespace: bool = True
) -> List[str]:
    """Generate character n-grams from raw text.

    Args:
        text (str): The source text.
        size (int): The size of the ngram.
        drop_whitespace (bool): Whether to drop whitespace from the ngram list.

    Returns:
        List[str]: A list of ngrams
    """
    from textwrap import wrap

    return wrap(text, size, drop_whitespace=drop_whitespace)
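
Because the function wraps textwrap.wrap, the ngrams are consecutive (non-overlapping) character chunks rather than a sliding window. A quick sketch:

print(generate_character_ngrams("lexos", 2))
# e.g. ['le', 'xo', 's']

print(generate_character_ngrams("lexos rocks", 5))
# e.g. ['lexos', 'rocks'] -- the whitespace between the chunks is dropped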

lexos.tokenizer.make_doc(text, model='xx_sent_ud_sm', max_length=2000000, disable=[], exclude=[], add_stopwords=[], remove_stopwords=[], pipeline_components=[])

Return a doc from a text.

Parameters:

    text (str): The text to be parsed. Required.
    model (object): The model to be used. Default: 'xx_sent_ud_sm'.
    max_length (int): The maximum length of the doc. Default: 2000000.
    disable (List[str]): A list of spaCy pipeline components to disable. Default: [].
    exclude (List[str]): A list of spaCy pipeline components to exclude. Default: [].
    add_stopwords (Union[List[str], str]): A list of stop words to add to the model. Default: [].
    remove_stopwords (Union[bool, List[str], str]): A list of stop words to remove from the model, or True to remove all stop words. Default: [].
    pipeline_components (List[dict]): A list of custom component dicts to add to the pipeline. See https://spacy.io/api/language/#add_pipe for more information. Default: [].

Returns:

    object: A spaCy doc object.

Source code in lexos/tokenizer/__init__.py
def make_doc(
    text: str,
    model: object = "xx_sent_ud_sm",
    max_length: int = 2000000,
    disable: List[str] = [],
    exclude: List[str] = [],
    add_stopwords: Union[List[str], str] = [],
    remove_stopwords: Union[bool, List[str], str] = [],
    pipeline_components: List[dict] = [],
) -> object:
    """Return a doc from a text.

    Args:
        text (str): The text to be parsed.
        model (object): The model to be used.
        max_length (int): The maximum length of the doc.
        disable (List[str]): A list of spaCy pipeline components to disable.
        exclude (List[str]): A list of spaCy pipeline components to exclude.
        add_stopwords (Union[List[str], str]): A list of stop words to add to the model.
        remove_stopwords (Union[bool, List[str], str]): A list of stop words to remove
            from the model. If `True` is specified, all stop words will be removed.
        pipeline_components (List[dict]): A list of custom component dicts to add
            to the pipeline. See https://spacy.io/api/language/#add_pipe for
            more information.

    Returns:
        object: A spaCy doc object.
    """
    _validate_input(text)
    disable = _get_disabled_components(disable, pipeline_components)
    exclude = _get_excluded_components(exclude, pipeline_components)
    nlp = _load_model(model, disable=disable, exclude=exclude)
    nlp.max_length = max_length
    _add_remove_stopwords(nlp, add_stopwords, remove_stopwords)
    if pipeline_components and "custom" in pipeline_components:
        for component in pipeline_components["custom"]:
            nlp.add_pipe(**component)
    return nlp(text)
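
A short sketch of the stop word options (assuming the default model is installed; the words are illustrative). Custom components can likewise be appended by passing pipeline_components, whose "custom" entries are forwarded to nlp.add_pipe:

doc = make_doc(
    "The end is nigh.",
    add_stopwords=["nigh"],
    remove_stopwords=["end"],
)
print([(token.text, token.is_stop) for token in doc])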

lexos.tokenizer.make_docs(texts, model='xx_sent_ud_sm', max_length=2000000, disable=[], exclude=[], add_stopwords=[], remove_stopwords=[], pipeline_components=[])

Return a list of docs from a text or list of texts.

Parameters:

    texts (Union[List[str], str]): The text(s) to be parsed. Required.
    model (object): The model to be used. Default: 'xx_sent_ud_sm'.
    max_length (int): The maximum length of the doc. Default: 2000000.
    disable (List[str]): A list of spaCy pipeline components to disable. Default: [].
    exclude (List[str]): A list of spaCy pipeline components to exclude. Default: [].
    add_stopwords (Union[List[str], str]): A list of stop words to add to the model. Default: [].
    remove_stopwords (Union[bool, List[str], str]): A list of stop words to remove from the model, or True to remove all stop words. Default: [].
    pipeline_components (List[dict]): A list of custom component dicts to add to the pipeline. See https://spacy.io/api/language/#add_pipe for more information. Default: [].

Returns:

    List[object]: A list of spaCy doc objects.

Source code in lexos/tokenizer/__init__.py
def make_docs(
    texts: Union[List[str], str],
    model: object = "xx_sent_ud_sm",
    max_length: int = 2000000,
    disable: List[str] = [],
    exclude: List[str] = [],
    add_stopwords: Union[List[str], str] = [],
    remove_stopwords: Union[bool, List[str], str] = [],
    pipeline_components: List[dict] = [],
) -> List[object]:
    """Return a list of docs from a text or list of texts.

    Args:
        texts (Union[List[str], str]): The text(s) to be parsed.
        model (object): The model to be used.
        max_length (int): The maximum length of the doc.
        disable (List[str]): A list of spaCy pipeline components to disable.
        exclude (List[str]): A list of spaCy pipeline components to exclude.
        add_stopwords (Union[List[str], str]): A list of stop words to add to the model.
        remove_stopwords (Union[bool, List[str], str]): A list of stop words to remove
            from the model. If `True` is specified, all stop words will be removed.
        pipeline_components (List[dict]): A list of custom component dicts to add
            to the pipeline. See https://spacy.io/api/language/#add_pipe for
            more information.

    Returns:
        List[object]: A list of spaCy doc objects.
    """
    if _validate_input(texts):
        disable = _get_disabled_components(disable, pipeline_components)
        exclude = _get_excluded_components(exclude, pipeline_components)
        nlp = _load_model(model, disable=disable, exclude=exclude)
        nlp.max_length = max_length
        _add_remove_stopwords(nlp, add_stopwords, remove_stopwords)
        if pipeline_components and "custom" in pipeline_components:
            for component in pipeline_components["custom"]:
                nlp.add_pipe(**component)
        return list(nlp.pipe(utils.ensure_list(texts)))
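
A short sketch (assuming the default model is installed; the texts are illustrative):

texts = ["The end is nigh.", "The end is not nigh."]
docs = make_docs(texts, add_stopwords=["nigh"])
for doc in docs:
    print([token.text for token in doc])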

lexos.tokenizer.ngrams_from_doc(doc, size=2)

Generate a list of ngrams from a spaCy doc.

A wrapper for textacy.extract.basics.ngrams with basic functionality. Further functionality can be accessed by calling textacy directly.

Parameters:

    doc (object): A spaCy doc. Required.
    size (int): The size of the ngrams. Default: 2.

Returns:

    List[str]: A list of ngrams.

Source code in lexos/tokenizer/__init__.py
def ngrams_from_doc(doc: object, size: int = 2) -> List[str]:
    """Generate a list of ngrams from a spaCy doc.

    A wrapper for `textacy.extract.basics.ngrams` with basic functionality.
    Further functionality can be accessed by calling `textacy` directly.

    Args:
        doc (object): A spaCy doc
        size (int): The size of the ngrams.

    Returns:
        List[str]: A list of ngrams.
    """
    from textacy.extract.basics import ngrams as textacy_ngrams

    if size < 1:
        raise LexosException("The ngram size must be greater than 0.")
    ngrams = list(textacy_ngrams(doc, size, min_freq=1))
    # Ensure plain strings (not spaCy spans) are returned
    return [token.text for token in ngrams]
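
For example, a sketch that builds a doc and extracts token bigrams (the output is only a guide; results depend on the model, and textacy's defaults may filter ngrams containing stop words or punctuation):

doc = make_doc("The quick brown fox jumps over the lazy dog.")
print(ngrams_from_doc(doc, size=2))
# e.g. ['quick brown', 'brown fox', ...]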