Machete

The Machete class allows the user to cut raw text strings. Documents can be split into a pre-determined number of segments, into chunks containing a fixed number of tokens (using a non-language-aware tokenizer), from pre-defined token lists, or on patterns defined as milestones.
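
A minimal usage sketch (the sample text, token counts, and milestone pattern are illustrative; the import path follows the module path documented below):

from lexos.cutter.machete import Machete

machete = Machete()  # defaults to the "whitespace" tokenizer

text = "The quick brown fox jumps over the lazy dog. " * 200

# Segments of at most 50 tokens each
fixed_size = machete.split(text, n=50)

# Exactly five segments of roughly equal length
five_parts = machete.splitn(text, n=5)

# Segments cut wherever the milestone pattern occurs
chapters = machete.split_on_milestones(
    "CHAPTER I It was the best of times. CHAPTER II It was the worst of times.",
    milestone="CHAPTER",
)

# Each method returns one list of segments per input text
print(len(fixed_size[0]), len(five_parts[0]), len(chapters[0]))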

lexos.cutter.machete.Machete

Codename Machete.

Source code in lexos\cutter\machete.py
class Machete:
    """Codename Machete."""

    def __init__(self, tokenizer: str = "whitespace"):
        """Initialize the class."""
        self.tokenizer = tokenizer

    def _chunk_tokens(self, tokens: list, n: int = 1000) -> Callable:
        """Yield successive n-sized chunks from a list by a fixed number of tokens.

        Args:
            tokens (list): A list of tokens.
            n (int): The number of tokens to split on.

        Returns:
            list: A list of token lists (segments).
        """
        for i in range(0, len(tokens), n):
            yield tokens[i : i + n]

    def _create_overlapping_segments(
        self, segments: List[str], overlap: int
    ) -> List[str]:
        """Create overlapping segments.

        Args:
            segments (List[str]): A list of token strings.
            overlap (int): The number of tokens to overlap.

        Returns:
            List[str]: A list of token strings.
        """
        overlapped_segs = []
        for i, seg in enumerate(segments):
            if i == 0:
                # Get the first overlap tokens from the second segment
                overlapped_segs.append(seg + segments[i + 1][:overlap])
            else:
                if i < len(segments) - 1:
                    # Get the last overlap tokens from the previous segment
                    overlapped_segs.append(seg + segments[i + 1][:overlap])
                else:
                    # Get the last segment
                    overlapped_segs.append(seg)
        return overlapped_segs

    def _tokenize(self, text: str, tokenizer: str = None) -> list:
        """Tokenize an input string without a language model.

        Loads a tokenizer function from the registry.

        Args:
            text (str): The input string.

        Returns:
            list: A list of tokens.
        """
        if not tokenizer:
            tokenizer = registry.load(self.tokenizer)
        else:
            try:
                tokenizer = registry.load(tokenizer)
            except ValueError:
                raise LexosException(
                    "The specified tokenizer could not be found in the tokenizer registry."
                )
        return tokenizer(text)

    def merge(self, segments: List[str], sep=" ") -> str:
        """Merge a list of segments into a single string.

        Args:
            segments (List[str]): The list of segments to merge.
            sep (str): The separator to use.

        Returns:
            str: The merged string.
        """
        return sep.join(segments)

    def split(
        self,
        texts: Union[List[str], str],
        n=1000,
        merge_threshold: float = 0.5,
        overlap: int = None,
        tokenizer: str = None,
        as_string: bool = True,
    ) -> list:
        """Split texts into chunks by a fixed number of tokens.

        Args:
            texts (Union[List[str], str]): A text string or list of text strings.
            n (int): The number of tokens to split on.
            merge_threshold (float): The threshold to merge the last segment.
            overlap (int): The number of tokens to overlap.
            tokenizer (str): The name of the tokenizer function to use.
            as_string (bool): Whether to return the segments as a list of strings.

        Returns:
            list: A list of lists or strings (segments) for each text.
        """
        # Validate input
        try:
            model = SplitModel(
                texts=texts,
                n=n,
                merge_threshold=merge_threshold,
                overlap=overlap,
                tokenizer=tokenizer,
                as_string=as_string,
            )
        except Exception as e:
            raise LexosException(e)

        # Ensure a list of texts as the starting point
        if not isinstance(model.texts, list):
            model.texts = [model.texts]

        # Process the texts into segments
        all_segments = []
        for text in model.texts:
            # Tokenise the text
            tokens = self._tokenize(text, tokenizer=model.tokenizer)
            segments = list(self._chunk_tokens(tokens, model.n))
            # Apply the merge threshold
            if len(segments[-1]) < model.n * model.merge_threshold:
                last_seg = segments.pop(-1)
                # Combine the last two segments into a single list
                segments[-1] = segments[-1] + last_seg
            all_segments.append(segments)
        if overlap:
            all_segments = [
                self._create_overlapping_segments(segment, overlap)
                for segment in all_segments
            ]
        if as_string:
            all_segments = [
                ["".join(segment) for segment in text] for text in all_segments
            ]
        return all_segments

    def splitn(
        self,
        texts: Union[List[str], str],
        n: int = 2,
        merge_threshold: float = 0.5,
        overlap: int = None,
        tokenizer: str = None,
        as_string: bool = True,
    ) -> list:
        """Get a specific number of sequential segments from a string or list of strings.

        Args:
            texts (Union[List[str], str]): A text string or list of text strings.
            n (int): The number of segments to create. Calculated automatically.
            merge_threshold (float): The threshold to merge the last segment.
            overlap (int): The number of tokens to overlap.
            tokenizer (str): The name of the tokenizer function to use.
            as_string (bool): Whether to return the segments as a list of strings.

        Returns:
            list: A list of lists or strings (segments) for each text.

        Note:
            For this implementation, see https://stackoverflow.com/a/54802737.
        """
        # Validate input
        try:
            model = SplitModel(
                texts=texts,
                n=n,
                merge_threshold=merge_threshold,
                overlap=overlap,
                tokenizer=tokenizer,
                as_string=as_string,
            )
        except Exception as e:
            raise LexosException(e)

        # Ensure a list of texts as the starting point
        if not isinstance(model.texts, list):
            model.texts = [model.texts]

        # Process the texts into segments
        all_segments = []
        for text in model.texts:

            # Tokenise the text
            tokens = self._tokenize(text, tokenizer=model.tokenizer)

            # Get the number of tokens per segment (d) and the remaining tokens (r)
            d, r = divmod(len(tokens), model.n)

            # Get the segments
            segments = []
            for i in range(model.n):
                index = (d + 1) * (i if i < r else r) + d * (0 if i < r else i - r)
                segments.append(tokens[index : index + (d + 1 if i < r else d)])
                # Apply the merge threshold
                if len(segments[-1]) < model.n * model.merge_threshold:
                    last_seg = segments.pop(-1)
                    # Combine the last two segments into a single list
                    segments[-1] = segments[-1] + last_seg
            all_segments.append(segments)
            if overlap:
                all_segments = [
                    self._create_overlapping_segments(segment, model.overlap)
                    for segment in all_segments
                ]
            if as_string:
                all_segments = [
                    ["".join(segment) for segment in text] for text in all_segments
                ]
        return all_segments

    def split_list(
        self,
        text: List[str],
        n: int = 1000,
        merge_threshold: float = 0.5,
        overlap: int = None,
        as_string: bool = False,
    ) -> list:
        """Split a list into chunks by a fixed number of tokens.

        Args:
            text (List[str]): A list of tokens.
            n (int): The number of tokens to split on.
            merge_threshold (float): The threshold to merge the last segment.
            overlap (int): The number of tokens to overlap.
            as_string (bool): Whether to return the segments as a list of strings.

        Returns:
            list: A list of token lists, one token list for each segment.
        """
        # Validate input
        try:
            model = SplitListModel(
                text=text,
                n=n,
                merge_threshold=merge_threshold,
                overlap=overlap,
                as_string=as_string,
            )
        except Exception as e:
            raise LexosException(e)

        # Ensure a list of texts as the starting point
        if isinstance(model.text[0], str):
            model.text = [model.text]

        # Process the texts into segments
        all_segments = []
        for text in model.text:
            segments = list(self._chunk_tokens(text, model.n))
            # Apply the merge threshold
            if len(segments[-1]) < model.n * model.merge_threshold:
                last_seg = segments.pop(-1)
                # Combine the last two segments into a single list
                segments[-1] = [segments[-1] + last_seg]
            all_segments.append(segments)
            if overlap:
                all_segments = [
                    self._create_overlapping_segments(segment, model.overlap)
                    for segment in all_segments
                ]
            if as_string:
                all_segments = [
                    ["".join(segment) for segment in text] for text in all_segments
                ]
        return all_segments

    def split_on_milestones(
        self,
        texts: Union[List[str], str],
        milestone: Union[dict, str],
        preserve_milestones: bool = True,
        tokenizer: str = None,
        as_string: bool = True,
    ) -> list:
        """Split texts on milestones.

        Args:
            texts (Union[List[str], str]): A text string or list of text strings.
            milestone (Union[dict, str]): A variable representing the value(s) to be matched.
            preserve_milestones (bool): If True, the milestone token will be preserved at the
                beginning of every segment. Otherwise, it will be deleted.
            tokenizer (str): The name of the tokenizer function to use.
            as_string (bool): Whether to return the segments as a list of strings.

        Returns:
            list: A list of lists or strings (segments) for each text.

        Note:
            The choice of tokenizer can lead to some unexpected results with regard to spacing
            around the milestone. The default behaviour is to delete the milestone and any
            following whitespace. If milestones are preserved, the milestone will occur at the
            beginning of the following segment and will be followed by a single space. If the
            segments are returned with `as_string=False`, each token will have a following space
            and it will be up to the end user to remove the space if desired.
        """
        # Validate input
        try:
            model = SplitMilestoneModel(
                texts=texts,
                milestone=milestone,
                preserve_milestones=preserve_milestones,
                tokenizer=tokenizer,
                as_string=as_string,
            )
        except Exception as e:
            raise LexosException(e)

        # Ensure a list of texts as the starting point
        if not isinstance(model.texts, list):
            model.texts = [model.texts]

        # Process the texts into segments
        all_segments = []
        milestone_pat = re.compile(milestone)
        for text in model.texts:
            cut_on_milestone = []
            seg = []
            # Tokenise the text
            tokens = self._tokenize(text, tokenizer=model.tokenizer)
            for i, token in enumerate(tokens):
                if re.match(
                    milestone_pat, token.strip()
                ):  # token.strip() == milestone:
                    cut_on_milestone.append(seg)
                    j = i
                    if preserve_milestones:
                        seg = [f"{milestone} "]
                    else:
                        seg = []
                else:
                    seg.append(token)
            # Add the last segment
            cut_on_milestone.append(tokens[j + 1 :])
            all_segments.append(cut_on_milestone)
        if as_string:
            all_segments = [
                ["".join(segment) for segment in text] for text in all_segments
            ]
        # If no milestone was found, return the original texts
        if len(all_segments) == 1 and all_segments[0] == []:
            return [model.texts]
        else:
            return all_segments

__init__(tokenizer='whitespace')

Initialize the class.

Source code in lexos\cutter\machete.py
def __init__(self, tokenizer: str = "whitespace"):
    """Initialize the class."""
    self.tokenizer = tokenizer

merge(segments, sep=' ')

Merge a list of segments into a single string.

Parameters:

segments (List[str]): The list of segments to merge. Required.
sep (str): The separator to use. Default: ' '.

Returns:

str: The merged string.

Source code in lexos\cutter\machete.py
def merge(self, segments: List[str], sep=" ") -> str:
    """Merge a list of segments into a single string.

    Args:
        segments (List[str]): The list of segments to merge.
        sep (str): The separator to use.

    Returns:
        str: The merged string.
    """
    return sep.join(segments)
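
For illustration, a short hedged example of stitching segment strings back together (the segment values are invented):

from lexos.cutter.machete import Machete

machete = Machete()
restored = machete.merge(["first segment", "second segment"], sep=" ")
# "first segment second segment"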

split(texts, n=1000, merge_threshold=0.5, overlap=None, tokenizer=None, as_string=True)

Split texts into chunks by a fixed number of tokens.

Parameters:

texts (Union[List[str], str]): A text string or list of text strings. Required.
n (int): The number of tokens to split on. Default: 1000.
merge_threshold (float): The threshold to merge the last segment. Default: 0.5.
overlap (int): The number of tokens to overlap. Default: None.
tokenizer (str): The name of the tokenizer function to use. Default: None.
as_string (bool): Whether to return the segments as a list of strings. Default: True.

Returns:

list: A list of lists or strings (segments) for each text.

Source code in lexos\cutter\machete.py
def split(
    self,
    texts: Union[List[str], str],
    n=1000,
    merge_threshold: float = 0.5,
    overlap: int = None,
    tokenizer: str = None,
    as_string: bool = True,
) -> list:
    """Split texts into chunks by a fixed number of tokens.

    Args:
        texts (Union[List[str], str]): A text string or list of text strings.
        n (int): The number of tokens to split on.
        merge_threshold (float): The threshold to merge the last segment.
        overlap (int): The number of tokens to overlap.
        tokenizer (str): The name of the tokenizer function to use.
        as_string (bool): Whether to return the segments as a list of strings.

    Returns:
        list: A list of lists or strings (segments) for each text.
    """
    # Validate input
    try:
        model = SplitModel(
            texts=texts,
            n=n,
            merge_threshold=merge_threshold,
            overlap=overlap,
            tokenizer=tokenizer,
            as_string=as_string,
        )
    except Exception as e:
        raise LexosException(e)

    # Ensure a list of texts as the starting point
    if not isinstance(model.texts, list):
        model.texts = [model.texts]

    # Process the texts into segments
    all_segments = []
    for text in model.texts:
        # Tokenise the text
        tokens = self._tokenize(text, tokenizer=model.tokenizer)
        segments = list(self._chunk_tokens(tokens, model.n))
        # Apply the merge threshold
        if len(segments[-1]) < model.n * model.merge_threshold:
            last_seg = segments.pop(-1)
            # Combine the last two segments into a single list
            segments[-1] = segments[-1] + last_seg
        all_segments.append(segments)
    if overlap:
        all_segments = [
            self._create_overlapping_segments(segment, overlap)
            for segment in all_segments
        ]
    if as_string:
        all_segments = [
            ["".join(segment) for segment in text] for text in all_segments
        ]
    return all_segments
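
A hedged usage sketch of split (the sample text is illustrative, and the exact segment contents depend on the registered "whitespace" tokenizer):

from lexos.cutter.machete import Machete

machete = Machete()
text = "lorem ipsum dolor sit amet " * 460  # about 2,300 whitespace-delimited tokens

# 1,000-token segments; the ~300-token remainder is folded into the previous
# segment because it falls below n * merge_threshold (500 tokens)
segments = machete.split(text, n=1000, merge_threshold=0.5)[0]

# The same cut returned as token lists, with a 50-token overlap carried in
# from each following segment
token_segments = machete.split(text, n=1000, overlap=50, as_string=False)[0]

print(len(segments), len(token_segments))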

split_list(text, n=1000, merge_threshold=0.5, overlap=None, as_string=False)

Split a list into chunks by a fixed number of tokens.

Parameters:

text (List[str]): A list of tokens. Required.
n (int): The number of tokens to split on. Default: 1000.
merge_threshold (float): The threshold to merge the last segment. Default: 0.5.
overlap (int): The number of tokens to overlap. Default: None.
as_string (bool): Whether to return the segments as a list of strings. Default: False.

Returns:

list: A list of token lists, one token list for each segment.

Source code in lexos\cutter\machete.py
def split_list(
    self,
    text: List[str],
    n: int = 1000,
    merge_threshold: float = 0.5,
    overlap: int = None,
    as_string: bool = False,
) -> list:
    """Split a list into chunks by a fixed number of tokens.

    Args:
        text (List[str]): A list of tokens.
        n (int): The number of tokens to split on.
        merge_threshold (float): The threshold to merge the last segment.
        overlap (int): The number of tokens to overlap.
        as_string (bool): Whether to return the segments as a list of strings.

    Returns:
        list: A list of token lists, one token list for each segment.
    """
    # Validate input
    try:
        model = SplitListModel(
            text=text,
            n=n,
            merge_threshold=merge_threshold,
            overlap=overlap,
            as_string=as_string,
        )
    except Exception as e:
        raise LexosException(e)

    # Ensure a list of texts as the starting point
    if isinstance(model.text[0], str):
        model.text = [model.text]

    # Process the texts into segments
    all_segments = []
    for text in model.text:
        segments = list(self._chunk_tokens(text, model.n))
        # Apply the merge threshold
        if len(segments[-1]) < model.n * model.merge_threshold:
            last_seg = segments.pop(-1)
            # Combine the last two segments into a single list
            segments[-1] = [segments[-1] + last_seg]
        all_segments.append(segments)
        if overlap:
            all_segments = [
                self._create_overlapping_segments(segment, model.overlap)
                for segment in all_segments
            ]
        if as_string:
            all_segments = [
                ["".join(segment) for segment in text] for text in all_segments
            ]
    return all_segments
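
A hedged sketch of split_list on a pre-tokenized input (the token values are invented):

from lexos.cutter.machete import Machete

machete = Machete()
tokens = ["the ", "cat ", "sat ", "on ", "the ", "mat "] * 100  # 600 pre-tokenized items

# 250-token chunks; the 100-token remainder falls below n * merge_threshold
# (125 tokens), so it is folded into the previous chunk
chunks = machete.split_list(tokens, n=250)[0]
print(len(chunks))  # 2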

split_on_milestones(texts, milestone, preserve_milestones=True, tokenizer=None, as_string=True)

Split texts on milestones.

Parameters:

texts (Union[List[str], str]): A text string or list of text strings. Required.
milestone (Union[dict, str]): A variable representing the value(s) to be matched. Required.
preserve_milestones (bool): If True, the milestone token will be preserved at the beginning of every segment. Otherwise, it will be deleted. Default: True.
tokenizer (str): The name of the tokenizer function to use. Default: None.
as_string (bool): Whether to return the segments as a list of strings. Default: True.

Returns:

list: A list of lists or strings (segments) for each text.

Note

The choice of tokenizer can lead to some unexpected results with regard to spacing around the milestone. The default behaviour is to delete the milestone and any following whitespace. If milestones are preserved, the milestone will occur at the beginning of the following segment and will be followed by a single space. If the segments are returned with as_string=False, each token will have a following space and it will be up to the end user to remove the space if desired.

Source code in lexos\cutter\machete.py
def split_on_milestones(
    self,
    texts: Union[List[str], str],
    milestone: Union[dict, str],
    preserve_milestones: bool = True,
    tokenizer: str = None,
    as_string: bool = True,
) -> list:
    """Split texts on milestones.

    Args:
        texts (Union[List[str], str]): A text string or list of text strings.
        milestone (Union[dict, str]): A variable representing the value(s) to be matched.
        preserve_milestones (bool): If True, the milestone token will be preserved at the
            beginning of every segment. Otherwise, it will be deleted.
        tokenizer (str): The name of the tokenizer function to use.
        as_string (bool): Whether to return the segments as a list of strings.

    Returns:
        list: A list of lists or strings (segments) for each text.

    Note:
        The choice of tokenizer can lead to some unexpected results with regard to spacing
        around the milestone. The default behaviour is to delete the milestone and any
        following whitespace. If milestones are preserved, the milestone will occur at the
        beginning of the following segment and will be followed by a single space. If the
        segments are returned with `as_string=False`, each token will have a following space
        and it will be up to the end user to remove the space if desired.
    """
    # Validate input
    try:
        model = SplitMilestoneModel(
            texts=texts,
            milestone=milestone,
            preserve_milestones=preserve_milestones,
            tokenizer=tokenizer,
            as_string=as_string,
        )
    except Exception as e:
        raise LexosException(e)

    # Ensure a list of texts as the starting point
    if not isinstance(model.texts, list):
        model.texts = [model.texts]

    # Process the texts into segments
    all_segments = []
    milestone_pat = re.compile(milestone)
    for text in model.texts:
        cut_on_milestone = []
        seg = []
        # Tokenise the text
        tokens = self._tokenize(text, tokenizer=model.tokenizer)
        for i, token in enumerate(tokens):
            if re.match(
                milestone_pat, token.strip()
            ):  # token.strip() == milestone:
                cut_on_milestone.append(seg)
                j = i
                if preserve_milestones:
                    seg = [f"{milestone} "]
                else:
                    seg = []
            else:
                seg.append(token)
        # Add the last segment
        cut_on_milestone.append(tokens[j + 1 :])
        all_segments.append(cut_on_milestone)
    if as_string:
        all_segments = [
            ["".join(segment) for segment in text] for text in all_segments
        ]
    # If no milestone was found, return the original texts
    if len(all_segments) == 1 and all_segments[0] == []:
        return [model.texts]
    else:
        return all_segments
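
A hedged usage sketch of split_on_milestones (the text and milestone pattern are illustrative; exact spacing around the milestone depends on the tokenizer, as the Note above explains):

from lexos.cutter.machete import Machete

machete = Machete()
text = "CHAPTER I It was the best of times. CHAPTER II It was the worst of times."

# Keep the milestone token at the start of each new segment
chapters = machete.split_on_milestones(text, milestone="CHAPTER", preserve_milestones=True)

# Drop the milestone tokens instead
bodies = machete.split_on_milestones(text, milestone="CHAPTER", preserve_milestones=False)

print(chapters[0])
print(bodies[0])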

splitn(texts, n=2, merge_threshold=0.5, overlap=None, tokenizer=None, as_string=True)

Get a specific number of sequential segments from a string or list of strings.

Parameters:

texts (Union[List[str], str]): A text string or list of text strings. Required.
n (int): The number of segments to create (the tokens per segment are calculated automatically). Default: 2.
merge_threshold (float): The threshold to merge the last segment. Default: 0.5.
overlap (int): The number of tokens to overlap. Default: None.
tokenizer (str): The name of the tokenizer function to use. Default: None.
as_string (bool): Whether to return the segments as a list of strings. Default: True.

Returns:

list: A list of lists or strings (segments) for each text.

Note

For this implementation, see https://stackoverflow.com/a/54802737.

Source code in lexos\cutter\machete.py
def splitn(
    self,
    texts: Union[List[str], str],
    n: int = 2,
    merge_threshold: float = 0.5,
    overlap: int = None,
    tokenizer: str = None,
    as_string: bool = True,
) -> list:
    """Get a specific number of sequential segments from a string or list of strings.

    Args:
        texts (Union[List[str], str]): A text string or list of text strings.
        n (int): The number of segments to create. Calculated automatically.
        merge_threshold (float): The threshold to merge the last segment.
        overlap (int): The number of tokens to overlap.
        tokenizer (str): The name of the tokenizer function to use.
        as_string (bool): Whether to return the segments as a list of strings.

    Returns:
        list: A list of lists or strings (segments) for each text.

    Note:
        For this implementation, see https://stackoverflow.com/a/54802737.
    """
    # Validate input
    try:
        model = SplitModel(
            texts=texts,
            n=n,
            merge_threshold=merge_threshold,
            overlap=overlap,
            tokenizer=tokenizer,
            as_string=as_string,
        )
    except Exception as e:
        raise LexosException(e)

    # Ensure a list of texts as the starting point
    if not isinstance(model.texts, list):
        model.texts = [model.texts]

    # Process the texts into segments
    all_segments = []
    for text in model.texts:

        # Tokenise the text
        tokens = self._tokenize(text, tokenizer=model.tokenizer)

        # Get the number of tokens per segment (d) and the remaining tokens (r)
        d, r = divmod(len(tokens), model.n)

        # Get the segments
        segments = []
        for i in range(model.n):
            index = (d + 1) * (i if i < r else r) + d * (0 if i < r else i - r)
            segments.append(tokens[index : index + (d + 1 if i < r else d)])
            # Apply the merge threshold
            if len(segments[-1]) < model.n * model.merge_threshold:
                last_seg = segments.pop(-1)
                # Combine the last two segments into a single list
                segments[-1] = segments[-1] + last_seg
        all_segments.append(segments)
        if overlap:
            all_segments = [
                self._create_overlapping_segments(segment, model.overlap)
                for segment in all_segments
            ]
        if as_string:
            all_segments = [
                ["".join(segment) for segment in text] for text in all_segments
            ]
    return all_segments
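
A brief hedged sketch of splitn, assuming the default "whitespace" tokenizer (the sample sentence is illustrative):

from lexos.cutter.machete import Machete

machete = Machete()
text = "one two three four five six seven eight nine ten"

# Cut into three segments of roughly equal length (4 + 3 + 3 tokens here)
thirds = machete.splitn(text, n=3)[0]
print(len(thirds))  # 3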

lexos.cutter.machete.SplitListModel

Bases: BaseModel

Validate the input for the split_list function.

Source code in lexos\cutter\machete.py
class SplitListModel(BaseModel):
    """Validate the input for split_list function."""

    text: List[str]
    n: Optional[int] = 1000
    merge_threshold: Optional[float] = 0.5
    overlap: Optional[int] = None

lexos.cutter.machete.SplitMilestoneModel

Bases: BaseModel

Validate the input for the split_on_milestones function.

Source code in lexos\cutter\machete.py
class SplitMilestoneModel(BaseModel):
    """Validate the input for split_on_miletone function."""

    texts: Union[List[str], str]
    milestone: Union[dict, str]
    preserve_milestones: Optional[bool] = False
    tokenizer: Optional[str] = None
    as_string: Optional[bool] = True

lexos.cutter.machete.SplitModel

Bases: BaseModel

Validate the input for split functions.

Source code in lexos\cutter\machete.py
class SplitModel(BaseModel):
    """Validate the input for split functions."""

    texts: Union[List[str], str]
    n: Optional[int] = 1000
    merge_threshold: Optional[float] = 0.5
    overlap: Optional[int] = None
    tokenizer: Optional[str] = None
    as_string: Optional[bool] = True
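
A brief hedged sketch of how these models are used: each public method constructs the corresponding model first, so invalid arguments surface as a LexosException (the invalid value below is illustrative):

from lexos.cutter.machete import Machete

machete = Machete()
try:
    # n must be an integer, so this should fail validation in SplitModel
    # and be re-raised by split as a LexosException
    machete.split("some text", n="one thousand")
except Exception as err:
    print(type(err).__name__, err)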