Token Milestones¤

Class for handling token milestones.

TokenMilestones pydantic-model ¤

Bases: BaseModel

Milestones class.

  • Iterating over the Milestones instance yields the spans in the Doc.
  • Milestones.spans returns an indexed list of the spans in the Doc.
  • milestones.doc.spans["milestones"] returns a SpanGroup.

Config:

  • default: validation_config

Fields:

  • doc (Doclike)
  • patterns (Optional[Any])
  • case_sensitive (Optional[bool])
  • character_map (Optional[dict])
  • attr (Optional[str])
  • flags (Optional[Enum])
  • mode (Optional[str])
  • nlp (Optional[str])
  • type (Optional[str])
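
A minimal construction sketch. The import path and the sample text are assumptions inferred from the source location below, not taken from this page; doc may be a string or a spaCy Doc, and the remaining fields fall back to the defaults listed above (case_sensitive=True, attr="ORTH", mode="string", nlp="xx_sent_ud_sm").

import spacy
from lexos.milestones.token_milestones import TokenMilestones  # assumed import path

nlp = spacy.blank("xx")
doc = nlp("CHAPTER 1. Down the Rabbit-Hole. Alice was beginning to get very tired.")

# Construct the model; patterns can also be supplied later via get_matches().
milestones = TokenMilestones(doc=doc, patterns="CHAPTER")
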
Source code in lexos/milestones/token_milestones.py
class TokenMilestones(BaseModel):
    """Milestones class.

    - Iterating over the Milestones instance yields the spans in the Doc.
    - Milestones.spans returns an indexed list of the spans in the Doc.
    - milestones.doc.spans["milestones"] returns a SpanGroup.
    """

    doc: Doclike = Field(
        json_schema_extra={"description": "A string or spaCy Doc object."}
    )

    patterns: Optional[Any] = Field(
        default=None,
        json_schema_extra={"description": "The pattern(s) used to match milestones."},
    )
    case_sensitive: Optional[bool] = Field(
        default=True,
        json_schema_extra={
            "description": "Whether to perform case-sensitive searches."
        },
    )
    character_map: Optional[dict] = Field(
        default=None,
        json_schema_extra={"description": "A map of characters to token indexes."},
    )
    attr: Optional[str] = Field(
        default="ORTH",
        json_schema_extra={
            "description": "The spaCy token attribute to search ('ORTH' or 'LOWER')."
        },
    )
    flags: Optional[Enum] = Field(
        default=case_sensitive_flags,
        json_schema_extra={"description": "The regex flags to use."},
    )
    mode: Optional[str] = Field(
        default="string",
        json_schema_extra={"description": "The mode used to match patterns."},
    )
    nlp: Optional[str] = Field(
        default="xx_sent_ud_sm",
        json_schema_extra={"description": "The language model to use."},
    )
    type: Optional[str] = Field(
        default=None, json_schema_extra={"description": "The type of milestone."}
    )

    model_config = validation_config

    def __init__(self, **data) -> None:
        """Set regex flags and milestone IOB extensions after initialization."""
        super().__init__(**data)
        if not self.case_sensitive:
            self.flags = case_insensitive_flags
            self.attr = "LOWER"
        if not Token.has_extension("milestone_iob"):
            Token.set_extension("milestone_iob", default="O", force=True)
        if not Token.has_extension("milestone_label"):
            Token.set_extension("milestone_label", default="", force=True)

    @property
    def spans(self) -> list[Span]:
        """Return the Spans.

        Returns:
            list[Span]: A list of spaCy Spans.
        """
        if "milestones" in self.doc.spans:
            return list(self.doc.spans["milestones"])
        else:
            return []

    def __iter__(self) -> Iterator:
        """Make the class iterable.

        Returns:
            Iterator: A generator containing the object's spans.
        """
        return (span for span in self.spans)

    def _assign_token_attributes(
        self, spans: list[Span], max_label_length: int = 20
    ) -> None:
        """Assign token attributes in the doc based on spans.

        Args:
            spans (list[Span]): A list of spaCy Spans.
            max_label_length (int): The maximum number of characters to include in the label.
        """
        # Early return if no spans
        if not spans:
            for token in self.doc:
                self.doc[token.i]._.milestone_iob = "O"
                self.doc[token.i]._.milestone_label = ""
            return

        # Pre-compute token positions and labels
        milestone_starts = {span.start: span for span in spans}
        milestone_ranges = {token.i for span in spans for token in span[1:]}

        # Assign attributes in single pass
        for token in self.doc:
            if span := milestone_starts.get(token.i):
                self.doc[token.i]._.milestone_iob = "B"
                self.doc[
                    token.i
                ]._.milestone_label = f"{span.text:.{max_label_length}}{'...' if len(span.text) > max_label_length else ''}"
            elif token.i in milestone_ranges:
                self.doc[token.i]._.milestone_iob = "I"
                self.doc[token.i]._.milestone_label = ""
            else:
                self.doc[token.i]._.milestone_iob = "O"
                self.doc[token.i]._.milestone_label = ""

    def _autodetect_mode(self, patterns: str | list) -> str:
        """Autodetect mode for matching milestones if not supplied (experimental).

        Args:
            patterns (str | list): A pattern to match.

        Returns:
            str: A string to supply to the get_matches() mode argument.
        """
        for pattern in patterns:
            if not isinstance(pattern, (str, list)):
                raise ValueError(
                    f"Pattern {pattern} must be a string or a spaCy Matcher rule."
                )
            if isinstance(pattern, str):
                if re.search(r"\s", pattern):
                    self.mode = "phrase"
                else:
                    self.mode = "string"
            else:
                try:
                    matcher = Matcher(self.doc.vocab, validate=True)
                    matcher.add("Pattern", [pattern])
                    self.mode = "rule"
                # Raise an error if the pattern is not a valid Matcher pattern
                except BaseException:
                    raise BaseException(
                        f"The pattern `{pattern}` could not be matched automatically. Check that the pattern is correct and try setting the `mode` argument in `get_matches()`."
                    )
        return self.mode

    def _get_string_matches(self, patterns: Any, flags: Enum) -> list[Span]:
        """Get matches to milestone patterns.

        Args:
            patterns (Any): A pattern to match.
            flags (Enum): An enum of regex flags.

        Returns:
            list[Span]: A list of Spans matching the pattern.
        """
        if patterns is None or patterns == []:
            raise ValueError("Patterns cannot be empty")
        patterns = ensure_list(patterns)
        if self.character_map is None:
            self.character_map = chars_to_tokens(self.doc)
        pattern_matches = []
        for pattern in patterns:
            matches = re.finditer(pattern, self.doc.text, flags=flags)
            for match in matches:
                pattern_matches.append(match)
        return [self._to_spacy_span(match) for match in pattern_matches]

    def _get_phrase_matches(self, patterns: Any, attr: str = "ORTH") -> list[Span]:
        """Get matches to milestone patterns in phrases.

        Args:
            patterns (Any): A pattern to match.
            attr (str): A spaCy Token attribute to search.

        Returns:
            list[Span]: A list of Spans matching the pattern.
        """
        nlp = spacy.load(self.nlp)
        matcher = PhraseMatcher(self.doc.vocab, attr=attr)
        patterns = [nlp.make_doc(text) for text in patterns]
        matcher.add("PatternList", patterns)
        matches = matcher(self.doc)
        return [self.doc[start:end] for _, start, end in matches]

    def _get_rule_matches(self, patterns: Any) -> list[Span]:
        """Get matches to milestone patterns with spaCy rules.

        Args:
            patterns (Any): A pattern to match.

        Returns:
            list[Span]: A list of Spans matching the pattern.
        """
        nlp = spacy.load(self.nlp)
        spans = []
        if not self.case_sensitive:
            patterns = lowercase_spacy_rules(patterns)
        for pattern in patterns:
            matcher = Matcher(nlp.vocab, validate=True)
            matcher.add("Pattern", [pattern])
            matches = matcher(self.doc)
            spans.extend([self.doc[start:end] for _, start, end in matches])
        return spans

    def _remove_duplicate_spans(self, spans: list[Span]) -> list[Span]:
        """Remove duplicate spans, generally created when a pattern is added.

        Args:
            spans (list[Span]): A list of Spans.

        Returns:
            list[Span]: A list of de-duplicated Spans.
        """
        result = []
        seen = set()
        for span in spans:
            key = (span.start, span.end)
            if key not in seen:
                result.append(span)
                seen.add(key)
        return result

    def _set_case_sensitivity(self, case_sensitive: Optional[bool] = None) -> None:
        """Set the object's case sensitivity.

        Args:
            case_sensitive (Optional[bool]): Whether or not to use case-sensitive searching.
        """
        if case_sensitive is not None:
            self.case_sensitive = case_sensitive
        if self.case_sensitive is True:
            self.flags: Enum = re.DOTALL | re.MULTILINE | re.UNICODE
            self.attr = "ORTH"
        else:
            self.flags: Enum = re.DOTALL | re.IGNORECASE | re.MULTILINE | re.UNICODE
            self.attr = "LOWER"

    def _to_spacy_span(self, match: Match) -> Span:
        """Convert a re.match object to a Span.

        Args:
            match (Match): A re.match object.

        Returns:
            Span: A spaCy Span.

        Raises:
            ValueError: If match is None or span cannot be created.
        """
        if not match:
            raise ValueError("Match object is None.")

        # Lazy load character map
        if not self.character_map:
            self.character_map = chars_to_tokens(self.doc)

        # Get character positions
        start_char, end_char = match.span()

        # Try direct char_span first
        if span := self.doc.char_span(start_char, end_char):
            return span

        # Fallback to character map
        start_token = self.character_map.get(start_char)
        end_token = self.character_map.get(end_char)

        if start_token is not None and end_token is not None:
            if span := self.doc[start_token : end_token + 1]:
                return span

        raise ValueError(
            f"Could not create span for match at positions {start_char}:{end_char}"
        )

    @validate_call(config=validation_config)
    def get_matches(
        self,
        patterns: Optional[Any] = None,
        mode: Optional[str] = None,
        case_sensitive: Optional[bool] = None,
    ) -> list[Span]:
        """Get matches to milestone patterns.

        Args:
            patterns (Optional[Any]): The pattern(s) to match.
            mode (Optional[str]): The mode to use for matching ('string', 'phrase', 'rule').
            case_sensitive (Optional[bool]): Whether to use case-sensitive matching. Defaults to True.

        Returns:
            list[Span]: A list of spaCy Spans matching the pattern.

        Raises:
            ValueError: If patterns is None or empty.
        """
        self._set_case_sensitivity(case_sensitive)

        # Update patterns list
        if patterns:
            self.patterns = ensure_list(patterns)

        # Define mode handlers
        mode_handlers = {
            "string": lambda: self._get_string_matches(patterns, self.flags),
            "phrase": lambda: self._get_phrase_matches(patterns, self.attr),
            "rule": lambda: self._get_rule_matches(patterns),
        }

        # If mode not provided or invalid, autodetect
        if not mode or mode not in mode_handlers:
            spans = self.get_matches(patterns, mode=self._autodetect_mode(patterns))
        # Get spans using appropriate handler
        else:
            spans = mode_handlers[mode]()
        return self._remove_duplicate_spans(spans)

    @validate_call(config=validation_config)
    def remove(self, patterns: Any, *, mode: Optional[str] = "string") -> None:
        """Remove patterns.

        Args:
            patterns (Any): The pattern(s) to match.
            mode (Optional[str]): The mode to use for matching.
        """
        patterns = ensure_list(patterns)
        spans = self.get_matches(patterns, mode=mode)

        # Create a set of spans to remove for faster lookup
        remove_spans = {f"{span.start},{span.end}" for span in spans}

        # Filter out the spans to be removed
        new_spans = [
            span
            for span in self.doc.spans["milestones"]
            if f"{span.start},{span.end}" not in remove_spans
        ]

        # Reset the token attributes for the spans to be removed
        for span in spans:
            for token in self.doc[span.start : span.end]:
                token._.milestone_iob = "O"
                token._.milestone_label = ""

        # Re-set the milestones with the remaining spans
        self.set_milestones(new_spans)

        # Remove the patterns from the object's patterns list
        self.patterns = [p for p in self.patterns if p not in patterns]

    def reset(self):
        """Reset all `milestone` values to defaults.

        Note: Does not modify patterns or any other settings.
        """
        self.doc.spans["milestones"] = []
        for i, _ in enumerate(self.doc):
            self.doc[i]._.milestone_iob = "O"
            self.doc[i]._.milestone_label = ""

    @validate_call(config=validation_config)
    def set_milestones(
        self,
        spans: list[Span],
        *,
        start: Optional[str | None] = None,
        remove: Optional[bool] = False,
        max_label_length: Optional[int] = 20,
    ) -> None:
        """Commit milestones to the object instance.

        Args:
            spans (list[Span]): The span(s) to use for identifying token attributes.
            start (Optional[str | None]): Set milestone start to the token before or after the milestone span. May be "before" or "after".
            remove (Optional[bool]): Set milestone start to the token following the milestone span and
                remove the milestone span tokens from the Doc.
            max_label_length (Optional[int]): The maximum number of characters to include in the label.
        """
        if start not in [None, "before", "after"]:
            raise ValueError("Start must be None, 'before', or 'after'.")
        if remove:
            self.doc = filter_doc(self.doc, spans)
        elif start is not None:
            # Update the doc's milestones
            self.doc.spans["milestones"] = move_milestone(self.doc, spans, start)
        else:
            self.doc.spans["milestones"] = spans
            self._assign_token_attributes(
                self.doc.spans["milestones"], max_label_length
            )
        self.type = "tokens"

    @validate_call(config=validation_config)
    def to_list(self, *, strip_punct: Optional[bool] = True) -> list[dict]:
        """Get a list of milestone dicts.

        Args:
            strip_punct (Optional[bool]): Strip a single punctuation mark at the end of the character string.

        Returns:
            list[dict]: A list of milestone dicts.

        Note:
            Some language models include a final punctuation mark in the token string,
            particularly at the end of a sentence. The strip_punct argument is a
            somewhat hacky convenience method to remove it. However, the user may wish
            instead to do some post-processing in order to use the output for their
            own purposes.
        """
        milestone_dicts = []
        for span in self.doc.spans["milestones"]:
            start_char = self.doc[span.start].idx
            end_char = start_char + len(span.text)
            chars = self.doc.text[start_char:end_char]
            if strip_punct:
                chars = chars.rstrip(punctuation)
                end_char -= 1
            milestone_dicts.append(
                {
                    "text": span.text,
                    "characters": chars,
                    "start_token": span.start,
                    "end_token": span.end,
                    "start_char": start_char,
                    "end_char": end_char,
                }
            )

        return milestone_dicts

spans: list[Span] property ¤

Return the Spans.

Returns:

Type Description
list[Span]

list[Span]: A list of spaCy Spans.

__init__(**data) -> None ¤

Set regex flags and milestone IOB extensions after initialization.

Source code in lexos/milestones/token_milestones.py
def __init__(self, **data) -> None:
    """Set regex flags and milestone IOB extensions after initialization."""
    super().__init__(**data)
    if not self.case_sensitive:
        self.flags = case_insensitive_flags
        self.attr = "LOWER"
    if not Token.has_extension("milestone_iob"):
        Token.set_extension("milestone_iob", default="O", force=True)
    if not Token.has_extension("milestone_label"):
        Token.set_extension("milestone_label", default="", force=True)

__iter__() -> Iterator ¤

Make the class iterable.

Returns:

Name Type Description
Iterator Iterator

A generator containing the object's spans.

Source code in lexos/milestones/token_milestones.py
def __iter__(self) -> Iterator:
    """Make the class iterable.

    Returns:
        Iterator: A generator containing the object's spans.
    """
    return (span for span in self.spans)
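
Once milestones have been committed with set_milestones(), the object can be consumed in the three ways listed at the top of the page. A short sketch, continuing the hypothetical milestones object from the construction example above:

spans = milestones.get_matches("CHAPTER")
milestones.set_milestones(spans)

for span in milestones:                       # iterate the instance directly
    print(span.start, span.end, span.text)

first = milestones.spans[0]                   # indexed access via the spans property
group = milestones.doc.spans["milestones"]    # the underlying SpanGroup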

get_matches(patterns: Optional[Any] = None, mode: Optional[str] = None, case_sensitive: Optional[bool] = None) -> list[Span] ¤

Get matches to milestone patterns.

Parameters:

Name Type Description Default
patterns Optional[Any]

The pattern(s) to match.

None
mode Optional[str]

The mode to use for matching ('string', 'phrase', 'rule').

None
case_sensitive Optional[bool]

Whether to use case-sensitive matching. Defaults to True.

None

Returns:

Type Description
list[Span]

list[Span]: A list of spaCy Spans matching the pattern.

Raises:

Type Description
ValueError

If patterns is None or empty.

Source code in lexos/milestones/token_milestones.py
@validate_call(config=validation_config)
def get_matches(
    self,
    patterns: Optional[Any] = None,
    mode: Optional[str] = None,
    case_sensitive: Optional[bool] = None,
) -> list[Span]:
    """Get matches to milestone patterns.

    Args:
        patterns (Optional[Any]): The pattern(s) to match.
        mode (Optional[str]): The mode to use for matching ('string', 'phrase', 'rule').
        case_sensitive (Optional[bool]): Whether to use case-sensitive matching. Defaults to True.

    Returns:
        list[Span]: A list of spaCy Spans matching the pattern.

    Raises:
        ValueError: If patterns is None or empty.
    """
    self._set_case_sensitivity(case_sensitive)

    # Update patterns list
    if patterns:
        self.patterns = ensure_list(patterns)

    # Define mode handlers
    mode_handlers = {
        "string": lambda: self._get_string_matches(patterns, self.flags),
        "phrase": lambda: self._get_phrase_matches(patterns, self.attr),
        "rule": lambda: self._get_rule_matches(patterns),
    }

    # If mode not provided or invalid, autodetect
    if not mode or mode not in mode_handlers:
        spans = self.get_matches(patterns, mode=self._autodetect_mode(patterns))
    # Get spans using appropriate handler
    else:
        spans = mode_handlers[mode]()
    return self._remove_duplicate_spans(spans)
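
A sketch of the three explicit modes, again assuming the milestones object from the construction example above. Note that the 'phrase' and 'rule' modes load the language model named in the nlp field, so that model must be installed.

# Regex search over doc.text.
string_spans = milestones.get_matches(r"CHAPTER \d+", mode="string")

# spaCy PhraseMatcher over tokenized phrases.
phrase_spans = milestones.get_matches("Down the Rabbit-Hole", mode="phrase")

# spaCy Matcher rules: each pattern is a list of per-token dictionaries.
rule_spans = milestones.get_matches([[{"LOWER": "chapter"}, {"LIKE_NUM": True}]], mode="rule")

# Omitting mode falls back to the experimental autodetection in _autodetect_mode().
spans = milestones.get_matches("CHAPTER")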

remove(patterns: Any, *, mode: Optional[str] = 'string') -> None ¤

Remove patterns.

Parameters:

Name Type Description Default
patterns Any

The pattern(s) to match.

required
mode Optional[str]

The mode to use for matching.

'string'
Source code in lexos/milestones/token_milestones.py
@validate_call(config=validation_config)
def remove(self, patterns: Any, *, mode: Optional[str] = "string") -> None:
    """Remove patterns.

    Args:
        patterns (Any): The pattern(s) to match.
        mode (Optional[str]): The mode to use for matching.
    """
    patterns = ensure_list(patterns)
    spans = self.get_matches(patterns, mode=mode)

    # Create a set of spans to remove for faster lookup
    remove_spans = {f"{span.start},{span.end}" for span in spans}

    # Filter out the spans to be removed
    new_spans = [
        span
        for span in self.doc.spans["milestones"]
        if f"{span.start},{span.end}" not in remove_spans
    ]

    # Reset the token attributes for the spans to be removed
    for span in spans:
        for token in self.doc[span.start : span.end]:
            token._.milestone_iob = "O"
            token._.milestone_label = ""

    # Re-set the milestones with the remaining spans
    self.set_milestones(new_spans)

    # Remove the patterns from the object's patterns list
    self.patterns = [p for p in self.patterns if p not in patterns]
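
A brief usage sketch, assuming "CHAPTER" milestones were committed earlier: the matched spans are dropped from doc.spans["milestones"], their tokens are reset to "O", and the pattern is removed from the object's patterns list.

milestones.remove("CHAPTER", mode="string")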

reset() ¤

Reset all milestone values to defaults.

Note: Does not modify patterns or any other settings.

Source code in lexos/milestones/token_milestones.py
def reset(self):
    """Reset all `milestone` values to defaults.

    Note: Does not modify patterns or any other settings.
    """
    self.doc.spans["milestones"] = []
    for i, _ in enumerate(self.doc):
        self.doc[i]._.milestone_iob = "O"
        self.doc[i]._.milestone_label = ""

set_milestones(spans: list[Span], *, start: Optional[str | None] = None, remove: Optional[bool] = False, max_label_length: Optional[int] = 20) -> None ¤

Commit milestones to the object instance.

Parameters:

Name Type Description Default
spans list[Span]

The span(s) to use for identifying token attributes.

required
start Optional[str | None]

Set milestone start to the token before or after the milestone span. May be "before" or "after".

None
remove Optional[bool]

Set milestone start to the token following the milestone span and remove the milestone span tokens from the Doc.

False
max_label_length Optional[int]

The maximum number of characters to include in the label.

20
Source code in lexos/milestones/token_milestones.py
@validate_call(config=validation_config)
def set_milestones(
    self,
    spans: list[Span],
    *,
    start: Optional[str | None] = None,
    remove: Optional[bool] = False,
    max_label_length: Optional[int] = 20,
) -> None:
    """Commit milestones to the object instance.

    Args:
        spans (list[Span]): The span(s) to use for identifying token attributes.
        start (Optional[str | None]): Set milestone start to the token before or after the milestone span. May be "before" or "after".
        remove (Optional[bool]): Set milestone start to the token following the milestone span and
            remove the milestone span tokens from the Doc.
        max_label_length (Optional[int]): The maximum number of characters to include in the label.
    """
    if start not in [None, "before", "after"]:
        raise ValueError("Start must be None, 'before', or 'after'.")
    if remove:
        self.doc = filter_doc(self.doc, spans)
    elif start is not None:
        # Update the doc's milestones
        self.doc.spans["milestones"] = move_milestone(self.doc, spans, start)
    else:
        self.doc.spans["milestones"] = spans
        self._assign_token_attributes(
            self.doc.spans["milestones"], max_label_length
        )
    self.type = "tokens"
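
A sketch of the three ways matched spans can be committed, continuing the hypothetical milestones object from above:

spans = milestones.get_matches(r"CHAPTER \d+", mode="string")

# Commit the spans as-is: tokens in each span receive B/I IOB tags and a label.
milestones.set_milestones(spans)

# Shift each milestone boundary to the token after (or before) the matched span.
milestones.set_milestones(spans, start="after")

# Remove the matched tokens from the Doc entirely.
milestones.set_milestones(spans, remove=True)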

to_list(*, strip_punct: Optional[bool] = True) -> list[dict] ¤

Get a list of milestone dicts.

Parameters:

Name Type Description Default
strip_punct Optional[bool]

Strip a single punctuation mark at the end of the character string.

True

Returns:

Type Description
list[dict]

list[dict]: A list of milestone dicts.

Note

Some language models include a final punctuation mark in the token string, particularly at the end of a sentence. The strip_punct argument is a somewhat hacky convenience method to remove it. However, the user may wish instead to do some post-processing in order to use the output for their own purposes.

Source code in lexos/milestones/token_milestones.py
@validate_call(config=validation_config)
def to_list(self, *, strip_punct: Optional[bool] = True) -> list[dict]:
    """Get a list of milestone dicts.

    Args:
        strip_punct (Optional[bool]): Strip a single punctuation mark at the end of the character string.

    Returns:
        list[dict]: A list of milestone dicts.

    Note:
        Some language models include a final punctuation mark in the token string,
        particularly at the end of a sentence. The strip_punct argument is a
        somewhat hacky convenience method to remove it. However, the user may wish
        instead to do some post-processing in order to use the output for their
        own purposes.
    """
    milestone_dicts = []
    for span in self.doc.spans["milestones"]:
        start_char = self.doc[span.start].idx
        end_char = start_char + len(span.text)
        chars = self.doc.text[start_char:end_char]
        if strip_punct:
            chars = chars.rstrip(punctuation)
            end_char -= 1
        milestone_dicts.append(
            {
                "text": span.text,
                "characters": chars,
                "start_token": span.start,
                "end_token": span.end,
                "start_char": start_char,
                "end_char": end_char,
            }
        )

    return milestone_dicts
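
A sketch of the output shape; the values shown are illustrative only, not taken from a real document:

milestones.to_list()
# [
#     {
#         "text": "CHAPTER 1",
#         "characters": "CHAPTER 1",
#         "start_token": 0,
#         "end_token": 2,
#         "start_char": 0,
#         "end_char": 9,
#     },
#     ...
# ]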

_assign_token_attributes(spans: list[Span], max_label_length: int = 20) -> None ¤

Assign token attributes in the doc based on spans.

Parameters:

Name Type Description Default
spans list[Span]

A list of spaCy Spans.

required
max_label_length int

The maximum number of characters to include in the label.

20
Source code in lexos/milestones/token_milestones.py
def _assign_token_attributes(
    self, spans: list[Span], max_label_length: int = 20
) -> None:
    """Assign token attributes in the doc based on spans.

    Args:
        spans (list[Span]): A list of spaCy Spans.
        max_label_length (int): The maximum number of characters to include in the label.
    """
    # Early return if no spans
    if not spans:
        for token in self.doc:
            self.doc[token.i]._.milestone_iob = "O"
            self.doc[token.i]._.milestone_label = ""
        return

    # Pre-compute token positions and labels
    milestone_starts = {span.start: span for span in spans}
    milestone_ranges = {token.i for span in spans for token in span[1:]}

    # Assign attributes in single pass
    for token in self.doc:
        if span := milestone_starts.get(token.i):
            self.doc[token.i]._.milestone_iob = "B"
            self.doc[
                token.i
            ]._.milestone_label = f"{span.text:.{max_label_length}}{'...' if len(span.text) > max_label_length else ''}"
        elif token.i in milestone_ranges:
            self.doc[token.i]._.milestone_iob = "I"
            self.doc[token.i]._.milestone_label = ""
        else:
            self.doc[token.i]._.milestone_iob = "O"
            self.doc[token.i]._.milestone_label = ""
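
This writes a standard IOB tagging onto the custom token extensions: the first token of each milestone span is "B" and carries a (possibly truncated) label, later tokens in the span are "I", and all other tokens are "O". A hypothetical illustration for a committed span covering the two tokens "CHAPTER" and "1":

for token in milestones.doc[:4]:
    print(token.text, token._.milestone_iob, repr(token._.milestone_label))
# CHAPTER B 'CHAPTER 1'
# 1       I ''
# .       O ''
# Down    O ''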

_autodetect_mode(patterns: str | list) -> str ¤

Autodetect mode for matching milestones if not supplied (experimental).

Parameters:

Name Type Description Default
patterns str | list

A pattern to match.

required

Returns:

Name Type Description
str str

A string to supply to the get_matches() mode argument.

Source code in lexos/milestones/token_milestones.py
def _autodetect_mode(self, patterns: str | list) -> str:
    """Autodetect mode for matching milestones if not supplied (experimental).

    Args:
        patterns (str | list): A pattern to match.

    Returns:
        str: A string to supply to the get_matches() mode argument.
    """
    for pattern in patterns:
        if not isinstance(pattern, (str, list)):
            raise ValueError(
                f"Pattern {pattern} must be a string or a spaCy Matcher rule."
            )
        if isinstance(pattern, str):
            if re.search(r"\s", pattern):
                self.mode = "phrase"
            else:
                self.mode = "string"
        else:
            try:
                matcher = Matcher(self.doc.vocab, validate=True)
                matcher.add("Pattern", [pattern])
                self.mode = "rule"
            # Raise an error if the pattern is not a valid Matcher pattern
            except BaseException:
                raise BaseException(
                    f"The pattern `{pattern}` could not be matched automatically. Check that the pattern is correct and try setting the `mode` argument in `get_matches()`."
                )
    return self.mode

_get_string_matches(patterns: Any, flags: Enum) -> list[Span] ¤

Get matches to milestone patterns.

Parameters:

Name Type Description Default
patterns Any

A pattern to match.

required
flags Enum

An enum of regex flags.

required

Returns:

Type Description
list[Span]

list[Span]: A list of Spans matching the pattern.

Source code in lexos/milestones/token_milestones.py
def _get_string_matches(self, patterns: Any, flags: Enum) -> list[Span]:
    """Get matches to milestone patterns.

    Args:
        patterns (Any): A pattern to match.
        flags (Enum): An enum of regex flags.

    Returns:
        list[Span]: A list of Spans matching the pattern.
    """
    if patterns is None or patterns == []:
        raise ValueError("Patterns cannot be empty")
    patterns = ensure_list(patterns)
    if self.character_map is None:
        self.character_map = chars_to_tokens(self.doc)
    pattern_matches = []
    for pattern in patterns:
        matches = re.finditer(pattern, self.doc.text, flags=flags)
        for match in matches:
            pattern_matches.append(match)
    return [self._to_spacy_span(match) for match in pattern_matches]

_get_phrase_matches(patterns: Any, attr: str = 'ORTH') -> list[Span] ¤

Get matches to milestone patterns in phrases.

Parameters:

Name Type Description Default
patterns Any

A pattern to match.

required
attr str

A spaCy Token attribute to search.

'ORTH'

Returns:

Type Description
list[Span]

list[Span]: A list of Spans matching the pattern.

Source code in lexos/milestones/token_milestones.py
def _get_phrase_matches(self, patterns: Any, attr: str = "ORTH") -> list[Span]:
    """Get matches to milestone patterns in phrases.

    Args:
        patterns (Any): A pattern to match.
        attr (str): A spaCy Token attribute to search.

    Returns:
        list[Span]: A list of Spans matching the pattern.
    """
    nlp = spacy.load(self.nlp)
    matcher = PhraseMatcher(self.doc.vocab, attr=attr)
    patterns = [nlp.make_doc(text) for text in patterns]
    matcher.add("PatternList", patterns)
    matches = matcher(self.doc)
    return [self.doc[start:end] for _, start, end in matches]

_get_rule_matches(patterns: Any) -> list[Span] ¤

Get matches to milestone patterns with spaCy rules.

Parameters:

Name Type Description Default
patterns Any

A pattern to match.

required

Returns:

Type Description
list[Span]

list[Span]: A list of Spans matching the pattern.

Source code in lexos/milestones/token_milestones.py
def _get_rule_matches(self, patterns: Any) -> list[Span]:
    """Get matches to milestone patterns with spaCy rules.

    Args:
        patterns (Any): A pattern to match.

    Returns:
        list[Span]: A list of Spans matching the pattern.
    """
    nlp = spacy.load(self.nlp)
    spans = []
    if not self.case_sensitive:
        patterns = lowercase_spacy_rules(patterns)
    for pattern in patterns:
        matcher = Matcher(nlp.vocab, validate=True)
        matcher.add("Pattern", [pattern])
        matches = matcher(self.doc)
        spans.extend([self.doc[start:end] for _, start, end in matches])
    return spans
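
For reference, a spaCy Matcher rule is a list of per-token attribute dictionaries. A hypothetical pattern matching a case-insensitive "chapter" followed by a number, passed through get_matches():

pattern = [{"LOWER": "chapter"}, {"LIKE_NUM": True}]
rule_spans = milestones.get_matches([pattern], mode="rule")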

_remove_duplicate_spans(spans: list[Span]) -> list[Span] ¤

Remove duplicate spans, generally created when a pattern is added.

Parameters:

Name Type Description Default
spans list[Span]

A list of Spans.

required

Returns:

Type Description
list[Span]

list[Span]: A list of de-duplicated Spans.

Source code in lexos/milestones/token_milestones.py
def _remove_duplicate_spans(self, spans: list[Span]) -> list[Span]:
    """Remove duplicate spans, generally created when a pattern is added.

    Args:
        spans (list[Span]): A list of Spans.

    Returns:
        list[Span]: A list of de-duplicated Spans.
    """
    result = []
    seen = set()
    for span in spans:
        key = (span.start, span.end)
        if key not in seen:
            result.append(span)
            seen.add(key)
    return result

_set_case_sensitivity(case_sensitive: Optional[bool] = None) -> None ¤

Set the object's case sensitivity.

Parameters:

Name Type Description Default
case_sensitive Optional[bool]

Whether or not to use case-sensitive searching.

None
Source code in lexos/milestones/token_milestones.py
def _set_case_sensitivity(self, case_sensitive: Optional[bool] = None) -> None:
    """Set the object's case sensitivity.

    Args:
        case_sensitive (Optional[bool]): Whether or not to use case-sensitive searching.
    """
    if case_sensitive is not None:
        self.case_sensitive = case_sensitive
    if self.case_sensitive is True:
        self.flags: Enum = re.DOTALL | re.MULTILINE | re.UNICODE
        self.attr = "ORTH"
    else:
        self.flags: Enum = re.DOTALL | re.IGNORECASE | re.MULTILINE | re.UNICODE
        self.attr = "LOWER"

_to_spacy_span(match: Match) -> Span ¤

Convert a re.match object to a Span.

Parameters:

Name Type Description Default
match Match

A re.match object.

required

Returns:

Name Type Description
Span Span

A spaCy Span.

Raises:

Type Description
ValueError

If match is None or span cannot be created.

Source code in lexos/milestones/token_milestones.py
def _to_spacy_span(self, match: Match) -> Span:
    """Convert a re.match object to a Span.

    Args:
        match (Match): A re.match object.

    Returns:
        Span: A spaCy Span.

    Raises:
        ValueError: If match is None or span cannot be created.
    """
    if not match:
        raise ValueError("Match object is None.")

    # Lazy load character map
    if not self.character_map:
        self.character_map = chars_to_tokens(self.doc)

    # Get character positions
    start_char, end_char = match.span()

    # Try direct char_span first
    if span := self.doc.char_span(start_char, end_char):
        return span

    # Fallback to character map
    start_token = self.character_map.get(start_char)
    end_token = self.character_map.get(end_char)

    if start_token is not None and end_token is not None:
        if span := self.doc[start_token : end_token + 1]:
            return span

    raise ValueError(
        f"Could not create span for match at positions {start_char}:{end_char}"
    )
