Skip to content

record¤

Module Description¤

The record module provides the Record class, which is the building block for every document in your corpus. Each Record wraps your text (or a parsed spaCy Doc) and metadata and offers a suite of methods for serialization, statistics, and manipulation.

Record pydantic-model ¤

Bases: BaseModel

The main Record model.

Config:

  • arbitrary_types_allowed: True
  • validate_assignment: True
  • json_schema_extra: DocJSONSchema.schema()

Fields:

  • id (int | UUID4)
  • name (Optional[str])
  • is_active (Optional[bool])
  • content (Optional[Doc | str])
  • model (Optional[str])
  • extensions (list[str])
  • data_source (Optional[str])
  • meta (dict[str, Any])
Source code in lexos/corpus/record.py
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
class Record(BaseModel):
    """The main Record model.

    Wraps document content (a raw string or a parsed spaCy ``Doc``) plus
    metadata, and provides serialization, statistics, and manipulation
    helpers.
    """

    # NOTE: `default_factory` is required here. A plain `= uuid.uuid4()`
    # default is evaluated once at class-definition time, so every Record
    # would silently share the same default id.
    id: int | UUID4 = Field(default_factory=uuid.uuid4)
    name: Optional[str] = None
    is_active: Optional[bool] = True
    content: Optional[Doc | str] = None
    model: Optional[str] = None
    extensions: list[str] = Field(default_factory=list)
    data_source: Optional[str] = None
    meta: dict[str, Any] = Field(default_factory=dict)

    model_config = ConfigDict(
        arbitrary_types_allowed=True,
        validate_assignment=True,
        json_schema_extra=DocJSONSchema.schema(),
    )

    @field_serializer("content")
    def serialize_content(self, content: Optional[Doc | str]) -> Optional[bytes | str]:
        """Serialize the content to bytes if it is a Doc object.

        Args:
            content (Optional[Doc | str]): The content to serialize.

        Returns:
            Optional[bytes | str]: The serialized content as bytes if it is a
                Doc, otherwise the original string (or None if unset).
        """
        if isinstance(content, Doc):
            # Stash token extension values in user_data so they survive the
            # round-trip through Doc.to_bytes()/from_bytes().
            content.user_data["extensions"] = {}
            for ext in self.extensions:
                content.user_data["extensions"][ext] = [
                    token._.get(ext) for token in content
                ]
            return content.to_bytes()
        return content

    @field_serializer("id")
    def serialize_id(self, id, _info) -> str:
        """Always serialize ID as string for JSON compatibility.

        Args:
            id (UUID|int|str): The ID value being serialized.
            _info (Any): Encoder info (pydantic serializer internals).

        Returns:
            str: The serialized ID as a string.
        """
        return str(id)

    @field_serializer("meta")
    def serialize_meta(self, meta: dict[str, Any]) -> dict[str, Any]:
        """Ensure metadata is JSON-serializable by converting special types to strings."""
        return self._sanitize_metadata(meta)

    def _sanitize_metadata(self, metadata: dict[str, Any]) -> dict[str, Any]:
        """Convert non-JSON-serializable types to strings.

        Args:
            metadata: Original metadata dictionary

        Returns:
            Sanitized metadata dictionary with JSON-serializable values
        """
        sanitized = {}
        for key, value in metadata.items():
            if isinstance(value, UUID):
                sanitized[key] = str(value)
            elif isinstance(value, (datetime, date)):
                sanitized[key] = value.isoformat()
            elif isinstance(value, Path):
                sanitized[key] = str(value)
            elif isinstance(value, dict):
                sanitized[key] = self._sanitize_metadata(value)  # Recursive
            elif isinstance(value, list):
                sanitized[key] = [
                    self._sanitize_metadata({"item": item})["item"]
                    if isinstance(item, dict)
                    else str(item)
                    if isinstance(item, (UUID, datetime, date, Path))
                    else item
                    for item in value
                ]
            else:
                sanitized[key] = value

        return sanitized

    def __repr__(self):
        """Return a string representation of the record."""
        # We exclude `terms`, `text`, and `tokens` here because these are
        # computed / cached fields that can rely on the record being parsed.
        # For unparsed records, evaluating these computed properties will
        # raise a LexosException. `__repr__` should be lightweight and safe
        # to call in debugging contexts, so we exclude these computed fields
        # intentionally.
        fields = self.model_dump(exclude=["terms", "text", "tokens"])
        fields["is_parsed"] = str(self.is_parsed)
        if self.content and self.is_parsed:
            fields["content"] = f"{self.content.text[:25]}..."
        elif self.content and not self.is_parsed:
            fields["content"] = f"{self.content[:25]}..."
        else:
            fields["content"] = "None"
        field_list = [f"{k}={v}" if v else f"{k}=None" for k, v in fields.items()]
        return f"Record({', '.join(field_list)})"

    def __str__(self) -> str:
        """Return a user-friendly string representation of the record for printing."""
        active = "True" if self.is_active else "False"
        parsed = "True" if self.is_parsed else "False"

        # Get a preview of content
        if self.content is None:
            content_preview = "None"
        elif self.is_parsed:
            content_preview = f"'{self.content.text[:40]}...'"
        else:
            content_preview = f"'{self.content[:40]}...'"

        return f"Record(id={self.id}, name={self.name!r}, active={active}, parsed={parsed}, content={content_preview})"

    @computed_field
    @cached_property
    def is_parsed(self) -> bool:
        """Return whether the record is parsed.

        Returns:
            bool: True if the record content is a spaCy Doc, False otherwise.
        """
        return isinstance(self.content, Doc)

    @computed_field
    @cached_property
    def preview(self) -> Optional[str]:
        """Return a preview of the record text.

        Returns:
            Optional[str]: A shortened preview of the record content, or
                None if content is None.
        """
        if self.content is None:
            return None

        if self.is_parsed:
            return f"{self.content.text[0:50]}..."
        return f"{self.content[0:500]}..."

    @computed_field
    @cached_property
    def terms(self) -> Counter:
        """Return the terms in the record.

        Returns:
            Counter: Collection mapping term -> count for the record.

        Raises:
            LexosException: If the record is not parsed.
        """
        if self.is_parsed:
            return Counter([t.text for t in self.content])
        else:
            raise LexosException("Record is not parsed.")

    @property
    def text(self) -> Optional[str]:
        """Return the text of the record.

        Returns:
            Optional[str]: The record text as string or None if no content is present.
        """
        if self.is_parsed:
            return self.content.text
        return self.content

    @cached_property
    def tokens(self) -> list[str]:
        """Return the tokens in the record.

        Returns:
            list[str]: A list of token strings extracted from the parsed content.

        Raises:
            LexosException: If the record is not parsed.
        """
        if self.is_parsed:
            return [t.text for t in self.content]
        else:
            raise LexosException("Record is not parsed.")

    def _doc_from_bytes(
        self,
        content: bytes,
        model: Optional[str] = None,
        model_cache: Optional[LexosModelCache] = None,
    ) -> Doc:
        """Convert bytes to a Doc object.

        Args:
            content (bytes): The bytes to convert.
            model (Optional[str]): The spaCy model to use for loading the Doc.
            model_cache (Optional[LexosModelCache]): An optional cache for spaCy models.

        Returns:
            Doc: The content as a Doc object.
        """
        # Create a Doc from the bytes
        vocab = self._get_vocab(model, model_cache)
        doc = Doc(vocab).from_bytes(content)

        # Restore extension values. Use `.get` so docs serialized without an
        # "extensions" entry (e.g. by older code paths) do not raise KeyError.
        stored_extensions = doc.user_data.get("extensions", {})
        for ext, values in stored_extensions.items():
            Token.set_extension(ext, default=None, force=True)
            for i in range(len(doc)):
                doc[i]._.set(ext, values[i])

        # Clean up user_data: keep only the extension names, not the values.
        doc.user_data["extensions"] = list(stored_extensions.keys())

        return doc

    # WARNING: This method is deprecated in favour of field serializer.
    def _doc_to_bytes(self) -> bytes:
        """Convert the content to bytes if it is a Doc object.

        Returns:
            bytes: The content as bytes.

        Raises:
            LexosException: If the content is not a Doc object.
        """
        if not isinstance(self.content, Doc):
            raise LexosException("Content is not a Doc object.")

        doc = self.content

        doc.user_data["extensions"] = {}
        for ext in self.extensions:
            doc.user_data["extensions"][ext] = [token._.get(ext) for token in doc]

        return doc.to_bytes()

    def _get_vocab(
        self, model: Optional[str] = None, model_cache: Optional[LexosModelCache] = None
    ) -> Vocab:
        """Get the vocabulary from the model or model cache.

        Args:
            model (Optional[str]): The spaCy model to use for loading the Doc.
            model_cache (Optional[LexosModelCache]): An optional cache for spaCy models.

        Returns:
            Vocab: The vocabulary of the model.

        Raises:
            LexosException: If no usable model is specified.
        """
        if model_cache and not model:
            raise LexosException("Model cache provided but no model specified.")

        if model_cache:
            return model_cache.get_model(model).vocab
        elif model:
            return spacy.load(model).vocab
        elif self.model:
            return spacy.load(self.model).vocab
        else:
            raise LexosException(
                "No model specified for loading the Doc. Please provide a model name or a model cache."
            )

    @validate_call(config=model_config)
    def from_bytes(
        self,
        bytestring: bytes,
        model: Optional[str] = None,
        model_cache: Optional[LexosModelCache] = None,
        verify_hash: bool = True,
    ) -> None:
        """Deserialise the record from bytes.

        Args:
            bytestring (bytes): The bytes to load the record from.
            model (Optional[str]): The spaCy model to use for loading the Doc.
            model_cache (Optional[LexosModelCache]): An optional cache for spaCy models.
            verify_hash (bool): Whether to verify data integrity hash. Defaults to True.

        Raises:
            LexosException: On corrupted data, hash mismatch, or model-loading failure.
        """
        try:
            data = msgpack.unpackb(bytestring)
        except Exception as e:
            raise LexosException(
                f"Failed to deserialize record: Invalid or corrupted data format. "
                f"Suggestion: Check if the file was completely written and not corrupted."
            ) from e

        # Verify data integrity if hash is present
        if verify_hash and "data_integrity_hash" in data:
            stored_hash = data["data_integrity_hash"]
            # Recreate hash from core data (excluding the hash itself)
            core_data = {k: v for k, v in data.items() if k != "data_integrity_hash"}
            core_bytes = msgpack.dumps(core_data)
            computed_hash = hashlib.sha256(core_bytes).hexdigest()

            if stored_hash != computed_hash:
                raise LexosException(
                    f"Data integrity check failed: Hash mismatch detected. "
                    f"Expected: {stored_hash[:16]}..., Got: {computed_hash[:16]}... "
                    f"Suggestion: The data may be corrupted during storage or transmission. "
                    f"Try re-serializing the original document."
                )

        # Update the record with the loaded data. `content` is deferred so it
        # can be converted back to a Doc below if needed.
        for k, v in data.items():
            if k in type(self).model_fields:
                if k != "content":
                    setattr(self, k, v)

        # If content is bytes, convert it back to a Doc object. Use `.get`
        # so payloads missing these keys fail gracefully instead of raising
        # a bare KeyError.
        if data.get("is_parsed") and isinstance(data.get("content"), bytes):
            if not model:
                model = data.get("model")
            try:
                self.content = self._doc_from_bytes(data["content"], model, model_cache)
            except OSError as e:
                raise LexosException(
                    f"Failed to load spaCy model '{model}': {str(e)}. "
                    f"Suggestion: Install the model with 'python -m spacy download {model}' "
                    f"or use a different model available in your environment."
                ) from e
            except Exception as e:
                raise LexosException(
                    f"Failed to deserialize spaCy document with model '{model}': {str(e)}. "
                    f"Suggestion: Check model compatibility - document may have been "
                    f"serialized with a different spaCy or model version."
                ) from e

    @validate_call(config=model_config)
    def from_disk(
        self,
        path: Path | str,
        model: Optional[str] = None,
        model_cache: Optional[LexosModelCache] = None,
    ) -> None:
        """Load the record from disk.

        Args:
            path (Path | str): The path to load the record from.
            model (Optional[str]): The spaCy model to use for loading the Doc.
            model_cache (Optional[LexosModelCache]): An optional cache for spaCy models.

        Raises:
            LexosException: If the path is missing, unreadable, or the data is invalid.
        """
        if not path:
            raise LexosException("No path specified for loading the record.")

        # Load the data from disk
        try:
            with open(path, "rb") as f:
                data = f.read()
        except FileNotFoundError as e:
            raise LexosException(
                f"Record file not found: {path}. "
                f"Suggestion: Check if the file path is correct and the file exists."
            ) from e
        except PermissionError as e:
            raise LexosException(
                f"Permission denied accessing record file: {path}. "
                f"Suggestion: Check file permissions or run with appropriate privileges."
            ) from e
        except IOError as e:
            raise LexosException(
                f"Failed to read record file: {path}. Error: {str(e)}. "
                f"Suggestion: Check disk space, file system health, or network connectivity."
            ) from e

        # Get the record content from the bytestring
        self.from_bytes(data, model=model, model_cache=model_cache)

    def least_common_terms(self, n: Optional[int] = None) -> list[tuple[str, int]]:
        """Return the least common terms.

        Args:
            n (Optional[int]): The number of least common terms to return. If None, return all terms.

        Returns:
            list[tuple[str, int]]: A list of (term, count) pairs sorted by least frequent.

        Raises:
            LexosException: If the record is not parsed.
        """
        if self.is_parsed:
            # Test `n is not None` (not truthiness) so n=0 returns an empty
            # list, mirroring Counter.most_common(0) in most_common_terms.
            ranked = sorted(self.terms.items(), key=lambda x: x[1])
            return ranked[:n] if n is not None else ranked
        else:
            raise LexosException("Record is not parsed.")

    def most_common_terms(self, n: Optional[int] = None) -> list[tuple[str, int]]:
        """Return the most common terms.

        Args:
            n (Optional[int]): The number of most common terms to return. If None, return all terms.

        Returns:
            list[tuple[str, int]]: A list of (term, count) pairs sorted by most frequent.

        Raises:
            LexosException: If the record is not parsed.
        """
        if self.is_parsed:
            return self.terms.most_common(n)
        else:
            raise LexosException("Record is not parsed.")

    def num_terms(self) -> int:
        """Return the number of terms.

        Returns:
            int: The count of unique terms in this record.

        Raises:
            LexosException: If the record is not parsed.
        """
        if self.is_parsed:
            return len(self.terms)
        else:
            raise LexosException("Record is not parsed.")

    def num_tokens(self) -> int:
        """Return the number of tokens.

        Returns:
            int: The count of token elements in this record.

        Raises:
            LexosException: If the record is not parsed.
        """
        if self.is_parsed:
            return len(self.tokens)
        else:
            raise LexosException("Record is not parsed.")

    @validate_call(config=model_config)
    def set(self, **props: Any) -> None:
        """Set a record property.

        Args:
            **props (Any): A dict containing the properties to set on the record.

        Returns:
            None
        """
        for k, v in props.items():
            setattr(self, k, v)

    @validate_call(config=model_config)
    def to_bytes(
        self, extensions: Optional[list[str]] = None, include_hash: bool = True
    ) -> bytes:
        """Serialize the record to bytes.

        Args:
            extensions (Optional[list[str]]): A list of extension names to include
                in the serialization. Defaults to None (use the record's own
                extensions). A `None` default avoids the shared-mutable-default
                pitfall of `= []`.
            include_hash (bool): Whether to include data integrity hash. Defaults to True.

        Returns:
            bytes: The serialized record.
        """
        # Handle extensions
        if extensions:
            self.extensions = list(set(self.extensions + extensions))

        # Convert record to a dictionary
        # model_dump is used to create a serializable dict representation.
        # We exclude the computed fields (`terms`, `text`, `tokens`) because
        # they might trigger evaluation and raise `LexosException` for
        # unparsed `Record` objects. The saved content is handled below,
        # and `id` is stringified to ensure JSON compatibility.
        data = self.model_dump(exclude=["terms", "text", "tokens"])

        # Make UUID serialisable
        data["id"] = str(data["id"])

        # WARNING: This code is deprecated in favour of field serializer.
        # Convert the content to bytes if it is a Doc object
        if self.is_parsed:
            data["content"] = self._doc_to_bytes()

        # Add data integrity hash if requested
        if include_hash:
            # Create hash of the core data (excluding the hash itself)
            core_data = {k: v for k, v in data.items() if k != "data_integrity_hash"}
            core_bytes = msgpack.dumps(core_data)
            data["data_integrity_hash"] = hashlib.sha256(core_bytes).hexdigest()

        return msgpack.dumps(data)

    @validate_call(config=model_config)
    def to_disk(self, path: Path | str, extensions: Optional[list[str]] = None) -> None:
        """Save the record to disk.

        Args:
            path (Path | str): The path to save the record to.
            extensions (Optional[list[str]]): A list of extension names to include in the serialization.

        Raises:
            LexosException: If the path is missing or the file cannot be written.
        """
        if not path:
            raise LexosException("No path specified for saving the record.")

        if not extensions:
            extensions = self.extensions

        # Serialize and save the record
        data = self.to_bytes(extensions)

        try:
            with open(path, "wb") as f:
                f.write(data)
        except PermissionError as e:
            raise LexosException(
                f"Permission denied writing to: {path}. "
                f"Suggestion: Check file/directory permissions or run with appropriate privileges."
            ) from e
        except OSError as e:
            if "No space left on device" in str(e):
                raise LexosException(
                    f"Insufficient disk space to save record: {path}. "
                    f"Suggestion: Free up disk space or choose a different location."
                ) from e
            else:
                raise LexosException(
                    f"Failed to write record to disk: {path}. Error: {str(e)}. "
                    f"Suggestion: Check disk space, file system health, or network connectivity."
                ) from e

    def vocab_density(self) -> float:
        """Return the vocabulary density.

        Returns:
            float: The vocabulary density of the record (unique terms / tokens),
                or 0.0 for a parsed but empty document.

        Raises:
            LexosException: If the record is not parsed.
        """
        if self.is_parsed:
            # Guard against ZeroDivisionError for a parsed empty document.
            total = self.num_tokens()
            return self.num_terms() / total if total else 0.0
        else:
            raise LexosException("Record is not parsed.")

is_parsed: bool cached property ¤

Return whether the record is parsed.

Returns:

Name Type Description
bool bool

True if the record content is a spaCy Doc, False otherwise.

preview: str cached property ¤

Return a preview of the record text.

Returns:

Type Description
str | None

str | None: A shortened preview of the record content, or None if content is None.

terms: Counter cached property ¤

Return the terms in the record.

Returns:

Name Type Description
Counter Counter

Collection mapping term -> count for the record.

text: str property ¤

Return the text of the record.

Returns:

Type Description
str

str | None: The record text as string or None if no content is present.

tokens: list[str] cached property ¤

Return the tokens in the record.

Returns:

Type Description
list[str]

list[str]: A list of token strings extracted from the parsed content.

__repr__() ¤

Return a string representation of the record.

Source code in lexos/corpus/record.py
def __repr__(self):
    """Return a string representation of the record."""
    # We exclude `terms`, `text`, and `tokens` here because these are
    # computed / cached fields that can rely on the record being parsed.
    # For unparsed records, evaluating these computed properties will
    # raise a LexosException. `__repr__` should be lightweight and safe
    # to call in debugging contexts, so we exclude these computed fields
    # intentionally.
    fields = self.model_dump(exclude=["terms", "text", "tokens"])
    fields["is_parsed"] = str(self.is_parsed)
    if self.content and self.is_parsed:
        fields["content"] = f"{self.content.text[:25]}..."
    elif self.content and not self.is_parsed:
        fields["content"] = f"{self.content[:25]}..."
    else:
        fields["content"] = "None"
    field_list = [f"{k}={v}" if v else f"{k}=None" for k, v in fields.items()]
    return f"Record({', '.join(field_list)})"

__str__() -> str ¤

Return a user-friendly string representation of the record for printing.

Source code in lexos/corpus/record.py
def __str__(self) -> str:
    """Return a user-friendly string representation of the record for printing."""
    active = "True" if self.is_active else "False"
    parsed = "True" if self.is_parsed else "False"

    # Get a preview of content
    if self.content is None:
        content_preview = "None"
    elif self.is_parsed:
        content_preview = f"'{self.content.text[:40]}...'"
    else:
        content_preview = f"'{self.content[:40]}...'"

    return f"Record(id={self.id}, name={self.name!r}, active={active}, parsed={parsed}, content={content_preview})"

from_bytes(bytestring: bytes, model: Optional[str] = None, model_cache: Optional[LexosModelCache] = None, verify_hash: bool = True) -> None ¤

Deserialise the record from bytes.

Parameters:

Name Type Description Default
bytestring bytes

The bytes to load the record from.

required
model Optional[str]

The spaCy model to use for loading the Doc.

None
model_cache Optional[LexosModelCache]

An optional cache for spaCy models.

None
verify_hash bool

Whether to verify data integrity hash. Defaults to True.

True
Source code in lexos/corpus/record.py
@validate_call(config=model_config)
def from_bytes(
    self,
    bytestring: bytes,
    model: Optional[str] = None,
    model_cache: Optional[LexosModelCache] = None,
    verify_hash: bool = True,
) -> None:
    """Deserialise the record from bytes.

    Args:
        bytestring (bytes): The bytes to load the record from.
        model (Optional[str]): The spaCy model to use for loading the Doc.
        model_cache (Optional[LexosModelCache]): An optional cache for spaCy models.
        verify_hash (bool): Whether to verify data integrity hash. Defaults to True.
    """
    try:
        data = msgpack.unpackb(bytestring)
    except Exception as e:
        raise LexosException(
            f"Failed to deserialize record: Invalid or corrupted data format. "
            f"Suggestion: Check if the file was completely written and not corrupted."
        ) from e

    # Verify data integrity if hash is present
    if verify_hash and "data_integrity_hash" in data:
        stored_hash = data["data_integrity_hash"]
        # Recreate hash from core data (excluding the hash itself)
        core_data = {k: v for k, v in data.items() if k != "data_integrity_hash"}
        core_bytes = msgpack.dumps(core_data)
        computed_hash = hashlib.sha256(core_bytes).hexdigest()

        if stored_hash != computed_hash:
            raise LexosException(
                f"Data integrity check failed: Hash mismatch detected. "
                f"Expected: {stored_hash[:16]}..., Got: {computed_hash[:16]}... "
                f"Suggestion: The data may be corrupted during storage or transmission. "
                f"Try re-serializing the original document."
            )

    # Update the record with the loaded data
    for k, v in data.items():
        if k in self.model_fields:
            if k != "content":
                setattr(self, k, v)

    # If content is bytes, convert it back to a Doc object
    if data["is_parsed"] and isinstance(data["content"], bytes):
        if not model:
            model = data.get("model")
        try:
            self.content = self._doc_from_bytes(data["content"], model, model_cache)
        except OSError as e:
            raise LexosException(
                f"Failed to load spaCy model '{model}': {str(e)}. "
                f"Suggestion: Install the model with 'python -m spacy download {model}' "
                f"or use a different model available in your environment."
            ) from e
        except Exception as e:
            raise LexosException(
                f"Failed to deserialize spaCy document with model '{model}': {str(e)}. "
                f"Suggestion: Check model compatibility - document may have been "
                f"serialized with a different spaCy or model version."
            ) from e

from_disk(path: Path | str, model: Optional[str] = None, model_cache: Optional[LexosModelCache] = None) -> None ¤

Load the record from disk.

Parameters:

Name Type Description Default
path Path | str

The path to load the record from.

required
model Optional[str]

The spaCy model to use for loading the Doc.

None
model_cache Optional[LexosModelCache]

An optional cache for spaCy models.

None
Source code in lexos/corpus/record.py
@validate_call(config=model_config)
def from_disk(
    self,
    path: Path | str,
    model: Optional[str] = None,
    model_cache: Optional[LexosModelCache] = None,
) -> None:
    """Load the record from disk.

    Args:
        path (Path | str): The path to load the record from.
        model (Optional[str]): The spaCy model to use for loading the Doc.
        model_cache (Optional[LexosModelCache]): An optional cache for spaCy models.
    """
    if not path:
        raise LexosException("No path specified for loading the record.")

    # Load the data from disk
    try:
        with open(path, "rb") as f:
            data = f.read()
    except FileNotFoundError as e:
        raise LexosException(
            f"Record file not found: {path}. "
            f"Suggestion: Check if the file path is correct and the file exists."
        ) from e
    except PermissionError as e:
        raise LexosException(
            f"Permission denied accessing record file: {path}. "
            f"Suggestion: Check file permissions or run with appropriate privileges."
        ) from e
    except IOError as e:
        raise LexosException(
            f"Failed to read record file: {path}. Error: {str(e)}. "
            f"Suggestion: Check disk space, file system health, or network connectivity."
        ) from e

    # Get the record content from the bytestring
    self.from_bytes(data, model=model, model_cache=model_cache)

least_common_terms(n: Optional[int] = None) -> list[tuple[str, int]] ¤

Return the least common terms.

Parameters:

Name Type Description Default
n Optional[int]

The number of least common terms to return. If None, return all terms.

None

Returns:

Type Description
list[tuple[str, int]]

list[tuple[str, int]]: A list of (term, count) pairs sorted by least frequent.

Source code in lexos/corpus/record.py
def least_common_terms(self, n: Optional[int] = None) -> list[tuple[str, int]]:
    """Return the least common terms.

    Args:
        n (Optional[int]): The number of least common terms to return. If None, return all terms.

    Returns:
        list[tuple[str, int]]: A list of (term, count) pairs sorted by least frequent.
    """
    if self.is_parsed:
        return (
            sorted(self.terms.items(), key=lambda x: x[1])[:n]
            if n
            else sorted(self.terms.items(), key=lambda x: x[1])
        )
    else:
        raise LexosException("Record is not parsed.")

most_common_terms(n: Optional[int] = None) -> list[tuple[str, int]] ¤

Return the most common terms.

Parameters:

Name Type Description Default
n Optional[int]

The number of most common terms to return. If None, return all terms.

None

Returns:

Type Description
list[tuple[str, int]]

list[tuple[str, int]]: A list of (term, count) pairs sorted by most frequent.

Source code in lexos/corpus/record.py
def most_common_terms(self, n: Optional[int] = None) -> list[tuple[str, int]]:
    """Return the most common terms.

    Args:
        n (Optional[int]): The number of most common terms to return. If None, return all terms.

    Returns:
        list[tuple[str, int]]: A list of (term, count) pairs sorted by most frequent.
    """
    if self.is_parsed:
        return self.terms.most_common(n)
    else:
        raise LexosException("Record is not parsed.")

num_terms() -> int ¤

Return the number of terms.

Returns:

Name Type Description
int int

The count of unique terms in this record.

Source code in lexos/corpus/record.py
def num_terms(self) -> int:
    """Return the number of terms.

    Returns:
        int: The count of unique terms in this record.

    Raises:
        LexosException: If the record is not parsed.
    """
    if not self.is_parsed:
        raise LexosException("Record is not parsed.")
    # `terms` maps term -> count, so its length is the unique-term count.
    return len(self.terms)

num_tokens() -> int ¤

Return the number of tokens.

Returns:

Name Type Description
int int

The count of token elements in this record.

Source code in lexos/corpus/record.py
def num_tokens(self) -> int:
    """Return the number of tokens.

    Returns:
        int: The count of token elements in this record.

    Raises:
        LexosException: If the record is not parsed.
    """
    if not self.is_parsed:
        raise LexosException("Record is not parsed.")
    return len(self.tokens)

serialize_content(content: Doc | str) -> bytes | str ¤

Serialize the content to bytes if it is a Doc object.

Parameters:

Name Type Description Default
content Doc | str

The content to serialize.

required

Returns:

Type Description
bytes | str

bytes | str: The serialized content as bytes if it is a Doc, otherwise the original string.

Source code in lexos/corpus/record.py
@field_serializer("content")
def serialize_content(self, content: Doc | str) -> bytes | str:
    """Serialize the content to bytes if it is a Doc object.

    Args:
        content (Doc | str): The content to serialize.

    Returns:
        bytes | str: The serialized content as bytes if it is a Doc,
            otherwise the original string.
    """
    # Plain strings are already serializable; pass them through untouched.
    if not isinstance(content, Doc):
        return content
    # Stash per-token custom-extension values in user_data so they survive
    # spaCy serialization, which does not preserve Token._ attributes.
    content.user_data["extensions"] = {
        ext: [token._.get(ext) for token in content] for ext in self.extensions
    }
    return content.to_bytes()

serialize_id(id, _info) -> str ¤

Always serialize ID as string for JSON compatibility.

Parameters:

Name Type Description Default
id UUID | int | str

The ID value being serialized.

required
_info Any

Encoder info (pydantic serializer internals).

required

Returns:

Name Type Description
str str

The serialized ID as a string.

Source code in lexos/corpus/record.py
@field_serializer("id")
def serialize_id(self, id, _info) -> str:
    """Always serialize ID as string for JSON compatibility.

    Args:
        id (UUID|int|str): The ID value being serialized.
        _info (Any): Encoder info (pydantic serializer internals).

    Returns:
        str: The serialized ID as a string.
    """
    # str() handles UUID, int, and str inputs uniformly.
    return str(id)

serialize_meta(meta: dict[str, Any]) -> dict[str, Any] ¤

Ensure metadata is JSON-serializable by converting special types to strings.

Source code in lexos/corpus/record.py
@field_serializer("meta")
def serialize_meta(self, meta: dict[str, Any]) -> dict[str, Any]:
    """Ensure metadata is JSON-serializable by converting special types to strings.

    Args:
        meta (dict[str, Any]): The metadata dictionary being serialized.

    Returns:
        dict[str, Any]: A sanitized copy with UUID/datetime/date/Path
            values converted to strings.
    """
    # Delegate to the shared helper so pydantic dumps and manual calls agree.
    return self._sanitize_metadata(meta)

set(**props: Any) -> None ¤

Set a record property.

Parameters:

Name Type Description Default
**props Any

A dict containing the properties to set on the record.

{}

Returns:

Type Description
None

None

Source code in lexos/corpus/record.py
@validate_call(config=model_config)
def set(self, **props: Any) -> None:
    """Set a record property.

    Args:
        **props (Any): A dict containing the properties to set on the record.

    Returns:
        None
    """
    # Assign each supplied property onto the record; per the model config,
    # validate_assignment re-validates every assignment.
    for name in props:
        setattr(self, name, props[name])

to_bytes(extensions: Optional[list[str]] = [], include_hash: bool = True) -> bytes ¤

Serialize the record to a byte string.

Parameters:

Name Type Description Default
extensions list[str]

A list of extension names to include in the serialization.

[]
include_hash bool

Whether to include data integrity hash. Defaults to True.

True

Returns:

Name Type Description
bytes bytes

The serialized record.

Source code in lexos/corpus/record.py
@validate_call(config=model_config)
def to_bytes(
    self, extensions: Optional[list[str]] = None, include_hash: bool = True
) -> bytes:
    """Serialize the record to bytes.

    Args:
        extensions (Optional[list[str]]): A list of extension names to include
            in the serialization. Defaults to None (no extra extensions).
        include_hash (bool): Whether to include data integrity hash. Defaults to True.

    Returns:
        bytes: The serialized record (msgpack-encoded).
    """
    # Merge requested extensions into the record's list, de-duplicated.
    # NOTE: the default is None rather than a mutable `[]` so the default
    # object cannot be shared (and mutated) across calls.
    if extensions:
        self.extensions = list(set(self.extensions + extensions))

    # Convert record to a dictionary
    # model_dump is used to create a serializable dict representation.
    # We exclude the computed fields (`terms`, `text`, `tokens`) because
    # they might trigger evaluation and raise `LexosException` for
    # unparsed `Record` objects. The saved content is handled below,
    # and `id` is stringified to ensure JSON compatibility.
    data = self.model_dump(exclude=["terms", "text", "tokens"])

    # Make UUID serialisable
    data["id"] = str(data["id"])

    # WARNING: This code is deprecated in favour of field serializer.
    # Convert the content to bytes if it is a Doc object
    if self.is_parsed:
        data["content"] = self._doc_to_bytes()

    # Add data integrity hash if requested
    if include_hash:
        # Create hash of the core data (excluding the hash itself)
        core_data = {k: v for k, v in data.items() if k != "data_integrity_hash"}
        core_bytes = msgpack.dumps(core_data)
        data["data_integrity_hash"] = hashlib.sha256(core_bytes).hexdigest()

    return msgpack.dumps(data)

to_disk(path: Path | str, extensions: Optional[list[str]] = None) -> None ¤

Save the record to disk.

Parameters:

Name Type Description Default
path Path | str

The path to save the record to.

required
extensions list[str]

A list of extension names to include in the serialization.

None
Source code in lexos/corpus/record.py
@validate_call(config=model_config)
def to_disk(self, path: Path | str, extensions: Optional[list[str]] = None) -> None:
    """Save the record to disk.

    Args:
        path (Path | str): The path to save the record to.
        extensions (list[str]): A list of extension names to include in the serialization.

    Raises:
        LexosException: If no path is given or the write fails.
    """
    if not path:
        raise LexosException("No path specified for saving the record.")

    # Fall back to the record's own extensions when none are supplied.
    payload = self.to_bytes(extensions or self.extensions)

    try:
        with open(path, "wb") as f:
            f.write(payload)
    except PermissionError as e:
        raise LexosException(
            f"Permission denied writing to: {path}. "
            f"Suggestion: Check file/directory permissions or run with appropriate privileges."
        ) from e
    except OSError as e:
        if "No space left on device" not in str(e):
            raise LexosException(
                f"Failed to write record to disk: {path}. Error: {str(e)}. "
                f"Suggestion: Check disk space, file system health, or network connectivity."
            ) from e
        raise LexosException(
            f"Insufficient disk space to save record: {path}. "
            f"Suggestion: Free up disk space or choose a different location."
        ) from e

vocab_density() -> float ¤

Return the vocabulary density.

Returns:

Name Type Description
float float

The vocabulary density of the record.

Source code in lexos/corpus/record.py
def vocab_density(self) -> float:
    """Return the vocabulary density.

    Returns:
        float: The vocabulary density (unique terms / total tokens) of the
            record, or 0.0 for a parsed record with no tokens.

    Raises:
        LexosException: If the record is not parsed.
    """
    if not self.is_parsed:
        raise LexosException("Record is not parsed.")
    total = self.num_tokens()
    # Guard against empty documents to avoid ZeroDivisionError.
    if total == 0:
        return 0.0
    return self.num_terms() / total
rendering:
  show_root_heading: true
  heading_level: 3

serialize_content(content: Doc | str) -> bytes | str ¤

Serialize the content to bytes if it is a Doc object.

Parameters:

Name Type Description Default
content Doc | str

The content to serialize.

required

Returns:

Type Description
bytes | str

bytes | str: The serialized content as bytes if it is a Doc, otherwise the original string.

Source code in lexos/corpus/record.py
@field_serializer("content")
def serialize_content(self, content: Doc | str) -> bytes | str:
    """Serialize the content to bytes if it is a Doc object.

    Args:
        content (Doc | str): The content to serialize.

    Returns:
        bytes | str: The serialized content as bytes if it is a Doc, otherwise the original string.
    """
    if isinstance(content, Doc):
        content.user_data["extensions"] = {}
        for ext in self.extensions:
            content.user_data["extensions"][ext] = [
                token._.get(ext) for token in content
            ]
        return content.to_bytes()
    return content
rendering:
  show_root_heading: true
  heading_level: 3

serialize_id(id, _info) -> str ¤

Always serialize ID as string for JSON compatibility.

Parameters:

Name Type Description Default
id UUID | int | str

The ID value being serialized.

required
_info Any

Encoder info (pydantic serializer internals).

required

Returns:

Name Type Description
str str

The serialized ID as a string.

Source code in lexos/corpus/record.py
@field_serializer("id")
def serialize_id(self, id, _info) -> str:
    """Always serialize ID as string for JSON compatibility.

    Args:
        id (UUID|int|str): The ID value being serialized.
        _info (Any): Encoder info (pydantic serializer internals).

    Returns:
        str: The serialized ID as a string.
    """
    return str(id)
rendering:
  show_root_heading: true
  heading_level: 3

serialize_meta(meta: dict[str, Any]) -> dict[str, Any] ¤

Ensure metadata is JSON-serializable by converting special types to strings.

Source code in lexos/corpus/record.py
@field_serializer("meta")
def serialize_meta(self, meta: dict[str, Any]) -> dict[str, Any]:
    """Ensure metadata is JSON-serializable by converting special types to strings."""
    return self._sanitize_metadata(meta)
rendering:
  show_root_heading: true
  heading_level: 3

_sanitize_metadata(metadata: dict[str, Any]) -> dict[str, Any] ¤

Convert non-JSON-serializable types to strings.

Parameters:

Name Type Description Default
metadata dict[str, Any]

Original metadata dictionary

required

Returns:

Type Description
dict[str, Any]

Sanitized metadata dictionary with JSON-serializable values

Source code in lexos/corpus/record.py
def _sanitize_metadata(self, metadata: dict[str, Any]) -> dict[str, Any]:
    """Convert non-JSON-serializable types to strings.

    Args:
        metadata: Original metadata dictionary

    Returns:
        Sanitized metadata dictionary with JSON-serializable values
    """

    def _clean(value: Any) -> Any:
        # One recursive converter applied uniformly at every nesting depth.
        # This fixes two inconsistencies in the previous version: datetimes
        # inside lists were str()-ed instead of isoformat()-ed, and lists
        # nested inside lists were not sanitized at all.
        if isinstance(value, (UUID, Path)):
            return str(value)
        if isinstance(value, (datetime, date)):
            return value.isoformat()
        if isinstance(value, dict):
            return {k: _clean(v) for k, v in value.items()}
        if isinstance(value, list):
            return [_clean(item) for item in value]
        return value

    return {key: _clean(value) for key, value in metadata.items()}
rendering:
  show_root_heading: true
  heading_level: 3

__repr__() ¤

Return a string representation of the record.

Source code in lexos/corpus/record.py
def __repr__(self):
    """Return a string representation of the record.

    Returns:
        str: A `Record(k=v, ...)` summary built from the model fields.
    """
    # We exclude `terms`, `text`, and `tokens` here because these are
    # computed / cached fields that can rely on the record being parsed.
    # For unparsed records, evaluating these computed properties will
    # raise a LexosException. `__repr__` should be lightweight and safe
    # to call in debugging contexts, so we exclude these computed fields
    # intentionally.
    fields = self.model_dump(exclude=["terms", "text", "tokens"])
    fields["is_parsed"] = str(self.is_parsed)
    if self.content and self.is_parsed:
        # Parsed content is a spaCy Doc: preview its text attribute.
        fields["content"] = f"{self.content.text[:25]}..."
    elif self.content and not self.is_parsed:
        # Unparsed content is a plain string: preview it directly.
        fields["content"] = f"{self.content[:25]}..."
    else:
        fields["content"] = "None"
    # NOTE(review): `if v` renders ANY falsy value (0, False, "") as
    # "None", not just actual None — confirm this display is intended.
    field_list = [f"{k}={v}" if v else f"{k}=None" for k, v in fields.items()]
    return f"Record({', '.join(field_list)})"
rendering:
  show_root_heading: true
  heading_level: 3

__str__() -> str ¤

Return a user-friendly string representation of the record for printing.

Source code in lexos/corpus/record.py
def __str__(self) -> str:
    """Return a user-friendly string representation of the record for printing."""
    # Render the flags as "True"/"False"; None counts as False.
    active = str(bool(self.is_active))
    parsed = str(bool(self.is_parsed))

    # Build a short preview of the content (Doc text when parsed,
    # raw string otherwise).
    if self.content is None:
        preview = "None"
    elif self.is_parsed:
        preview = f"'{self.content.text[:40]}...'"
    else:
        preview = f"'{self.content[:40]}...'"

    return f"Record(id={self.id}, name={self.name!r}, active={active}, parsed={parsed}, content={preview})"
rendering:
  show_root_heading: true
  heading_level: 3

is_parsed: bool cached property ¤

Return whether the record is parsed.

Returns:

Name Type Description
bool bool

True if the record content is a spaCy Doc, False otherwise.

rendering:
  show_root_heading: true
  heading_level: 3

preview: str cached property ¤

Return a preview of the record text.

Returns:

Type Description
str

str | None: A shortened preview of the record content, or None if content is None.

rendering:
  show_root_heading: true
  heading_level: 3

terms: Counter cached property ¤

Return the terms in the record.

Returns:

Name Type Description
Counter Counter

Collection mapping term -> count for the record.

rendering:
  show_root_heading: true
  heading_level: 3

text: str property ¤

Return the text of the record.

Returns:

Type Description
str

str | None: The record text as string or None if no content is present.

rendering:
  show_root_heading: true
  heading_level: 3

tokens: list[str] cached property ¤

Return the tokens in the record.

Returns:

Type Description
list[str]

list[str]: A list of token strings extracted from the parsed content.

rendering:
  show_root_heading: true
  heading_level: 3

_doc_from_bytes(content: bytes, model: Optional[str] = None, model_cache: Optional[LexosModelCache] = None) -> Doc ¤

Convert bytes to a Doc object.

Parameters:

Name Type Description Default
content bytes

The bytes to convert.

required
model Optional[str]

The spaCy model to use for loading the Doc.

None
model_cache Optional[LexosModelCache]

An optional cache for spaCy models.

None

Returns:

Name Type Description
Doc Doc

The content as a Doc object.

Source code in lexos/corpus/record.py
def _doc_from_bytes(
    self,
    content: bytes,
    model: Optional[str] = None,
    model_cache: Optional[LexosModelCache] = None,
) -> Doc:
    """Convert bytes to a Doc object.

    Args:
        content (bytes): The bytes to convert.
        model (Optional[str]): The spaCy model to use for loading the Doc.
        model_cache (Optional[LexosModelCache]): An optional cache for spaCy models.

    Returns:
        Doc: The content as a Doc object.

    Raises:
        LexosException: Propagated from `_get_vocab` when no model can be resolved.
    """
    # Create a Doc from the bytes
    vocab = self._get_vocab(model, model_cache)
    doc = Doc(vocab).from_bytes(content)

    # Restore extension values
    # NOTE(review): assumes `user_data["extensions"]` was written at
    # serialization time (see `_doc_to_bytes`); raises KeyError otherwise.
    for ext, values in doc.user_data["extensions"].items():
        # Re-register the extension on Token (force=True overwrites any
        # existing registration), then re-apply the saved per-token values.
        Token.set_extension(ext, default=None, force=True)
        for i in range(len(doc)):
            doc[i]._.set(ext, values[i])

    # Clean up user_data
    # Keep only the extension names now that values live on the tokens again.
    doc.user_data["extensions"] = list(doc.user_data["extensions"].keys())

    return doc
rendering:
  show_root_heading: true
  heading_level: 3

_doc_to_bytes() -> bytes ¤

Convert the content to bytes if it is a Doc object.

Returns:

Name Type Description
bytes bytes

The content as bytes.

Source code in lexos/corpus/record.py
def _doc_to_bytes(self) -> bytes:
    """Convert the content to bytes if it is a Doc object.

    Returns:
        bytes: The content as bytes.

    Raises:
        LexosException: If the content is not a spaCy Doc.
    """
    if not isinstance(self.content, Doc):
        raise LexosException("Content is not a Doc object.")

    doc = self.content
    # Record per-token custom-extension values in user_data so they
    # survive serialization (Token._ attributes are not serialized).
    doc.user_data["extensions"] = {
        ext: [token._.get(ext) for token in doc] for ext in self.extensions
    }
    return doc.to_bytes()
rendering:
  show_root_heading: true
  heading_level: 3

_get_vocab(model: Optional[str] = None, model_cache: Optional[LexosModelCache] = None) -> Vocab ¤

Get the vocabulary from the model or model cache.

Parameters:

Name Type Description Default
model Optional[str]

The spaCy model to use for loading the Doc.

None
model_cache Optional[LexosModelCache]

An optional cache for spaCy models.

None

Returns:

Name Type Description
Vocab Vocab

The vocabulary of the model.

Source code in lexos/corpus/record.py
def _get_vocab(
    self, model: Optional[str] = None, model_cache: Optional[LexosModelCache] = None
) -> Vocab:
    """Get the vocabulary from the model or model cache.

    Args:
        model (Optional[str]): The spaCy model to use for loading the Doc.
        model_cache (Optional[LexosModelCache]): An optional cache for spaCy models.

    Returns:
        Vocab: The vocabulary of the model.

    Raises:
        LexosException: If a cache is given without a model, or no model
            can be resolved at all.
    """
    # A cache is only usable when we also know which model to ask it for.
    if model_cache and not model:
        raise LexosException("Model cache provided but no model specified.")
    # Resolution order: cache, explicit model, then the record's own model.
    if model_cache:
        return model_cache.get_model(model).vocab
    if model:
        return spacy.load(model).vocab
    if self.model:
        return spacy.load(self.model).vocab
    raise LexosException(
        "No model specified for loading the Doc. Please provide a model name or a model cache."
    )
rendering:
  show_root_heading: true
  heading_level: 3

from_bytes(bytestring: bytes, model: Optional[str] = None, model_cache: Optional[LexosModelCache] = None, verify_hash: bool = True) -> None ¤

Deserialise the record from bytes.

Parameters:

Name Type Description Default
bytestring bytes

The bytes to load the record from.

required
model Optional[str]

The spaCy model to use for loading the Doc.

None
model_cache Optional[LexosModelCache]

An optional cache for spaCy models.

None
verify_hash bool

Whether to verify data integrity hash. Defaults to True.

True
Source code in lexos/corpus/record.py
@validate_call(config=model_config)
def from_bytes(
    self,
    bytestring: bytes,
    model: Optional[str] = None,
    model_cache: Optional[LexosModelCache] = None,
    verify_hash: bool = True,
) -> None:
    """Deserialise the record from bytes.

    Args:
        bytestring (bytes): The bytes to load the record from.
        model (Optional[str]): The spaCy model to use for loading the Doc.
        model_cache (Optional[LexosModelCache]): An optional cache for spaCy models.
        verify_hash (bool): Whether to verify data integrity hash. Defaults to True.

    Raises:
        LexosException: On malformed msgpack data, hash mismatch, or a
            failure to load the spaCy model / document.
    """
    try:
        data = msgpack.unpackb(bytestring)
    except Exception as e:
        raise LexosException(
            f"Failed to deserialize record: Invalid or corrupted data format. "
            f"Suggestion: Check if the file was completely written and not corrupted."
        ) from e

    # Verify data integrity if hash is present
    if verify_hash and "data_integrity_hash" in data:
        stored_hash = data["data_integrity_hash"]
        # Recreate hash from core data (excluding the hash itself),
        # mirroring how `to_bytes` computed it.
        core_data = {k: v for k, v in data.items() if k != "data_integrity_hash"}
        core_bytes = msgpack.dumps(core_data)
        computed_hash = hashlib.sha256(core_bytes).hexdigest()

        if stored_hash != computed_hash:
            raise LexosException(
                f"Data integrity check failed: Hash mismatch detected. "
                f"Expected: {stored_hash[:16]}..., Got: {computed_hash[:16]}... "
                f"Suggestion: The data may be corrupted during storage or transmission. "
                f"Try re-serializing the original document."
            )

    # Update the record with the loaded data.
    # Only known model fields are restored; `content` is deferred so the
    # bytes → Doc conversion below can run after the other fields are set.
    for k, v in data.items():
        if k in self.model_fields:
            if k != "content":
                setattr(self, k, v)

    # If content is bytes, convert it back to a Doc object
    # NOTE(review): assumes the payload contains "is_parsed" and "content"
    # keys (as written by `to_bytes`); a KeyError here means the payload
    # came from another producer — confirm.
    if data["is_parsed"] and isinstance(data["content"], bytes):
        if not model:
            # Fall back to the model name stored in the payload, if any.
            model = data.get("model")
        try:
            self.content = self._doc_from_bytes(data["content"], model, model_cache)
        except OSError as e:
            raise LexosException(
                f"Failed to load spaCy model '{model}': {str(e)}. "
                f"Suggestion: Install the model with 'python -m spacy download {model}' "
                f"or use a different model available in your environment."
            ) from e
        except Exception as e:
            raise LexosException(
                f"Failed to deserialize spaCy document with model '{model}': {str(e)}. "
                f"Suggestion: Check model compatibility - document may have been "
                f"serialized with a different spaCy or model version."
            ) from e
rendering:
  show_root_heading: true
  heading_level: 3

from_disk(path: Path | str, model: Optional[str] = None, model_cache: Optional[LexosModelCache] = None) -> None ¤

Load the record from disk.

Parameters:

Name Type Description Default
path Path | str

The path to load the record from.

required
model Optional[str]

The spaCy model to use for loading the Doc.

None
model_cache Optional[LexosModelCache]

An optional cache for spaCy models.

None
Source code in lexos/corpus/record.py
@validate_call(config=model_config)
def from_disk(
    self,
    path: Path | str,
    model: Optional[str] = None,
    model_cache: Optional[LexosModelCache] = None,
) -> None:
    """Load the record from disk.

    Args:
        path (Path | str): The path to load the record from.
        model (Optional[str]): The spaCy model to use for loading the Doc.
        model_cache (Optional[LexosModelCache]): An optional cache for spaCy models.

    Raises:
        LexosException: If no path is given, the file cannot be read, or
            deserialization fails (via `from_bytes`).
    """
    if not path:
        raise LexosException("No path specified for loading the record.")

    # Load the data from disk
    # The specific OSError subclasses are caught before the generic IOError
    # so each failure mode gets a targeted message.
    try:
        with open(path, "rb") as f:
            data = f.read()
    except FileNotFoundError as e:
        raise LexosException(
            f"Record file not found: {path}. "
            f"Suggestion: Check if the file path is correct and the file exists."
        ) from e
    except PermissionError as e:
        raise LexosException(
            f"Permission denied accessing record file: {path}. "
            f"Suggestion: Check file permissions or run with appropriate privileges."
        ) from e
    except IOError as e:
        raise LexosException(
            f"Failed to read record file: {path}. Error: {str(e)}. "
            f"Suggestion: Check disk space, file system health, or network connectivity."
        ) from e

    # Get the record content from the bytestring
    self.from_bytes(data, model=model, model_cache=model_cache)
rendering:
  show_root_heading: true
  heading_level: 3

least_common_terms(n: Optional[int] = None) -> list[tuple[str, int]] ¤

Return the least common terms.

Parameters:

Name Type Description Default
n Optional[int]

The number of least common terms to return. If None, return all terms.

None

Returns:

Type Description
list[tuple[str, int]]

list[tuple[str, int]]: A list of (term, count) pairs sorted by least frequent.

Source code in lexos/corpus/record.py
def least_common_terms(self, n: Optional[int] = None) -> list[tuple[str, int]]:
    """Return the least common terms.

    Args:
        n (Optional[int]): The number of least common terms to return. If None, return all terms.

    Returns:
        list[tuple[str, int]]: A list of (term, count) pairs sorted by least frequent.
    """
    if self.is_parsed:
        return (
            sorted(self.terms.items(), key=lambda x: x[1])[:n]
            if n
            else sorted(self.terms.items(), key=lambda x: x[1])
        )
    else:
        raise LexosException("Record is not parsed.")
rendering:
  show_root_heading: true
  heading_level: 3

most_common_terms(n: Optional[int] = None) -> list[tuple[str, int]] ¤

Return the most common terms.

Parameters:

Name Type Description Default
n Optional[int]

The number of most common terms to return. If None, return all terms.

None

Returns:

Type Description
list[tuple[str, int]]

list[tuple[str, int]]: A list of (term, count) pairs sorted by most frequent.

Source code in lexos/corpus/record.py
def most_common_terms(self, n: Optional[int] = None) -> list[tuple[str, int]]:
    """Return the most common terms.

    Args:
        n (Optional[int]): The number of most common terms to return. If None, return all terms.

    Returns:
        list[tuple[str, int]]: A list of (term, count) pairs sorted by most frequent.
    """
    if self.is_parsed:
        return self.terms.most_common(n)
    else:
        raise LexosException("Record is not parsed.")
rendering:
  show_root_heading: true
  heading_level: 3

num_terms() -> int ¤

Return the number of terms.

Returns:

Name Type Description
int int

The count of unique terms in this record.

Source code in lexos/corpus/record.py
def num_terms(self) -> int:
    """Return the number of terms.

    Returns:
        int: The count of unique terms in this record.
    """
    if self.is_parsed:
        return len(self.terms)
    else:
        raise LexosException("Record is not parsed.")
rendering:
  show_root_heading: true
  heading_level: 3

num_tokens() -> int ¤

Return the number of tokens.

Returns:

Name Type Description
int int

The count of token elements in this record.

Source code in lexos/corpus/record.py
def num_tokens(self) -> int:
    """Return the number of tokens.

    Returns:
        int: The count of token elements in this record.
    """
    if self.is_parsed:
        return len(self.tokens)
    else:
        raise LexosException("Record is not parsed.")
rendering:
  show_root_heading: true
  heading_level: 3

set(**props: Any) -> None ¤

Set a record property.

Parameters:

Name Type Description Default
**props Any

A dict containing the properties to set on the record.

{}

Returns:

Type Description
None

None

Source code in lexos/corpus/record.py
@validate_call(config=model_config)
def set(self, **props: Any) -> None:
    """Set a record property.

    Args:
        **props (Any): A dict containing the properties to set on the record.

    Returns:
        None
    """
    for k, v in props.items():
        setattr(self, k, v)
rendering:
  show_root_heading: true
  heading_level: 3

to_bytes(extensions: Optional[list[str]] = [], include_hash: bool = True) -> bytes ¤

Serialize the record to a byte string.

Parameters:

Name Type Description Default
extensions list[str]

A list of extension names to include in the serialization.

[]
include_hash bool

Whether to include data integrity hash. Defaults to True.

True

Returns:

Name Type Description
bytes bytes

The serialized record.

Source code in lexos/corpus/record.py
@validate_call(config=model_config)
def to_bytes(
    self, extensions: Optional[list[str]] = [], include_hash: bool = True
) -> bytes:
    """Serialize the record to a dictionary.

    Args:
        extensions (list[str]): A list of extension names to include in the serialization.
        include_hash (bool): Whether to include data integrity hash. Defaults to True.

    Returns:
        bytes: The serialized record.
    """
    # Handle extensions
    if extensions:
        self.extensions = list(set(self.extensions + extensions))

    # Convert record to a dictionary
    # model_dump is used to create a serializable dict representation.
    # We exclude the computed fields (`terms`, `text`, `tokens`) because
    # they might trigger evaluation and raise `LexosException` for
    # unparsed `Record` objects. The saved content is handled below,
    # and `id` is stringified to ensure JSON compatibility.
    data = self.model_dump(exclude=["terms", "text", "tokens"])

    # Make UUID serialisable
    data["id"] = str(data["id"])

    # WARNING: This code is deprecated in favour of field serializer.
    # Convert the content to bytes if it is a Doc object
    if self.is_parsed:
        data["content"] = self._doc_to_bytes()

    # Add data integrity hash if requested
    if include_hash:
        # Create hash of the core data (excluding the hash itself)
        core_data = {k: v for k, v in data.items() if k != "data_integrity_hash"}
        core_bytes = msgpack.dumps(core_data)
        data["data_integrity_hash"] = hashlib.sha256(core_bytes).hexdigest()

    return msgpack.dumps(data)
rendering:
  show_root_heading: true
  heading_level: 3

to_disk(path: Path | str, extensions: Optional[list[str]] = None) -> None ¤

Save the record to disk.

Parameters:

Name Type Description Default
path Path | str

The path to save the record to.

required
extensions list[str]

A list of extension names to include in the serialization.

None
Source code in lexos/corpus/record.py
@validate_call(config=model_config)
def to_disk(self, path: Path | str, extensions: Optional[list[str]] = None) -> None:
    """Save the record to disk.

    Args:
        path (Path | str): The path to save the record to.
        extensions (list[str]): A list of extension names to include in the serialization.
    """
    if not path:
        raise LexosException("No path specified for saving the record.")

    if not extensions:
        extensions = self.extensions

    # Serialize and save the record
    data = self.to_bytes(extensions)

    try:
        with open(path, "wb") as f:
            f.write(data)
    except PermissionError as e:
        raise LexosException(
            f"Permission denied writing to: {path}. "
            f"Suggestion: Check file/directory permissions or run with appropriate privileges."
        ) from e
    except OSError as e:
        if "No space left on device" in str(e):
            raise LexosException(
                f"Insufficient disk space to save record: {path}. "
                f"Suggestion: Free up disk space or choose a different location."
            ) from e
        else:
            raise LexosException(
                f"Failed to write record to disk: {path}. Error: {str(e)}. "
                f"Suggestion: Check disk space, file system health, or network connectivity."
            ) from e
rendering:
  show_root_heading: true
  heading_level: 3

vocab_density() -> float ¤

Return the vocabulary density.

Returns:

Name Type Description
float float

The vocabulary density of the record.

Source code in lexos/corpus/record.py
def vocab_density(self) -> float:
    """Return the vocabulary density.

    Returns:
        float: The vocabulary density of the record.
    """
    if self.is_parsed:
        return self.num_terms() / self.num_tokens()
    else:
        raise LexosException("Record is not parsed.")
rendering:
  show_root_heading: true
  heading_level: 3