Skip to content

record¤

Module Description¤

The record module provides the Record class, which is the building block for every document in your corpus. Each Record wraps your text (or a parsed spaCy Doc) and metadata and offers a suite of methods for serialization, statistics, and manipulation.

Record pydantic-model ¤

Bases: BaseModel

The main Record model.

Config:

  • arbitrary_types_allowed: True
  • validate_assignment: True
  • json_schema_extra: DocJSONSchema.schema()

Fields:

  • id (int | UUID4)
  • name (Optional[str])
  • is_active (Optional[bool])
  • content (Optional[Doc | str])
  • model (Optional[str])
  • extensions (list[str])
  • data_source (Optional[str])
  • meta (dict[str, Any])
Source code in lexos/corpus/record.py
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
class Record(BaseModel):
    """The main Record model.

    Wraps document content (a raw string or a parsed spaCy ``Doc``) plus
    metadata, and provides serialization, statistics, and manipulation
    helpers.
    """

    # NOTE: `default_factory` is required here. A plain `= uuid.uuid4()`
    # default is evaluated once at class-definition time, so every Record
    # would silently share the same default id.
    id: int | UUID4 = Field(default_factory=uuid.uuid4)
    name: Optional[str] = None
    is_active: Optional[bool] = True
    content: Optional[Doc | str] = None
    model: Optional[str] = None
    extensions: list[str] = Field(default_factory=list)
    data_source: Optional[str] = None
    meta: dict[str, Any] = Field(default_factory=dict)

    model_config = ConfigDict(
        arbitrary_types_allowed=True,
        validate_assignment=True,
        json_schema_extra=DocJSONSchema.schema(),
    )

    @field_serializer("content")
    def serialize_content(self, content: Optional[Doc | str]) -> Optional[bytes | str]:
        """Serialize the content to bytes if it is a Doc object.

        Args:
            content (Optional[Doc | str]): The content to serialize.

        Returns:
            Optional[bytes | str]: The serialized content as bytes if it is a
                Doc, otherwise the original string (or None if unset).
        """
        if isinstance(content, Doc):
            # Stash token extension values in user_data so they survive the
            # round-trip through Doc.to_bytes()/from_bytes().
            content.user_data["extensions"] = {}
            for ext in self.extensions:
                content.user_data["extensions"][ext] = [
                    token._.get(ext) for token in content
                ]
            return content.to_bytes()
        return content

    @field_serializer("id")
    def serialize_id(self, id, _info) -> str:
        """Always serialize ID as string for JSON compatibility.

        Args:
            id (UUID|int|str): The ID value being serialized.
            _info (Any): Encoder info (pydantic serializer internals).

        Returns:
            str: The serialized ID as a string.
        """
        return str(id)

    @field_serializer("meta")
    def serialize_meta(self, meta: dict[str, Any]) -> dict[str, Any]:
        """Ensure metadata is JSON-serializable by converting special types to strings."""
        return self._sanitize_metadata(meta)

    def _sanitize_metadata(self, metadata: dict[str, Any]) -> dict[str, Any]:
        """Convert non-JSON-serializable types to strings.

        Args:
            metadata: Original metadata dictionary

        Returns:
            Sanitized metadata dictionary with JSON-serializable values
        """
        sanitized = {}
        for key, value in metadata.items():
            if isinstance(value, UUID):
                sanitized[key] = str(value)
            elif isinstance(value, (datetime, date)):
                sanitized[key] = value.isoformat()
            elif isinstance(value, Path):
                sanitized[key] = str(value)
            elif isinstance(value, dict):
                sanitized[key] = self._sanitize_metadata(value)  # Recursive
            elif isinstance(value, list):
                sanitized[key] = [
                    self._sanitize_metadata({"item": item})["item"]
                    if isinstance(item, dict)
                    else str(item)
                    if isinstance(item, (UUID, datetime, date, Path))
                    else item
                    for item in value
                ]
            else:
                sanitized[key] = value

        return sanitized

    def __repr__(self):
        """Return a string representation of the record."""
        # We exclude `terms`, `text`, and `tokens` here because these are
        # computed / cached fields that can rely on the record being parsed.
        # For unparsed records, evaluating these computed properties will
        # raise a LexosException. `__repr__` should be lightweight and safe
        # to call in debugging contexts, so we exclude these computed fields
        # intentionally.
        fields = self.model_dump(exclude=["terms", "text", "tokens"])
        fields["is_parsed"] = str(self.is_parsed)
        if self.content and self.is_parsed:
            fields["content"] = f"{self.content.text[:25]}..."
        elif self.content and not self.is_parsed:
            fields["content"] = f"{self.content[:25]}..."
        else:
            fields["content"] = "None"
        field_list = [f"{k}={v}" if v else f"{k}=None" for k, v in fields.items()]
        return f"Record({', '.join(field_list)})"

    def __str__(self) -> str:
        """Return a user-friendly string representation of the record for printing."""
        active = "True" if self.is_active else "False"
        parsed = "True" if self.is_parsed else "False"

        # Get a preview of content
        if self.content is None:
            content_preview = "None"
        elif self.is_parsed:
            content_preview = f"'{self.content.text[:40]}...'"
        else:
            content_preview = f"'{self.content[:40]}...'"

        return f"Record(id={self.id}, name={self.name!r}, active={active}, parsed={parsed}, content={content_preview})"

    @computed_field
    @cached_property
    def is_parsed(self) -> bool:
        """Return whether the record is parsed.

        Returns:
            bool: True if the record content is a spaCy Doc, False otherwise.
        """
        return isinstance(self.content, Doc)

    @computed_field
    @cached_property
    def preview(self) -> Optional[str]:
        """Return a preview of the record text.

        Returns:
            Optional[str]: A shortened preview of the record content, or
                None if content is None.
        """
        if self.content is None:
            return None

        if self.is_parsed:
            return f"{self.content.text[0:50]}..."
        return f"{self.content[0:500]}..."

    @computed_field
    @cached_property
    def terms(self) -> Counter:
        """Return the terms in the record.

        Returns:
            Counter: Collection mapping term -> count for the record.

        Raises:
            LexosException: If the record is not parsed.
        """
        if self.is_parsed:
            return Counter([t.text for t in self.content])
        else:
            raise LexosException("Record is not parsed.")

    @property
    def text(self) -> Optional[str]:
        """Return the text of the record.

        Returns:
            Optional[str]: The record text as string or None if no content is present.
        """
        if self.is_parsed:
            return self.content.text
        return self.content

    @cached_property
    def tokens(self) -> list[str]:
        """Return the tokens in the record.

        Returns:
            list[str]: A list of token strings extracted from the parsed content.

        Raises:
            LexosException: If the record is not parsed.
        """
        if self.is_parsed:
            return [t.text for t in self.content]
        else:
            raise LexosException("Record is not parsed.")

    def _doc_from_bytes(
        self,
        content: bytes,
        model: Optional[str] = None,
        model_cache: Optional[LexosModelCache] = None,
    ) -> Doc:
        """Convert bytes to a Doc object.

        Args:
            content (bytes): The bytes to convert.
            model (Optional[str]): The spaCy model to use for loading the Doc.
            model_cache (Optional[LexosModelCache]): An optional cache for spaCy models.

        Returns:
            Doc: The content as a Doc object.
        """
        # Create a Doc from the bytes
        vocab = self._get_vocab(model, model_cache)
        doc = Doc(vocab).from_bytes(content)

        # Restore extension values. Use `.get` so docs serialized without an
        # "extensions" entry (e.g. by older code paths) do not raise KeyError.
        stored_extensions = doc.user_data.get("extensions", {})
        for ext, values in stored_extensions.items():
            Token.set_extension(ext, default=None, force=True)
            for i in range(len(doc)):
                doc[i]._.set(ext, values[i])

        # Clean up user_data: keep only the extension names, not the values.
        doc.user_data["extensions"] = list(stored_extensions.keys())

        return doc

    # WARNING: This method is deprecated in favour of field serializer.
    def _doc_to_bytes(self) -> bytes:
        """Convert the content to bytes if it is a Doc object.

        Returns:
            bytes: The content as bytes.

        Raises:
            LexosException: If the content is not a Doc object.
        """
        if not isinstance(self.content, Doc):
            raise LexosException("Content is not a Doc object.")

        doc = self.content

        doc.user_data["extensions"] = {}
        for ext in self.extensions:
            doc.user_data["extensions"][ext] = [token._.get(ext) for token in doc]

        return doc.to_bytes()

    def _get_vocab(
        self, model: Optional[str] = None, model_cache: Optional[LexosModelCache] = None
    ) -> Vocab:
        """Get the vocabulary from the model or model cache.

        Args:
            model (Optional[str]): The spaCy model to use for loading the Doc.
            model_cache (Optional[LexosModelCache]): An optional cache for spaCy models.

        Returns:
            Vocab: The vocabulary of the model.

        Raises:
            LexosException: If no usable model is specified.
        """
        if model_cache and not model:
            raise LexosException("Model cache provided but no model specified.")

        if model_cache:
            return model_cache.get_model(model).vocab
        elif model:
            return spacy.load(model).vocab
        elif self.model:
            return spacy.load(self.model).vocab
        else:
            raise LexosException(
                "No model specified for loading the Doc. Please provide a model name or a model cache."
            )

    @validate_call(config=model_config)
    def from_bytes(
        self,
        bytestring: bytes,
        model: Optional[str] = None,
        model_cache: Optional[LexosModelCache] = None,
        verify_hash: bool = True,
    ) -> None:
        """Deserialise the record from bytes.

        Args:
            bytestring (bytes): The bytes to load the record from.
            model (Optional[str]): The spaCy model to use for loading the Doc.
            model_cache (Optional[LexosModelCache]): An optional cache for spaCy models.
            verify_hash (bool): Whether to verify data integrity hash. Defaults to True.

        Raises:
            LexosException: On corrupted data, hash mismatch, or model-loading failure.
        """
        try:
            data = msgpack.unpackb(bytestring)
        except Exception as e:
            raise LexosException(
                f"Failed to deserialize record: Invalid or corrupted data format. "
                f"Suggestion: Check if the file was completely written and not corrupted."
            ) from e

        # Verify data integrity if hash is present
        if verify_hash and "data_integrity_hash" in data:
            stored_hash = data["data_integrity_hash"]
            # Recreate hash from core data (excluding the hash itself)
            core_data = {k: v for k, v in data.items() if k != "data_integrity_hash"}
            core_bytes = msgpack.dumps(core_data)
            computed_hash = hashlib.sha256(core_bytes).hexdigest()

            if stored_hash != computed_hash:
                raise LexosException(
                    f"Data integrity check failed: Hash mismatch detected. "
                    f"Expected: {stored_hash[:16]}..., Got: {computed_hash[:16]}... "
                    f"Suggestion: The data may be corrupted during storage or transmission. "
                    f"Try re-serializing the original document."
                )

        # Update the record with the loaded data. `content` is deferred so it
        # can be converted back to a Doc below if needed.
        for k, v in data.items():
            if k in type(self).model_fields:
                if k != "content":
                    setattr(self, k, v)

        # If content is bytes, convert it back to a Doc object. Use `.get`
        # so payloads missing these keys fail gracefully instead of raising
        # a bare KeyError.
        if data.get("is_parsed") and isinstance(data.get("content"), bytes):
            if not model:
                model = data.get("model")
            try:
                self.content = self._doc_from_bytes(data["content"], model, model_cache)
            except OSError as e:
                raise LexosException(
                    f"Failed to load spaCy model '{model}': {str(e)}. "
                    f"Suggestion: Install the model with 'python -m spacy download {model}' "
                    f"or use a different model available in your environment."
                ) from e
            except Exception as e:
                raise LexosException(
                    f"Failed to deserialize spaCy document with model '{model}': {str(e)}. "
                    f"Suggestion: Check model compatibility - document may have been "
                    f"serialized with a different spaCy or model version."
                ) from e

    @validate_call(config=model_config)
    def from_disk(
        self,
        path: Path | str,
        model: Optional[str] = None,
        model_cache: Optional[LexosModelCache] = None,
    ) -> None:
        """Load the record from disk.

        Args:
            path (Path | str): The path to load the record from.
            model (Optional[str]): The spaCy model to use for loading the Doc.
            model_cache (Optional[LexosModelCache]): An optional cache for spaCy models.

        Raises:
            LexosException: If the path is missing, unreadable, or the data is invalid.
        """
        if not path:
            raise LexosException("No path specified for loading the record.")

        # Load the data from disk
        try:
            with open(path, "rb") as f:
                data = f.read()
        except FileNotFoundError as e:
            raise LexosException(
                f"Record file not found: {path}. "
                f"Suggestion: Check if the file path is correct and the file exists."
            ) from e
        except PermissionError as e:
            raise LexosException(
                f"Permission denied accessing record file: {path}. "
                f"Suggestion: Check file permissions or run with appropriate privileges."
            ) from e
        except IOError as e:
            raise LexosException(
                f"Failed to read record file: {path}. Error: {str(e)}. "
                f"Suggestion: Check disk space, file system health, or network connectivity."
            ) from e

        # Get the record content from the bytestring
        self.from_bytes(data, model=model, model_cache=model_cache)

    def least_common_terms(self, n: Optional[int] = None) -> list[tuple[str, int]]:
        """Return the least common terms.

        Args:
            n (Optional[int]): The number of least common terms to return. If None, return all terms.

        Returns:
            list[tuple[str, int]]: A list of (term, count) pairs sorted by least frequent.

        Raises:
            LexosException: If the record is not parsed.
        """
        if self.is_parsed:
            # Test `n is not None` (not truthiness) so n=0 returns an empty
            # list, mirroring Counter.most_common(0) in most_common_terms.
            ranked = sorted(self.terms.items(), key=lambda x: x[1])
            return ranked[:n] if n is not None else ranked
        else:
            raise LexosException("Record is not parsed.")

    def most_common_terms(self, n: Optional[int] = None) -> list[tuple[str, int]]:
        """Return the most common terms.

        Args:
            n (Optional[int]): The number of most common terms to return. If None, return all terms.

        Returns:
            list[tuple[str, int]]: A list of (term, count) pairs sorted by most frequent.

        Raises:
            LexosException: If the record is not parsed.
        """
        if self.is_parsed:
            return self.terms.most_common(n)
        else:
            raise LexosException("Record is not parsed.")

    def num_terms(self) -> int:
        """Return the number of terms.

        Returns:
            int: The count of unique terms in this record.

        Raises:
            LexosException: If the record is not parsed.
        """
        if self.is_parsed:
            return len(self.terms)
        else:
            raise LexosException("Record is not parsed.")

    def num_tokens(self) -> int:
        """Return the number of tokens.

        Returns:
            int: The count of token elements in this record.

        Raises:
            LexosException: If the record is not parsed.
        """
        if self.is_parsed:
            return len(self.tokens)
        else:
            raise LexosException("Record is not parsed.")

    @validate_call(config=model_config)
    def set(self, **props: Any) -> None:
        """Set a record property.

        Args:
            **props (Any): A dict containing the properties to set on the record.

        Returns:
            None
        """
        for k, v in props.items():
            setattr(self, k, v)

    @validate_call(config=model_config)
    def to_bytes(
        self, extensions: Optional[list[str]] = None, include_hash: bool = True
    ) -> bytes:
        """Serialize the record to bytes.

        Args:
            extensions (Optional[list[str]]): A list of extension names to include
                in the serialization. Defaults to None (use the record's own
                extensions). A `None` default avoids the shared-mutable-default
                pitfall of `= []`.
            include_hash (bool): Whether to include data integrity hash. Defaults to True.

        Returns:
            bytes: The serialized record.
        """
        # Handle extensions
        if extensions:
            self.extensions = list(set(self.extensions + extensions))

        # Convert record to a dictionary
        # model_dump is used to create a serializable dict representation.
        # We exclude the computed fields (`terms`, `text`, `tokens`) because
        # they might trigger evaluation and raise `LexosException` for
        # unparsed `Record` objects. The saved content is handled below,
        # and `id` is stringified to ensure JSON compatibility.
        data = self.model_dump(exclude=["terms", "text", "tokens"])

        # Make UUID serialisable
        data["id"] = str(data["id"])

        # WARNING: This code is deprecated in favour of field serializer.
        # Convert the content to bytes if it is a Doc object
        if self.is_parsed:
            data["content"] = self._doc_to_bytes()

        # Add data integrity hash if requested
        if include_hash:
            # Create hash of the core data (excluding the hash itself)
            core_data = {k: v for k, v in data.items() if k != "data_integrity_hash"}
            core_bytes = msgpack.dumps(core_data)
            data["data_integrity_hash"] = hashlib.sha256(core_bytes).hexdigest()

        return msgpack.dumps(data)

    @validate_call(config=model_config)
    def to_disk(self, path: Path | str, extensions: Optional[list[str]] = None) -> None:
        """Save the record to disk.

        Args:
            path (Path | str): The path to save the record to.
            extensions (Optional[list[str]]): A list of extension names to include in the serialization.

        Raises:
            LexosException: If the path is missing or the file cannot be written.
        """
        if not path:
            raise LexosException("No path specified for saving the record.")

        if not extensions:
            extensions = self.extensions

        # Serialize and save the record
        data = self.to_bytes(extensions)

        try:
            with open(path, "wb") as f:
                f.write(data)
        except PermissionError as e:
            raise LexosException(
                f"Permission denied writing to: {path}. "
                f"Suggestion: Check file/directory permissions or run with appropriate privileges."
            ) from e
        except OSError as e:
            if "No space left on device" in str(e):
                raise LexosException(
                    f"Insufficient disk space to save record: {path}. "
                    f"Suggestion: Free up disk space or choose a different location."
                ) from e
            else:
                raise LexosException(
                    f"Failed to write record to disk: {path}. Error: {str(e)}. "
                    f"Suggestion: Check disk space, file system health, or network connectivity."
                ) from e

    def vocab_density(self) -> float:
        """Return the vocabulary density.

        Returns:
            float: The vocabulary density of the record (unique terms / tokens),
                or 0.0 for a parsed but empty document.

        Raises:
            LexosException: If the record is not parsed.
        """
        if self.is_parsed:
            # Guard against ZeroDivisionError for a parsed empty document.
            total = self.num_tokens()
            return self.num_terms() / total if total else 0.0
        else:
            raise LexosException("Record is not parsed.")

is_parsed: bool cached property ¤

Return whether the record is parsed.

Returns:

Name Type Description
bool bool

True if the record content is a spaCy Doc, False otherwise.

preview: str cached property ¤

Return a preview of the record text.

Returns:

Type Description
str | None

str | None: A shortened preview of the record content, or None if content is None.

terms: Counter cached property ¤

Return the terms in the record.

Returns:

Name Type Description
Counter Counter

Collection mapping term -> count for the record.

text: str property ¤

Return the text of the record.

Returns:

Type Description
str

str | None: The record text as string or None if no content is present.

tokens: list[str] cached property ¤

Return the tokens in the record.

Returns:

Type Description
list[str]

list[str]: A list of token strings extracted from the parsed content.

__repr__() ¤

Return a string representation of the record.

Source code in lexos/corpus/record.py
def __repr__(self):
    """Return a string representation of the record."""
    # We exclude `terms`, `text`, and `tokens` here because these are
    # computed / cached fields that can rely on the record being parsed.
    # For unparsed records, evaluating these computed properties will
    # raise a LexosException. `__repr__` should be lightweight and safe
    # to call in debugging contexts, so we exclude these computed fields
    # intentionally.
    fields = self.model_dump(exclude=["terms", "text", "tokens"])
    fields["is_parsed"] = str(self.is_parsed)
    if self.content and self.is_parsed:
        fields["content"] = f"{self.content.text[:25]}..."
    elif self.content and not self.is_parsed:
        fields["content"] = f"{self.content[:25]}..."
    else:
        fields["content"] = "None"
    field_list = [f"{k}={v}" if v else f"{k}=None" for k, v in fields.items()]
    return f"Record({', '.join(field_list)})"

__str__() -> str ¤

Return a user-friendly string representation of the record for printing.

Source code in lexos/corpus/record.py
def __str__(self) -> str:
    """Return a user-friendly string representation of the record for printing."""
    active = "True" if self.is_active else "False"
    parsed = "True" if self.is_parsed else "False"

    # Get a preview of content
    if self.content is None:
        content_preview = "None"
    elif self.is_parsed:
        content_preview = f"'{self.content.text[:40]}...'"
    else:
        content_preview = f"'{self.content[:40]}...'"

    return f"Record(id={self.id}, name={self.name!r}, active={active}, parsed={parsed}, content={content_preview})"

from_bytes(bytestring: bytes, model: Optional[str] = None, model_cache: Optional[LexosModelCache] = None, verify_hash: bool = True) -> None ¤

Deserialise the record from bytes.

Parameters:

Name Type Description Default
bytestring bytes

The bytes to load the record from.

required
model Optional[str]

The spaCy model to use for loading the Doc.

None
model_cache Optional[LexosModelCache]

An optional cache for spaCy models.

None
verify_hash bool

Whether to verify data integrity hash. Defaults to True.

True
Source code in lexos/corpus/record.py
@validate_call(config=model_config)
def from_bytes(
    self,
    bytestring: bytes,
    model: Optional[str] = None,
    model_cache: Optional[LexosModelCache] = None,
    verify_hash: bool = True,
) -> None:
    """Deserialise the record from bytes.

    Args:
        bytestring (bytes): The bytes to load the record from.
        model (Optional[str]): The spaCy model to use for loading the Doc.
        model_cache (Optional[LexosModelCache]): An optional cache for spaCy models.
        verify_hash (bool): Whether to verify data integrity hash. Defaults to True.
    """
    try:
        data = msgpack.unpackb(bytestring)
    except Exception as e:
        raise LexosException(
            f"Failed to deserialize record: Invalid or corrupted data format. "
            f"Suggestion: Check if the file was completely written and not corrupted."
        ) from e

    # Verify data integrity if hash is present
    if verify_hash and "data_integrity_hash" in data:
        stored_hash = data["data_integrity_hash"]
        # Recreate hash from core data (excluding the hash itself)
        core_data = {k: v for k, v in data.items() if k != "data_integrity_hash"}
        core_bytes = msgpack.dumps(core_data)
        computed_hash = hashlib.sha256(core_bytes).hexdigest()

        if stored_hash != computed_hash:
            raise LexosException(
                f"Data integrity check failed: Hash mismatch detected. "
                f"Expected: {stored_hash[:16]}..., Got: {computed_hash[:16]}... "
                f"Suggestion: The data may be corrupted during storage or transmission. "
                f"Try re-serializing the original document."
            )

    # Update the record with the loaded data
    for k, v in data.items():
        if k in self.model_fields:
            if k != "content":
                setattr(self, k, v)

    # If content is bytes, convert it back to a Doc object
    if data["is_parsed"] and isinstance(data["content"], bytes):
        if not model:
            model = data.get("model")
        try:
            self.content = self._doc_from_bytes(data["content"], model, model_cache)
        except OSError as e:
            raise LexosException(
                f"Failed to load spaCy model '{model}': {str(e)}. "
                f"Suggestion: Install the model with 'python -m spacy download {model}' "
                f"or use a different model available in your environment."
            ) from e
        except Exception as e:
            raise LexosException(
                f"Failed to deserialize spaCy document with model '{model}': {str(e)}. "
                f"Suggestion: Check model compatibility - document may have been "
                f"serialized with a different spaCy or model version."
            ) from e

from_disk(path: Path | str, model: Optional[str] = None, model_cache: Optional[LexosModelCache] = None) -> None ¤

Load the record from disk.

Parameters:

Name Type Description Default
path Path | str

The path to load the record from.

required
model Optional[str]

The spaCy model to use for loading the Doc.

None
model_cache Optional[LexosModelCache]

An optional cache for spaCy models.

None
Source code in lexos/corpus/record.py
@validate_call(config=model_config)
def from_disk(
    self,
    path: Path | str,
    model: Optional[str] = None,
    model_cache: Optional[LexosModelCache] = None,
) -> None:
    """Load the record from disk.

    Args:
        path (Path | str): The path to load the record from.
        model (Optional[str]): The spaCy model to use for loading the Doc.
        model_cache (Optional[LexosModelCache]): An optional cache for spaCy models.
    """
    if not path:
        raise LexosException("No path specified for loading the record.")

    # Load the data from disk
    try:
        with open(path, "rb") as f:
            data = f.read()
    except FileNotFoundError as e:
        raise LexosException(
            f"Record file not found: {path}. "
            f"Suggestion: Check if the file path is correct and the file exists."
        ) from e
    except PermissionError as e:
        raise LexosException(
            f"Permission denied accessing record file: {path}. "
            f"Suggestion: Check file permissions or run with appropriate privileges."
        ) from e
    except IOError as e:
        raise LexosException(
            f"Failed to read record file: {path}. Error: {str(e)}. "
            f"Suggestion: Check disk space, file system health, or network connectivity."
        ) from e

    # Get the record content from the bytestring
    self.from_bytes(data, model=model, model_cache=model_cache)

least_common_terms(n: Optional[int] = None) -> list[tuple[str, int]] ¤

Return the least common terms.

Parameters:

Name Type Description Default
n Optional[int]

The number of least common terms to return. If None, return all terms.

None

Returns:

Type Description
list[tuple[str, int]]

list[tuple[str, int]]: A list of (term, count) pairs sorted by least frequent.

Source code in lexos/corpus/record.py
def least_common_terms(self, n: Optional[int] = None) -> list[tuple[str, int]]:
    """Return the least common terms.

    Args:
        n (Optional[int]): The number of least common terms to return. If None, return all terms.

    Returns:
        list[tuple[str, int]]: A list of (term, count) pairs sorted by least frequent.
    """
    if self.is_parsed:
        return (
            sorted(self.terms.items(), key=lambda x: x[1])[:n]
            if n
            else sorted(self.terms.items(), key=lambda x: x[1])
        )
    else:
        raise LexosException("Record is not parsed.")

most_common_terms(n: Optional[int] = None) -> list[tuple[str, int]] ¤

Return the most common terms.

Parameters:

Name Type Description Default
n Optional[int]

The number of most common terms to return. If None, return all terms.

None

Returns:

Type Description
list[tuple[str, int]]

list[tuple[str, int]]: A list of (term, count) pairs sorted by most frequent.

Source code in lexos/corpus/record.py
def most_common_terms(self, n: Optional[int] = None) -> list[tuple[str, int]]:
    """Return the most common terms.

    Args:
        n (Optional[int]): The number of most common terms to return. If None, return all terms.

    Returns:
        list[tuple[str, int]]: A list of (term, count) pairs sorted by most frequent.
    """
    if self.is_parsed:
        return self.terms.most_common(n)
    else:
        raise LexosException("Record is not parsed.")

num_terms() -> int ¤

Return the number of terms.

Returns:

Name Type Description
int int

The count of unique terms in this record.

Source code in lexos/corpus/record.py
def num_terms(self) -> int:
    """Return the number of terms.

    Returns:
        int: The count of unique terms in this record.

    Raises:
        LexosException: If the record is not parsed.
    """
    if not self.is_parsed:
        raise LexosException("Record is not parsed.")
    # `terms` maps term -> count, so its length is the unique-term count.
    return len(self.terms)

num_tokens() -> int ¤

Return the number of tokens.

Returns:

Name Type Description
int int

The count of token elements in this record.

Source code in lexos/corpus/record.py
def num_tokens(self) -> int:
    """Return the number of tokens.

    Returns:
        int: The count of token elements in this record.

    Raises:
        LexosException: If the record is not parsed.
    """
    if not self.is_parsed:
        raise LexosException("Record is not parsed.")
    return len(self.tokens)

serialize_content(content: Doc | str) -> bytes | str ¤

Serialize the content to bytes if it is a Doc object.

Parameters:

Name Type Description Default
content Doc | str

The content to serialize.

required

Returns:

Type Description
bytes | str

bytes | str: The serialized content as bytes if it is a Doc, otherwise the original string.

Source code in lexos/corpus/record.py
@field_serializer("content")
def serialize_content(self, content: Doc | str) -> bytes | str:
    """Serialize the content to bytes if it is a Doc object.

    Args:
        content (Doc | str): The content to serialize.

    Returns:
        bytes | str: The serialized content as bytes if it is a Doc,
            otherwise the original string.
    """
    # Plain strings are already serializable; pass them through untouched.
    if not isinstance(content, Doc):
        return content
    # Stash per-token custom-extension values in user_data so they survive
    # spaCy serialization, which does not preserve Token._ attributes.
    content.user_data["extensions"] = {
        ext: [token._.get(ext) for token in content] for ext in self.extensions
    }
    return content.to_bytes()

serialize_id(id, _info) -> str ¤

Always serialize ID as string for JSON compatibility.

Parameters:

Name Type Description Default
id UUID | int | str

The ID value being serialized.

required
_info Any

Encoder info (pydantic serializer internals).

required

Returns:

Name Type Description
str str

The serialized ID as a string.

Source code in lexos/corpus/record.py
@field_serializer("id")
def serialize_id(self, id, _info) -> str:
    """Always serialize ID as string for JSON compatibility.

    Args:
        id (UUID|int|str): The ID value being serialized.
        _info (Any): Encoder info (pydantic serializer internals).

    Returns:
        str: The serialized ID as a string.
    """
    # str() handles UUID, int, and str inputs uniformly.
    return str(id)

serialize_meta(meta: dict[str, Any]) -> dict[str, Any] ¤

Ensure metadata is JSON-serializable by converting special types to strings.

Source code in lexos/corpus/record.py
@field_serializer("meta")
def serialize_meta(self, meta: dict[str, Any]) -> dict[str, Any]:
    """Ensure metadata is JSON-serializable by converting special types to strings.

    Args:
        meta (dict[str, Any]): The metadata dictionary being serialized.

    Returns:
        dict[str, Any]: A sanitized copy with UUID/datetime/date/Path
            values converted to strings.
    """
    # Delegate to the shared helper so pydantic dumps and manual calls agree.
    return self._sanitize_metadata(meta)

set(**props: Any) -> None ¤

Set a record property.

Parameters:

Name Type Description Default
**props Any

A dict containing the properties to set on the record.

{}

Returns:

Type Description
None

None

Source code in lexos/corpus/record.py
@validate_call(config=model_config)
def set(self, **props: Any) -> None:
    """Set a record property.

    Args:
        **props (Any): A dict containing the properties to set on the record.

    Returns:
        None
    """
    # Assign each supplied property onto the record; per the model config,
    # validate_assignment re-validates every assignment.
    for name in props:
        setattr(self, name, props[name])

to_bytes(extensions: Optional[list[str]] = [], include_hash: bool = True) -> bytes ¤

Serialize the record to a byte string.

Parameters:

Name Type Description Default
extensions list[str]

A list of extension names to include in the serialization.

[]
include_hash bool

Whether to include data integrity hash. Defaults to True.

True

Returns:

Name Type Description
bytes bytes

The serialized record.

Source code in lexos/corpus/record.py
@validate_call(config=model_config)
def to_bytes(
    self, extensions: Optional[list[str]] = None, include_hash: bool = True
) -> bytes:
    """Serialize the record to bytes.

    Args:
        extensions (Optional[list[str]]): A list of extension names to include
            in the serialization. Defaults to None (no extra extensions).
        include_hash (bool): Whether to include data integrity hash. Defaults to True.

    Returns:
        bytes: The serialized record (msgpack-encoded).
    """
    # Merge requested extensions into the record's list, de-duplicated.
    # NOTE: the default is None rather than a mutable `[]` so the default
    # object cannot be shared (and mutated) across calls.
    if extensions:
        self.extensions = list(set(self.extensions + extensions))

    # Convert record to a dictionary
    # model_dump is used to create a serializable dict representation.
    # We exclude the computed fields (`terms`, `text`, `tokens`) because
    # they might trigger evaluation and raise `LexosException` for
    # unparsed `Record` objects. The saved content is handled below,
    # and `id` is stringified to ensure JSON compatibility.
    data = self.model_dump(exclude=["terms", "text", "tokens"])

    # Make UUID serialisable
    data["id"] = str(data["id"])

    # WARNING: This code is deprecated in favour of field serializer.
    # Convert the content to bytes if it is a Doc object
    if self.is_parsed:
        data["content"] = self._doc_to_bytes()

    # Add data integrity hash if requested
    if include_hash:
        # Create hash of the core data (excluding the hash itself)
        core_data = {k: v for k, v in data.items() if k != "data_integrity_hash"}
        core_bytes = msgpack.dumps(core_data)
        data["data_integrity_hash"] = hashlib.sha256(core_bytes).hexdigest()

    return msgpack.dumps(data)

to_disk(path: Path | str, extensions: Optional[list[str]] = None) -> None ¤

Save the record to disk.

Parameters:

Name Type Description Default
path Path | str

The path to save the record to.

required
extensions list[str]

A list of extension names to include in the serialization.

None
Source code in lexos/corpus/record.py
@validate_call(config=model_config)
def to_disk(self, path: Path | str, extensions: Optional[list[str]] = None) -> None:
    """Save the record to disk.

    Args:
        path (Path | str): The path to save the record to.
        extensions (list[str]): A list of extension names to include in the serialization.

    Raises:
        LexosException: If no path is given or the write fails.
    """
    if not path:
        raise LexosException("No path specified for saving the record.")

    # Fall back to the record's own extensions when none are supplied.
    payload = self.to_bytes(extensions or self.extensions)

    try:
        with open(path, "wb") as f:
            f.write(payload)
    except PermissionError as e:
        raise LexosException(
            f"Permission denied writing to: {path}. "
            f"Suggestion: Check file/directory permissions or run with appropriate privileges."
        ) from e
    except OSError as e:
        if "No space left on device" not in str(e):
            raise LexosException(
                f"Failed to write record to disk: {path}. Error: {str(e)}. "
                f"Suggestion: Check disk space, file system health, or network connectivity."
            ) from e
        raise LexosException(
            f"Insufficient disk space to save record: {path}. "
            f"Suggestion: Free up disk space or choose a different location."
        ) from e

vocab_density() -> float ¤

Return the vocabulary density.

Returns:

Name Type Description
float float

The vocabulary density of the record.

Source code in lexos/corpus/record.py
def vocab_density(self) -> float:
    """Return the vocabulary density.

    Returns:
        float: The vocabulary density (unique terms / total tokens) of the
            record, or 0.0 for a parsed record with no tokens.

    Raises:
        LexosException: If the record is not parsed.
    """
    if not self.is_parsed:
        raise LexosException("Record is not parsed.")
    total = self.num_tokens()
    # Guard against empty documents to avoid ZeroDivisionError.
    if total == 0:
        return 0.0
    return self.num_terms() / total
rendering:
  show_root_heading: true
  heading_level: 3

serialize_content(content: Doc | str) -> bytes | str ¤

Serialize the content to bytes if it is a Doc object.

Parameters:

Name Type Description Default
content Doc | str

The content to serialize.

required

Returns:

Type Description
bytes | str

bytes | str: The serialized content as bytes if it is a Doc, otherwise the original string.

Source code in lexos/corpus/record.py
@field_serializer("content")
def serialize_content(self, content: Doc | str) -> bytes | str:
    """Serialize the content to bytes if it is a Doc object.

    Args:
        content (Doc | str): The content to serialize.

    Returns:
        bytes | str: The serialized content as bytes if it is a Doc, otherwise the original string.
    """
    if isinstance(content, Doc):
        content.user_data["extensions"] = {}
        for ext in self.extensions:
            content.user_data["extensions"][ext] = [
                token._.get(ext) for token in content
            ]
        return content.to_bytes()
    return content
rendering:
  show_root_heading: true
  heading_level: 3

serialize_id(id, _info) -> str ¤

Always serialize ID as string for JSON compatibility.

Parameters:

Name Type Description Default
id UUID | int | str

The ID value being serialized.

required
_info Any

Encoder info (pydantic serializer internals).

required

Returns:

Name Type Description
str str

The serialized ID as a string.

Source code in lexos/corpus/record.py
@field_serializer("id")
def serialize_id(self, id, _info) -> str:
    """Always serialize ID as string for JSON compatibility.

    Args:
        id (UUID|int|str): The ID value being serialized.
        _info (Any): Encoder info (pydantic serializer internals).

    Returns:
        str: The serialized ID as a string.
    """
    return str(id)
rendering:
  show_root_heading: true
  heading_level: 3

serialize_meta(meta: dict[str, Any]) -> dict[str, Any] ¤

Ensure metadata is JSON-serializable by converting special types to strings.

Source code in lexos/corpus/record.py
@field_serializer("meta")
def serialize_meta(self, meta: dict[str, Any]) -> dict[str, Any]:
    """Ensure metadata is JSON-serializable by converting special types to strings."""
    return self._sanitize_metadata(meta)
rendering:
  show_root_heading: true
  heading_level: 3

_sanitize_metadata(metadata: dict[str, Any]) -> dict[str, Any] ¤

Convert non-JSON-serializable types to strings.

Parameters:

Name Type Description Default
metadata dict[str, Any]

Original metadata dictionary

required

Returns:

Type Description
dict[str, Any]

Sanitized metadata dictionary with JSON-serializable values

Source code in lexos/corpus/record.py
def _sanitize_metadata(self, metadata: dict[str, Any]) -> dict[str, Any]:
    """Convert non-JSON-serializable types to strings.

    Args:
        metadata: Original metadata dictionary

    Returns:
        Sanitized metadata dictionary with JSON-serializable values
    """

    def _clean(value: Any) -> Any:
        # One recursive converter applied uniformly at every nesting depth.
        # This fixes two inconsistencies in the previous version: datetimes
        # inside lists were str()-ed instead of isoformat()-ed, and lists
        # nested inside lists were not sanitized at all.
        if isinstance(value, (UUID, Path)):
            return str(value)
        if isinstance(value, (datetime, date)):
            return value.isoformat()
        if isinstance(value, dict):
            return {k: _clean(v) for k, v in value.items()}
        if isinstance(value, list):
            return [_clean(item) for item in value]
        return value

    return {key: _clean(value) for key, value in metadata.items()}
rendering:
  show_root_heading: true
  heading_level: 3

__repr__() ¤

Return a string representation of the record.

Source code in lexos/corpus/record.py
def __repr__(self):
    """Return a string representation of the record.

    Returns:
        str: A `Record(k=v, ...)` summary built from the model fields.
    """
    # We exclude `terms`, `text`, and `tokens` here because these are
    # computed / cached fields that can rely on the record being parsed.
    # For unparsed records, evaluating these computed properties will
    # raise a LexosException. `__repr__` should be lightweight and safe
    # to call in debugging contexts, so we exclude these computed fields
    # intentionally.
    fields = self.model_dump(exclude=["terms", "text", "tokens"])
    fields["is_parsed"] = str(self.is_parsed)
    if self.content and self.is_parsed:
        # Parsed content is a spaCy Doc: preview its text attribute.
        fields["content"] = f"{self.content.text[:25]}..."
    elif self.content and not self.is_parsed:
        # Unparsed content is a plain string: preview it directly.
        fields["content"] = f"{self.content[:25]}..."
    else:
        fields["content"] = "None"
    # NOTE(review): `if v` renders ANY falsy value (0, False, "") as
    # "None", not just actual None — confirm this display is intended.
    field_list = [f"{k}={v}" if v else f"{k}=None" for k, v in fields.items()]
    return f"Record({', '.join(field_list)})"
rendering:
  show_root_heading: true
  heading_level: 3

__str__() -> str ¤

Return a user-friendly string representation of the record for printing.

Source code in lexos/corpus/record.py
def __str__(self) -> str:
    """Return a user-friendly string representation of the record for printing."""
    # Render the flags as "True"/"False"; None counts as False.
    active = str(bool(self.is_active))
    parsed = str(bool(self.is_parsed))

    # Build a short preview of the content (Doc text when parsed,
    # raw string otherwise).
    if self.content is None:
        preview = "None"
    elif self.is_parsed:
        preview = f"'{self.content.text[:40]}...'"
    else:
        preview = f"'{self.content[:40]}...'"

    return f"Record(id={self.id}, name={self.name!r}, active={active}, parsed={parsed}, content={preview})"
rendering:
  show_root_heading: true
  heading_level: 3

is_parsed: bool cached property ¤

Return whether the record is parsed.

Returns:

Name Type Description
bool bool

True if the record content is a spaCy Doc, False otherwise.

rendering:
  show_root_heading: true
  heading_level: 3

preview: str cached property ¤

Return a preview of the record text.

Returns:

Type Description
str

str | None: A shortened preview of the record content, or None if content is None.

rendering:
  show_root_heading: true
  heading_level: 3

terms: Counter cached property ¤

Return the terms in the record.

Returns:

Name Type Description
Counter Counter

Collection mapping term -> count for the record.

rendering:
  show_root_heading: true
  heading_level: 3

text: str property ¤

Return the text of the record.

Returns:

Type Description
str

str | None: The record text as string or None if no content is present.

rendering:
  show_root_heading: true
  heading_level: 3

tokens: list[str] cached property ¤

Return the tokens in the record.

Returns:

Type Description
list[str]

list[str]: A list of token strings extracted from the parsed content.

rendering:
  show_root_heading: true
  heading_level: 3

_doc_from_bytes(content: bytes, model: Optional[str] = None, model_cache: Optional[LexosModelCache] = None) -> Doc ¤

Convert bytes to a Doc object.

Parameters:

Name Type Description Default
content bytes

The bytes to convert.

required
model Optional[str]

The spaCy model to use for loading the Doc.

None
model_cache Optional[LexosModelCache]

An optional cache for spaCy models.

None

Returns:

Name Type Description
Doc Doc

The content as a Doc object.

Source code in lexos/corpus/record.py
def _doc_from_bytes(
    self,
    content: bytes,
    model: Optional[str] = None,
    model_cache: Optional[LexosModelCache] = None,
) -> Doc:
    """Convert bytes to a Doc object.

    Args:
        content (bytes): The bytes to convert.
        model (Optional[str]): The spaCy model to use for loading the Doc.
        model_cache (Optional[LexosModelCache]): An optional cache for spaCy models.

    Returns:
        Doc: The content as a Doc object.

    Raises:
        LexosException: Propagated from `_get_vocab` when no model can be resolved.
    """
    # Create a Doc from the bytes
    vocab = self._get_vocab(model, model_cache)
    doc = Doc(vocab).from_bytes(content)

    # Restore extension values
    # NOTE(review): assumes `user_data["extensions"]` was written at
    # serialization time (see `_doc_to_bytes`); raises KeyError otherwise.
    for ext, values in doc.user_data["extensions"].items():
        # Re-register the extension on Token (force=True overwrites any
        # existing registration), then re-apply the saved per-token values.
        Token.set_extension(ext, default=None, force=True)
        for i in range(len(doc)):
            doc[i]._.set(ext, values[i])

    # Clean up user_data
    # Keep only the extension names now that values live on the tokens again.
    doc.user_data["extensions"] = list(doc.user_data["extensions"].keys())

    return doc
rendering:
  show_root_heading: true
  heading_level: 3

_doc_to_bytes() -> bytes ¤

Convert the content to bytes if it is a Doc object.

Returns:

Name Type Description
bytes bytes

The content as bytes.

Source code in lexos/corpus/record.py
def _doc_to_bytes(self) -> bytes:
    """Convert the content to bytes if it is a Doc object.

    Returns:
        bytes: The content as bytes.

    Raises:
        LexosException: If the content is not a spaCy Doc.
    """
    if not isinstance(self.content, Doc):
        raise LexosException("Content is not a Doc object.")

    doc = self.content
    # Record per-token custom-extension values in user_data so they
    # survive serialization (Token._ attributes are not serialized).
    doc.user_data["extensions"] = {
        ext: [token._.get(ext) for token in doc] for ext in self.extensions
    }
    return doc.to_bytes()
rendering:
  show_root_heading: true
  heading_level: 3

_get_vocab(model: Optional[str] = None, model_cache: Optional[LexosModelCache] = None) -> Vocab ¤

Get the vocabulary from the model or model cache.

Parameters:

Name Type Description Default
model Optional[str]

The spaCy model to use for loading the Doc.

None
model_cache Optional[LexosModelCache]

An optional cache for spaCy models.

None

Returns:

Name Type Description
Vocab Vocab

The vocabulary of the model.

Source code in lexos/corpus/record.py
def _get_vocab(
    self, model: Optional[str] = None, model_cache: Optional[LexosModelCache] = None
) -> Vocab:
    """Get the vocabulary from the model or model cache.

    Args:
        model (Optional[str]): The spaCy model to use for loading the Doc.
        model_cache (Optional[LexosModelCache]): An optional cache for spaCy models.

    Returns:
        Vocab: The vocabulary of the model.

    Raises:
        LexosException: If a cache is given without a model, or no model
            can be resolved at all.
    """
    # A cache is only usable when we also know which model to ask it for.
    if model_cache and not model:
        raise LexosException("Model cache provided but no model specified.")
    # Resolution order: cache, explicit model, then the record's own model.
    if model_cache:
        return model_cache.get_model(model).vocab
    if model:
        return spacy.load(model).vocab
    if self.model:
        return spacy.load(self.model).vocab
    raise LexosException(
        "No model specified for loading the Doc. Please provide a model name or a model cache."
    )
rendering:
  show_root_heading: true
  heading_level: 3

from_bytes(bytestring: bytes, model: Optional[str] = None, model_cache: Optional[LexosModelCache] = None, verify_hash: bool = True) -> None ¤

Deserialise the record from bytes.

Parameters:

Name Type Description Default
bytestring bytes

The bytes to load the record from.

required
model Optional[str]

The spaCy model to use for loading the Doc.

None
model_cache Optional[LexosModelCache]

An optional cache for spaCy models.

None
verify_hash bool

Whether to verify data integrity hash. Defaults to True.

True
Source code in lexos/corpus/record.py
@validate_call(config=model_config)
def from_bytes(
    self,
    bytestring: bytes,
    model: Optional[str] = None,
    model_cache: Optional[LexosModelCache] = None,
    verify_hash: bool = True,
) -> None:
    """Deserialise the record from bytes.

    Args:
        bytestring (bytes): The bytes to load the record from.
        model (Optional[str]): The spaCy model to use for loading the Doc.
        model_cache (Optional[LexosModelCache]): An optional cache for spaCy models.
        verify_hash (bool): Whether to verify data integrity hash. Defaults to True.

    Raises:
        LexosException: On malformed msgpack data, hash mismatch, or a
            failure to load the spaCy model / document.
    """
    try:
        data = msgpack.unpackb(bytestring)
    except Exception as e:
        raise LexosException(
            f"Failed to deserialize record: Invalid or corrupted data format. "
            f"Suggestion: Check if the file was completely written and not corrupted."
        ) from e

    # Verify data integrity if hash is present
    if verify_hash and "data_integrity_hash" in data:
        stored_hash = data["data_integrity_hash"]
        # Recreate hash from core data (excluding the hash itself),
        # mirroring how `to_bytes` computed it.
        core_data = {k: v for k, v in data.items() if k != "data_integrity_hash"}
        core_bytes = msgpack.dumps(core_data)
        computed_hash = hashlib.sha256(core_bytes).hexdigest()

        if stored_hash != computed_hash:
            raise LexosException(
                f"Data integrity check failed: Hash mismatch detected. "
                f"Expected: {stored_hash[:16]}..., Got: {computed_hash[:16]}... "
                f"Suggestion: The data may be corrupted during storage or transmission. "
                f"Try re-serializing the original document."
            )

    # Update the record with the loaded data.
    # Only known model fields are restored; `content` is deferred so the
    # bytes → Doc conversion below can run after the other fields are set.
    for k, v in data.items():
        if k in self.model_fields:
            if k != "content":
                setattr(self, k, v)

    # If content is bytes, convert it back to a Doc object
    # NOTE(review): assumes the payload contains "is_parsed" and "content"
    # keys (as written by `to_bytes`); a KeyError here means the payload
    # came from another producer — confirm.
    if data["is_parsed"] and isinstance(data["content"], bytes):
        if not model:
            # Fall back to the model name stored in the payload, if any.
            model = data.get("model")
        try:
            self.content = self._doc_from_bytes(data["content"], model, model_cache)
        except OSError as e:
            raise LexosException(
                f"Failed to load spaCy model '{model}': {str(e)}. "
                f"Suggestion: Install the model with 'python -m spacy download {model}' "
                f"or use a different model available in your environment."
            ) from e
        except Exception as e:
            raise LexosException(
                f"Failed to deserialize spaCy document with model '{model}': {str(e)}. "
                f"Suggestion: Check model compatibility - document may have been "
                f"serialized with a different spaCy or model version."
            ) from e
rendering:
  show_root_heading: true
  heading_level: 3

from_disk(path: Path | str, model: Optional[str] = None, model_cache: Optional[LexosModelCache] = None) -> None ¤

Load the record from disk.

Parameters:

Name Type Description Default
path Path | str

The path to load the record from.

required
model Optional[str]

The spaCy model to use for loading the Doc.

None
model_cache Optional[LexosModelCache]

An optional cache for spaCy models.

None
Source code in lexos/corpus/record.py
@validate_call(config=model_config)
def from_disk(
    self,
    path: Path | str,
    model: Optional[str] = None,
    model_cache: Optional[LexosModelCache] = None,
) -> None:
    """Load the record from disk.

    Args:
        path (Path | str): The path to load the record from.
        model (Optional[str]): The spaCy model to use for loading the Doc.
        model_cache (Optional[LexosModelCache]): An optional cache for spaCy models.

    Raises:
        LexosException: If no path is given, the file cannot be read, or
            deserialization fails (via `from_bytes`).
    """
    if not path:
        raise LexosException("No path specified for loading the record.")

    # Load the data from disk
    # The specific OSError subclasses are caught before the generic IOError
    # so each failure mode gets a targeted message.
    try:
        with open(path, "rb") as f:
            data = f.read()
    except FileNotFoundError as e:
        raise LexosException(
            f"Record file not found: {path}. "
            f"Suggestion: Check if the file path is correct and the file exists."
        ) from e
    except PermissionError as e:
        raise LexosException(
            f"Permission denied accessing record file: {path}. "
            f"Suggestion: Check file permissions or run with appropriate privileges."
        ) from e
    except IOError as e:
        raise LexosException(
            f"Failed to read record file: {path}. Error: {str(e)}. "
            f"Suggestion: Check disk space, file system health, or network connectivity."
        ) from e

    # Get the record content from the bytestring
    self.from_bytes(data, model=model, model_cache=model_cache)
rendering:
  show_root_heading: true
  heading_level: 3

least_common_terms(n: Optional[int] = None) -> list[tuple[str, int]] ¤

Return the least common terms.

Parameters:

Name Type Description Default
n Optional[int]

The number of least common terms to return. If None, return all terms.

None

Returns:

Type Description
list[tuple[str, int]]

list[tuple[str, int]]: A list of (term, count) pairs sorted by least frequent.

Source code in lexos/corpus/record.py
def least_common_terms(self, n: Optional[int] = None) -> list[tuple[str, int]]:
    """Return the least common terms.

    Args:
        n (Optional[int]): The number of least common terms to return. If None, return all terms.

    Returns:
        list[tuple[str, int]]: A list of (term, count) pairs sorted by least frequent.
    """
    if self.is_parsed:
        return (
            sorted(self.terms.items(), key=lambda x: x[1])[:n]
            if n
            else sorted(self.terms.items(), key=lambda x: x[1])
        )
    else:
        raise LexosException("Record is not parsed.")
rendering:
  show_root_heading: true
  heading_level: 3

most_common_terms(n: Optional[int] = None) -> list[tuple[str, int]] ¤

Return the most common terms.

Parameters:

Name Type Description Default
n Optional[int]

The number of most common terms to return. If None, return all terms.

None

Returns:

Type Description
list[tuple[str, int]]

list[tuple[str, int]]: A list of (term, count) pairs sorted by most frequent.

Source code in lexos/corpus/record.py
def most_common_terms(self, n: Optional[int] = None) -> list[tuple[str, int]]:
    """Return the most common terms.

    Args:
        n (Optional[int]): The number of most common terms to return. If None, return all terms.

    Returns:
        list[tuple[str, int]]: A list of (term, count) pairs sorted by most frequent.
    """
    if self.is_parsed:
        return self.terms.most_common(n)
    else:
        raise LexosException("Record is not parsed.")
rendering:
  show_root_heading: true
  heading_level: 3

num_terms() -> int ¤

Return the number of terms.

Returns:

Name Type Description
int int

The count of unique terms in this record.

Source code in lexos/corpus/record.py
def num_terms(self) -> int:
    """Return the number of terms.

    Returns:
        int: The count of unique terms in this record.
    """
    if self.is_parsed:
        return len(self.terms)
    else:
        raise LexosException("Record is not parsed.")
rendering:
  show_root_heading: true
  heading_level: 3

num_tokens() -> int ¤

Return the number of tokens.

Returns:

Name Type Description
int int

The count of token elements in this record.

Source code in lexos/corpus/record.py
def num_tokens(self) -> int:
    """Return the number of tokens.

    Returns:
        int: The count of token elements in this record.
    """
    if self.is_parsed:
        return len(self.tokens)
    else:
        raise LexosException("Record is not parsed.")
rendering:
  show_root_heading: true
  heading_level: 3

set(**props: Any) -> None ¤

Set a record property.

Parameters:

Name Type Description Default
**props Any

A dict containing the properties to set on the record.

{}

Returns:

Type Description
None

None

Source code in lexos/corpus/record.py
@validate_call(config=model_config)
def set(self, **props: Any) -> None:
    """Set a record property.

    Args:
        **props (Any): A dict containing the properties to set on the record.

    Returns:
        None
    """
    for k, v in props.items():
        setattr(self, k, v)
rendering:
  show_root_heading: true
  heading_level: 3

to_bytes(extensions: Optional[list[str]] = [], include_hash: bool = True) -> bytes ¤

Serialize the record to a byte string.

Parameters:

Name Type Description Default
extensions list[str]

A list of extension names to include in the serialization.

[]
include_hash bool

Whether to include data integrity hash. Defaults to True.

True

Returns:

Name Type Description
bytes bytes

The serialized record.

Source code in lexos/corpus/record.py
@validate_call(config=model_config)
def to_bytes(
    self, extensions: Optional[list[str]] = [], include_hash: bool = True
) -> bytes:
    """Serialize the record to a dictionary.

    Args:
        extensions (list[str]): A list of extension names to include in the serialization.
        include_hash (bool): Whether to include data integrity hash. Defaults to True.

    Returns:
        bytes: The serialized record.
    """
    # Handle extensions
    if extensions:
        self.extensions = list(set(self.extensions + extensions))

    # Convert record to a dictionary
    # model_dump is used to create a serializable dict representation.
    # We exclude the computed fields (`terms`, `text`, `tokens`) because
    # they might trigger evaluation and raise `LexosException` for
    # unparsed `Record` objects. The saved content is handled below,
    # and `id` is stringified to ensure JSON compatibility.
    data = self.model_dump(exclude=["terms", "text", "tokens"])

    # Make UUID serialisable
    data["id"] = str(data["id"])

    # WARNING: This code is deprecated in favour of field serializer.
    # Convert the content to bytes if it is a Doc object
    if self.is_parsed:
        data["content"] = self._doc_to_bytes()

    # Add data integrity hash if requested
    if include_hash:
        # Create hash of the core data (excluding the hash itself)
        core_data = {k: v for k, v in data.items() if k != "data_integrity_hash"}
        core_bytes = msgpack.dumps(core_data)
        data["data_integrity_hash"] = hashlib.sha256(core_bytes).hexdigest()

    return msgpack.dumps(data)
rendering:
  show_root_heading: true
  heading_level: 3

to_disk(path: Path | str, extensions: Optional[list[str]] = None) -> None ¤

Save the record to disk.

Parameters:

Name Type Description Default
path Path | str

The path to save the record to.

required
extensions list[str]

A list of extension names to include in the serialization.

None
Source code in lexos/corpus/record.py
@validate_call(config=model_config)
def to_disk(self, path: Path | str, extensions: Optional[list[str]] = None) -> None:
    """Save the record to disk.

    Args:
        path (Path | str): The path to save the record to.
        extensions (list[str]): A list of extension names to include in the serialization.
    """
    if not path:
        raise LexosException("No path specified for saving the record.")

    if not extensions:
        extensions = self.extensions

    # Serialize and save the record
    data = self.to_bytes(extensions)

    try:
        with open(path, "wb") as f:
            f.write(data)
    except PermissionError as e:
        raise LexosException(
            f"Permission denied writing to: {path}. "
            f"Suggestion: Check file/directory permissions or run with appropriate privileges."
        ) from e
    except OSError as e:
        if "No space left on device" in str(e):
            raise LexosException(
                f"Insufficient disk space to save record: {path}. "
                f"Suggestion: Free up disk space or choose a different location."
            ) from e
        else:
            raise LexosException(
                f"Failed to write record to disk: {path}. Error: {str(e)}. "
                f"Suggestion: Check disk space, file system health, or network connectivity."
            ) from e
rendering:
  show_root_heading: true
  heading_level: 3

vocab_density() -> float ¤

Return the vocabulary density.

Returns:

Name Type Description
float float

The vocabulary density of the record.

Source code in lexos/corpus/record.py
def vocab_density(self) -> float:
    """Return the vocabulary density.

    Returns:
        float: The vocabulary density of the record.
    """
    if self.is_parsed:
        return self.num_terms() / self.num_tokens()
    else:
        raise LexosException("Record is not parsed.")
rendering:
  show_root_heading: true
  heading_level: 3