Data Loader¤

The DataLoader Class¤

The DataLoader class is the main class for loading datasets in various formats. It attempts to detect each format automatically, so you can use a common interface to load content regardless of its source.
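
For orientation, here is a minimal sketch of that common interface. The file name is hypothetical; any of the load_* methods documented below could be substituted:

from lexos.io.data_loader import DataLoader

loader = DataLoader()
loader.load_csv("texts.csv")  # a hypothetical file with `name` and `text` columns
for dataset in loader:
    print(dataset.name, dataset.mime_type)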

Dataset dataclass ¤

Dataset class.

Source code in lexos/io/data_loader.py
@dataclass
class Dataset:
    """Dataset class."""

    name: str
    path: str
    mime_type: str
    text: str
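
As a quick illustration, a Dataset is a plain container; the values here are hypothetical:

item = Dataset(
    name="doc001",
    path="corpus.csv",
    mime_type="text/csv",
    text="Some sample text.",
)
print(item.name, len(item.text))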

DataLoader pydantic-model ¤

Bases: BaseLoader

DataLoader.

Config:

  • arbitrary_types_allowed: True

Fields:

  • errors (list)
  • mime_types (list)
  • names (list)
  • paths (list)
  • texts (list)

Source code in lexos/io/data_loader.py
class DataLoader(BaseLoader):
    """DataLoader."""

    model_config = ConfigDict(arbitrary_types_allowed=True)

    def __init__(self):
        """Initialize the DataLoader."""
        super().__init__()

    def __iter__(self) -> Generator[Dataset, None, None]:
        """Make the class iterable.

        Yields:
            Dataset: A Dataset object containing the name, path, mime_type, and text of each dataset item.

        Note: Overrides the BaseLoader's __iter__ method to yield Dataset objects.
        """
        for i in range(len(self.data["paths"])):
            yield Dataset(
                name=self.data["names"][i],
                path=self.data["paths"][i],
                mime_type=self.data["mime_types"][i],
                text=self.data["texts"][i],
            )

    def _update_data(
        self, path: Path | str, df: pd.DataFrame, mime_type: str = "text/plain"
    ) -> None:
        """Update the DataLoader.

        Args:
            path (Path | str): The path to the file.
            df (pd.DataFrame): The DataFrame to update with.
            mime_type (str): The mime type of the file.
        """
        new_names = df["name"].tolist()
        self.names = self.names + new_names
        # Pad paths and mime types for the newly added rows only.
        length = len(new_names)
        self.paths = self.paths + [str(path)] * length
        self.mime_types = self.mime_types + [mime_type] * length
        self.texts = self.texts + [decode(text) for text in df["text"].tolist()]

    @validate_call(config=model_config)
    def load_csv(
        self,
        path: io.StringIO | os.PathLike | Path | str,
        name_col: Optional[str] = "name",
        text_col: Optional[str] = "text",
        **kwargs,
    ) -> None:
        """Load a csv file.

        Args:
            path (io.StringIO | os.PathLike | Path | str): The path to the file.
            name_col (Optional[str]): The column name for the names.
            text_col (Optional[str]): The column name for the texts.
        """
        try:
            df = pd.read_csv(path, **kwargs)
        except BaseException as e:
            raise LexosException(e)
        if not isinstance(path, (Path, str)):
            path = "csv_string"
        if "sep" in kwargs and kwargs["sep"] == "\t":
            mime_type = "text/tab-separated-values"
        else:
            mime_type = "text/csv"
        if name_col:
            df = df.rename(columns={name_col: "name"})
        if text_col:
            df = df.rename(columns={text_col: "text"})
        if "name" not in df.columns or "text" not in df.columns:
            err = (
                "CSV and TSV files must contain headers named `name` and `text`. ",
                "You can convert the names of existing headers to these with the ",
                "`name_col` and `text_col` parameters.",
            )
            raise LexosException("".join(err))
        self._update_data(path, df, mime_type)

    # @validate_call(config=model_config)
    def load_dataset(self, dataset: Self) -> None:
        """Load a dataset.

        Args:
            dataset (DataLoader): The dataset to load.

        Note: As of v2.10.5, Pydantic does not support recursive types (Self).
            As a result, this method performs its own check to see if the
            value of `dataset` is of type `DataLoader`.
        """
        if not isinstance(dataset, DataLoader):
            raise LexosException("Invalid dataset type.")
        self.paths = self.paths + dataset.paths
        self.mime_types = self.mime_types + dataset.mime_types
        self.names = self.names + dataset.names
        self.texts = self.texts + dataset.texts

    # Skipped for coverage, same method as load_csv
    @validate_call(config=model_config)  # pragma: no cover
    def load_excel(  # pragma: no cover
        self, path: Path | str, name_col: str, text_col: str, **kwargs
    ) -> None:
        """Load an Excel file.

        Args:
            path (Path | str): The path to the file.
            name_col (str): The column name for the names.
            text_col (str): The column name for the texts.
        """
        try:
            df = pd.read_excel(path, **kwargs)
        except BaseException as e:
            raise LexosException(e)
        if not isinstance(path, (Path, str)):
            path = "buffer"
        if name_col:
            df = df.rename(columns={name_col: "name"})
        if text_col:
            df = df.rename(columns={text_col: "text"})
        if "name" not in df.columns or "text" not in df.columns:
            err = (
                "Excel files must contain headers named `name` and `text`. ",
                "You can convert the names of existing headers to these with the ",
                "`name_col` and `text_col` parameters.",
            )
            raise LexosException("".join(err))
        self._update_data(
            path,
            df,
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        )

    @validate_call(config=model_config)
    def load_json(
        self,
        path: io.StringIO | os.PathLike | Path | str,
        name_field: Optional[str] = "name",
        text_field: Optional[str] = "text",
        **kwargs,
    ) -> None:
        """Load a JSON file.

        Args:
            path (io.StringIO | os.PathLike | Path | str): The path to the file.
            name_field (Optional[str]): The field name for the names.
            text_field (Optional[str]): The field name for the texts.
        """
        try:
            df = pd.read_json(path, **kwargs)
        except BaseException as e:
            raise LexosException(e)
        if not isinstance(path, (Path, str)):
            path = "json_string"
        if name_field:
            df = df.rename(columns={name_field: "name"})
        if text_field:
            df = df.rename(columns={text_field: "text"})
        if "name" not in df.columns or "text" not in df.columns:
            err = (
                "JSON files must contain fields named `name` and `text`. ",
                "You can convert the names of existing headers to these with the ",
                "`name_field` and `text_field` parameters.",
            )
            raise LexosException("".join(err))
        self._update_data(path, df, "application/json")

    @validate_call(config=model_config)
    def load_lineated_text(
        self,
        path: io.StringIO | os.PathLike | Path | str,
        names: Optional[list[str]] = None,
        start: Optional[int] = 1,
        zero_pad: Optional[str] = "03",
    ) -> None:
        """Load a list of texts.

        Args:
            path (io.StringIO | os.PathLike | Path | str): The path to the file.
            names (Optional[list[str]]): The list of names for the texts.
            start (Optional[int]): The starting index for the names if no list is provided.
            zero_pad (Optional[str]): The zero-padding format for auto-generated names if no list is provided.
        """
        if isinstance(path, io.StringIO):
            # A StringIO buffer cannot be opened as a file; read it directly.
            texts = path.read().split("\n")
        else:
            try:
                with open(path, "rb") as f:
                    texts = f.readlines()
            except (FileNotFoundError, IOError, OSError):
                # Not a readable file; treat the input as a newline-delimited string.
                texts = path.split("\n")
            except BaseException as e:
                raise LexosException(e)
        if names is None:
            names = [f"text{i + start:{zero_pad}d}" for i in range(len(texts))]
        self.paths = ["text_string"] * len(texts)
        self.names = names
        self.mime_types = ["text/plain"] * len(texts)
        self.texts = [decode(text) for text in texts]

data: dict[str, list] property ¤

Get the data as a dictionary.

Returns:

  • dict[str, list]: A dictionary containing the paths, mime_types, names, texts, and errors.
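
A short sketch of inspecting the dictionary (loader contents hypothetical):

loader = DataLoader()
loader.load_csv("texts.csv")  # hypothetical file
data = loader.data
# Expected keys: paths, mime_types, names, texts, errors
print({key: len(value) for key, value in data.items()})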

df: pd.DataFrame property ¤

Get a pandas DataFrame of file records.

Returns:

  • pandas.DataFrame: A DataFrame containing file metadata and content.

errors: list = [] pydantic-field ¤

The list of loading errors.

mime_types: list = [] pydantic-field ¤

The list of text mime types.

names: list = [] pydantic-field ¤

The list of text names.

paths: list = [] pydantic-field ¤

The list of paths.

records: list[dict[str, str]] property ¤

Get a list of file records.

Returns:

  • list[dict[str, str]]: A list of dictionaries containing file metadata and content. Each dict has the keys path, mime_type, name, and text.

Raises:

  • ValueError: If the lengths of paths, mime_types, names, and texts don't match.

Note

Validates that all lists have the same length before returning the records.
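
For example, iterating the records (assuming a populated loader):

for record in loader.records:
    # Each record is a dict with keys: path, mime_type, name, text
    print(record["name"], record["mime_type"])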

texts: list = [] pydantic-field ¤

The list of loaded texts.

__init__() ¤

Initialize the DataLoader.

Source code in lexos/io/data_loader.py
def __init__(self):
    """Initialize the DataLoader."""
    super().__init__()

__iter__() -> Generator[Dataset, None, None] ¤

Make the class iterable.

Yields:

  • Dataset: A Dataset object containing the name, path, mime_type, and text of each dataset item.

Note: Overrides the BaseLoader's __iter__ method to yield Dataset objects.

Source code in lexos/io/data_loader.py
def __iter__(self) -> Generator[Dataset, None, None]:
    """Make the class iterable.

    Yields:
        Dataset: A Dataset object containing the name, path, mime_type, and text of each dataset item.

    Note: Overrides the BaseLoader's __iter__ method to yield Dataset objects.
    """
    for i in range(len(self.data["paths"])):
        yield Dataset(
            name=self.data["names"][i],
            path=self.data["paths"][i],
            mime_type=self.data["mime_types"][i],
            text=self.data["texts"][i],
        )
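
For example (file name hypothetical):

loader = DataLoader()
loader.load_json("corpus.json")  # hypothetical file with `name` and `text` fields
for dataset in loader:
    print(f"{dataset.name}: {len(dataset.text)} characters")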

dedupe(subset: Optional[list[str]] = None) -> pd.DataFrame ¤

Deduplicate a DataFrame.

Parameters:

  • subset (Optional[list[str]], default: None): The columns to consider for deduplication.

Returns:

  • pd.DataFrame: The deduplicated DataFrame.

Source code in lexos/io/base_loader.py
@validate_call(config=model_config)
def dedupe(self, subset: Optional[list[str]] = None) -> pd.DataFrame:
    """Deduplicate a DataFrame.

    Args:
        subset (Optional[list[str]]): The columns to consider for deduplication.

    Returns:
        pd.DataFrame: The deduplicated DataFrame.
    """
    if not self.df.empty:
        df = self.df.copy()
        df.drop_duplicates(
            subset=subset, keep="first", inplace=True, ignore_index=True
        )
        self.paths = df["path"].tolist()
        self.mime_types = df["mime_type"].tolist()
        self.names = df["name"].tolist()
        self.texts = df["text"].tolist()
    # Return the DataFrame so the documented return value is honoured.
    return self.df
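
A usage sketch, pairing dedupe with show_duplicates (column names from the loader's records):

dupes = loader.show_duplicates(subset=["text"])  # inspect duplicates first
if dupes is not None and not dupes.empty:
    loader.dedupe(subset=["text"])               # then drop repeats, keeping the first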

load_csv(path: io.StringIO | os.PathLike | Path | str, name_col: Optional[str] = 'name', text_col: Optional[str] = 'text', **kwargs) -> None ¤

Load a csv file.

Parameters:

  • path (io.StringIO | os.PathLike | Path | str): The path to the file. Required.
  • name_col (Optional[str], default: 'name'): The column name for the names.
  • text_col (Optional[str], default: 'text'): The column name for the texts.

Source code in lexos/io/data_loader.py
@validate_call(config=model_config)
def load_csv(
    self,
    path: io.StringIO | os.PathLike | Path | str,
    name_col: Optional[str] = "name",
    text_col: Optional[str] = "text",
    **kwargs,
) -> None:
    """Load a csv file.

    Args:
        path (io.StringIO | os.PathLike | Path | str): The path to the file.
        name_col (Optional[str]): The column name for the names.
        text_col (Optional[str]): The column name for the texts.
    """
    try:
        df = pd.read_csv(path, **kwargs)
    except BaseException as e:
        raise LexosException(e)
    if not isinstance(path, (Path, str)):
        path = "csv_string"
    if "sep" in kwargs and kwargs["sep"] == "\t":
        mime_type = "text/tab-separated-values"
    else:
        mime_type = "text/csv"
    if name_col:
        df = df.rename(columns={name_col: "name"})
    if text_col:
        df = df.rename(columns={text_col: "text"})
    if "name" not in df.columns or "text" not in df.columns:
        err = (
            "CSV and TSV files must contain headers named `name` and `text`. ",
            "You can convert the names of existing headers to these with the ",
            "`name_col` and `text_col` parameters.",
        )
        raise LexosException("".join(err))
    self._update_data(path, df, mime_type)
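
For example, loading a CSV whose headers differ from the expected names, and then a TSV (file names hypothetical):

loader = DataLoader()
loader.load_csv("corpus.csv", name_col="Title", text_col="Body")
loader.load_csv("corpus.tsv", sep="\t")  # sep is forwarded to pandas; mime type becomes text/tab-separated-values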

load_dataset(dataset: Self) -> None ¤

Load a dataset.

Parameters:

  • dataset (DataLoader): The dataset to load. Required.

Note: As of v2.10.5, Pydantic does not support recursive types (Self). As a result, this method performs its own check to see if the value of dataset is of type DataLoader.

Source code in lexos/io/data_loader.py
def load_dataset(self, dataset: Self) -> None:
    """Load a dataset.

    Args:
        dataset (DataLoader): The dataset to load.

    Note: As of v2.10.5, Pydantic does not support recursive types (Self).
        As a result, this method performs its own check to see if the
        value of `dataset` is of type `DataLoader`.
    """
    if not isinstance(dataset, DataLoader):
        raise LexosException("Invalid dataset type.")
    self.paths = self.paths + dataset.paths
    self.mime_types = self.mime_types + dataset.mime_types
    self.names = self.names + dataset.names
    self.texts = self.texts + dataset.texts
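
For example, merging the contents of one loader into another (file name hypothetical):

main_loader = DataLoader()
extra_loader = DataLoader()
extra_loader.load_csv("more_texts.csv")
main_loader.load_dataset(extra_loader)  # appends extra_loader's paths, names, mime types, and texts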

load_excel(path: Path | str, name_col: str, text_col: str, **kwargs) -> None ¤

Load an Excel file.

Parameters:

  • path (Path | str): The path to the file. Required.
  • name_col (str): The column name for the names. Required.
  • text_col (str): The column name for the texts. Required.

Source code in lexos/io/data_loader.py
@validate_call(config=model_config)  # pragma: no cover
def load_excel(  # pragma: no cover
    self, path: Path | str, name_col: str, text_col: str, **kwargs
) -> None:
    """Load an Excel file.

    Args:
        path (Path | str): The path to the file.
        name_col (str): The column name for the names.
        text_col (str): The column name for the texts.
    """
    try:
        df = pd.read_excel(path, **kwargs)
    except BaseException as e:
        raise LexosException(e)
    if not isinstance(path, (Path, str)):
        path = "buffer"
    if name_col:
        df = df.rename(columns={name_col: "name"})
    if text_col:
        df = df.rename(columns={text_col: "text"})
    if "name" not in df.columns or "text" not in df.columns:
        err = (
            "Excel files must contain headers named `name` and `text`. ",
            "You can convert the names of existing headers to these with the ",
            "`name_col` and `text_col` parameters.",
        )
        raise LexosException("".join(err))
    self._update_data(
        path,
        df,
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )
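
A usage sketch (workbook and column names hypothetical; extra keyword arguments are forwarded to pandas):

loader = DataLoader()
loader.load_excel(
    "corpus.xlsx",
    name_col="Title",  # renamed to `name`
    text_col="Body",   # renamed to `text`
    sheet_name=0,      # forwarded to pandas
)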

load_json(path: io.StringIO | os.PathLike | Path | str, name_field: Optional[str] = 'name', text_field: Optional[str] = 'text', **kwargs) -> None ¤

Load a JSON file.

Parameters:

  • path (io.StringIO | os.PathLike | Path | str): The path to the file. Required.
  • name_field (Optional[str], default: 'name'): The field name for the names.
  • text_field (Optional[str], default: 'text'): The field name for the texts.

Source code in lexos/io/data_loader.py
@validate_call(config=model_config)
def load_json(
    self,
    path: io.StringIO | os.PathLike | Path | str,
    name_field: Optional[str] = "name",
    text_field: Optional[str] = "text",
    **kwargs,
) -> None:
    """Load a JSON file.

    Args:
        path (io.StringIO | os.PathLike | Path | str): The path to the file.
        name_field (Optional[str]): The field name for the names.
        text_field (Optional[str]): The field name for the texts.
    """
    try:
        df = pd.read_json(path, **kwargs)
    except BaseException as e:
        raise LexosException(e)
    if not isinstance(path, (Path, str)):
        path = "json_string"
    if name_field:
        df = df.rename(columns={name_field: "name"})
    if text_field:
        df = df.rename(columns={text_field: "text"})
    if "name" not in df.columns or "text" not in df.columns:
        err = (
            "JSON files must contain fields named `name` and `text`. ",
            "You can convert the names of existing headers to these with the ",
            "`name_field` and `text_field` parameters.",
        )
        raise LexosException("".join(err))
    self._update_data(path, df, "application/json")
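
For example, loading from an in-memory JSON string (contents hypothetical):

import io

json_data = io.StringIO('[{"title": "Doc 1", "body": "Some text."}]')
loader = DataLoader()
loader.load_json(json_data, name_field="title", text_field="body")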

load_lineated_text(path: io.StringIO | os.PathLike | Path | str, names: Optional[list[str]] = None, start: Optional[int] = 1, zero_pad: Optional[str] = '03') -> None ¤

Load a list of texts.

Parameters:

  • path (io.StringIO | os.PathLike | Path | str): The path to the file. Required.
  • names (Optional[list[str]], default: None): The list of names for the texts.
  • start (Optional[int], default: 1): The starting index for the names if no list is provided.
  • zero_pad (Optional[str], default: '03'): The zero-padding format for auto-generated names if no list is provided.

Source code in lexos/io/data_loader.py
@validate_call(config=model_config)
def load_lineated_text(
    self,
    path: io.StringIO | os.PathLike | Path | str,
    names: Optional[list[str]] = None,
    start: Optional[int] = 1,
    zero_pad: Optional[str] = "03",
) -> None:
    """Load a list of texts.

    Args:
        path (io.StringIO | os.PathLike | Path | str): The path to the file.
        names (Optional[list[str]]): The list of names for the texts.
        start (Optional[int]): The starting index for the names if no list is provided.
        zero_pad (Optional[str]): The zero-padding format for auto-generated names if no list is provided.
    """
    if isinstance(path, io.StringIO):
        # A StringIO buffer cannot be opened as a file; read it directly.
        texts = path.read().split("\n")
    else:
        try:
            with open(path, "rb") as f:
                texts = f.readlines()
        except (FileNotFoundError, IOError, OSError):
            # Not a readable file; treat the input as a newline-delimited string.
            texts = path.split("\n")
        except BaseException as e:
            raise LexosException(e)
    if names is None:
        names = [f"text{i + start:{zero_pad}d}" for i in range(len(texts))]
    self.paths = ["text_string"] * len(texts)
    self.names = names
    self.mime_types = ["text/plain"] * len(texts)
    self.texts = [decode(text) for text in texts]
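
For example, loading a newline-delimited string directly (contents hypothetical):

loader = DataLoader()
loader.load_lineated_text("First line.\nSecond line.\nThird line.")
# With the defaults, the generated names are text001, text002, text003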

reset() -> None ¤

Reset the class attributes to empty lists.

Source code in lexos/io/base_loader.py
@validate_call(config=model_config)
def reset(self) -> None:
    """Reset the class attributes to empty lists."""
    self.paths = []
    self.mime_types = []
    self.names = []
    self.texts = []
    self.errors = []

show_duplicates(subset: Optional[list[str]] = None) -> pd.DataFrame | None ¤

Show duplicates in a DataFrame.

Parameters:

  • subset (Optional[list[str]], default: None): The columns to consider for checking duplicates.

Returns:

  • pd.DataFrame | None: The DataFrame with duplicates, or None if no data is loaded.

Source code in lexos/io/base_loader.py
@validate_call(config=model_config)
def show_duplicates(
    self, subset: Optional[list[str]] = None
) -> pd.DataFrame | None:
    """Show duplicates in a DataFrame.

    Args:
        subset (Optional[list[str]]): The columns to consider for checking duplicates.

    Returns:
        pd.DataFrame | None: The DataFrame with duplicates, or None if no data is loaded.
    """
    if not self.df.empty:
        df = self.df.copy()
        return df[df.duplicated(subset=subset)]
    return None

to_csv(path: Path | str, **kwargs) -> None ¤

Save the data to a csv file.

Parameters:

  • path (Path | str): The path to save the csv file. Required.

Source code in lexos/io/base_loader.py
@validate_call(config=model_config)
def to_csv(self, path: Path | str, **kwargs) -> None:
    """Save the data to a csv file.

    Args:
        path (Path | str): The path to save the csv file.
    """
    self.df.to_csv(path, **kwargs)

to_excel(path: Path | str, **kwargs) -> None ¤

Save the data to an Excel file.

Parameters:

  • path (Path | str): The path to save the Excel file. Required.

Source code in lexos/io/base_loader.py
@validate_call(config=model_config)
def to_excel(self, path: Path | str, **kwargs) -> None:
    """Save the data to an Excel file.

    Args:
        path (Path | str): The path to save the Excel file.
    """
    self.df.to_excel(path, **kwargs)

to_json(path: Path | str, **kwargs) -> None ¤

Save the data to a json file.

Parameters:

  • path (Path | str): The path to save the json file. Required.

Source code in lexos/io/base_loader.py
@validate_call(config=model_config)
def to_json(self, path: Path | str, **kwargs) -> None:
    """Save the data to a json file.

    Args:
        path (Path | str): The path to save the json file.
    """
    self.df.to_json(path, **kwargs)
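
A round-trip sketch for the save methods (output paths hypothetical; keyword arguments are forwarded to pandas):

loader.to_csv("corpus_out.csv", index=False)
loader.to_json("corpus_out.json", orient="records")
loader.to_excel("corpus_out.xlsx")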

_update_data(path: Path | str, df: pd.DataFrame, mime_type: str = 'text/plain') -> None ¤

Update the DataLoader.

Parameters:

  • path (Path | str): The path to the file. Required.
  • df (pd.DataFrame): The DataFrame to update with. Required.
  • mime_type (str, default: 'text/plain'): The mime type of the file.

Source code in lexos/io/data_loader.py
def _update_data(
    self, path: Path | str, df: pd.DataFrame, mime_type: str = "text/plain"
) -> None:
    """Update the DataLoader.

    Args:
        path (Path | str): The path to the file.
        df (pd.DataFrame): The DataFrame to update with.
        mime_type (str): The mime type of the file.
    """
    new_names = df["name"].tolist()
    self.names = self.names + new_names
    # Pad paths and mime types for the newly added rows only.
    length = len(new_names)
    self.paths = self.paths + [str(path)] * length
    self.mime_types = self.mime_types + [mime_type] * length
    self.texts = self.texts + [decode(text) for text in df["text"].tolist()]
