Data Loader¤

The `DataLoader` Class¤

The `DataLoader` class is the main class for loading datasets in various formats. It tries to detect the format automatically, as far as that is possible, so that you can use a common interface to load content regardless of its source.

`Dataset` `dataclass` ¤

Dataset class.

Source code in lexos/io/data_loader.py

@dataclass
class Dataset:
    """A single loaded text together with the metadata recorded for it."""

    # Name identifying the text.
    name: str
    # Path (or source label) the text was loaded from.
    path: str
    # MIME type recorded for the source.
    mime_type: str
    # The text content itself.
    text: str

`DataLoader` `pydantic-model` ¤

Bases: BaseLoader

DataLoader.

Config:

arbitrary_types_allowed: True

Fields:

paths (list)
mime_types (list)
names (list)
texts (list)
errors (list)

Source code in lexos/io/data_loader.py

class DataLoader(BaseLoader):
    """Load named texts from CSV/TSV, Excel, JSON, and line-delimited sources.

    Every loader records one entry per text in the parallel lists `paths`,
    `mime_types`, `names`, and `texts`.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    def __init__(self):
        """Initialize the DataLoader."""
        super().__init__()

    def __iter__(self) -> Generator[Dataset, None, None]:
        """Make the class iterable.

        Yields:
            Dataset: A Dataset object containing the name, path, mime_type, and text of each dataset item.

        Note: Overrides the BaseLoader's __iter__ method to yield Dataset objects.
        """
        for i in range(len(self.data["paths"])):
            yield Dataset(
                name=self.data["names"][i],
                path=self.data["paths"][i],
                mime_type=self.data["mime_types"][i],
                text=self.data["texts"][i],
            )

    def _update_data(
        self, path: Path | str, df: pd.DataFrame, mime_type: str = "text/plain"
    ) -> None:
        """Append the rows of a DataFrame to the loader.

        Args:
            path (Path | str): The path (or source label) recorded for every row.
            df (pd.DataFrame): The DataFrame to append; must have `name` and `text` columns.
            mime_type (str): The mime type recorded for every row.
        """
        new_names = df["name"].tolist()
        # Size the path/mime-type entries by the rows being added, not by the
        # running total, so the parallel lists stay the same length when the
        # loader already holds data.
        added = len(new_names)
        self.names = self.names + new_names
        self.paths = self.paths + [str(path)] * added
        self.mime_types = self.mime_types + [mime_type] * added
        self.texts = self.texts + [decode(text) for text in df["text"].tolist()]

    @validate_call(config=model_config)
    def load_csv(
        self,
        path: io.StringIO | os.PathLike | Path | str,
        name_col: Optional[str] = "name",
        text_col: Optional[str] = "text",
        **kwargs,
    ) -> None:
        """Load a CSV or TSV file.

        Args:
            path (io.StringIO | os.PathLike | Path | str): The path to the file, or a buffer holding the data.
            name_col (Optional[str]): The column holding the names; it is renamed to `name`.
            text_col (Optional[str]): The column holding the texts; it is renamed to `text`.
            **kwargs: Extra keyword arguments passed to `pandas.read_csv`.

        Raises:
            LexosException: If pandas cannot read the source, or if the `name`
                and `text` columns are not both present after renaming.
        """
        try:
            df = pd.read_csv(path, **kwargs)
        except Exception as e:
            # Catch Exception, not BaseException, so KeyboardInterrupt and
            # SystemExit are not converted into a LexosException.
            raise LexosException(e) from e
        if not isinstance(path, (Path, str)):
            path = "csv_string"
        # pandas accepts `delimiter` as an alias for `sep`.
        if "\t" in (kwargs.get("sep"), kwargs.get("delimiter")):
            mime_type = "text/tab-separated-values"
        else:
            mime_type = "text/csv"
        if name_col:
            df = df.rename(columns={name_col: "name"})
        if text_col:
            df = df.rename(columns={text_col: "text"})
        if "name" not in df.columns or "text" not in df.columns:
            err = (
                "CSV and TSV files must contain headers named `name` and `text`. ",
                "You can convert the names of existing headers to these with the ",
                "`name_col` and `text_col` parameters.",
            )
            raise LexosException("".join(err))
        self._update_data(path, df, mime_type)

    # @validate_call(config=model_config)
    def load_dataset(self, dataset: Self) -> None:
        """Append the contents of another DataLoader to this one.

        Args:
            dataset (DataLoader): The loader whose paths, mime types, names, and texts are appended.

        Raises:
            LexosException: If `dataset` is not a `DataLoader` instance.

        Note: As of v2.10.5, Pydantic does not support recursive types (Self).
            As a result, this method performs its own check to see if the
            value of `dataset` is of type `DataLoader`.
        """
        if not isinstance(dataset, DataLoader):
            raise LexosException("Invalid dataset type.")
        self.paths = self.paths + dataset.paths
        self.mime_types = self.mime_types + dataset.mime_types
        self.names = self.names + dataset.names
        self.texts = self.texts + dataset.texts

    # Skipped for coverage, same method as load_csv
    @validate_call(config=model_config)  # pragma: no cover
    def load_excel(  # pragma: no cover
        self, path: Path | str, name_col: str, text_col: str, **kwargs
    ) -> None:
        """Load an Excel file.

        Args:
            path (Path | str): The path to the file.
            name_col (str): The column holding the names; it is renamed to `name`.
            text_col (str): The column holding the texts; it is renamed to `text`.
            **kwargs: Extra keyword arguments passed to `pandas.read_excel`.

        Raises:
            LexosException: If pandas cannot read the file, or if the `name`
                and `text` columns are not both present after renaming.
        """
        try:
            # Excel workbooks must be parsed with read_excel, not read_csv.
            df = pd.read_excel(path, **kwargs)
        except Exception as e:
            raise LexosException(e) from e
        if not isinstance(path, (Path, str)):
            path = "buffer"
        if name_col:
            df = df.rename(columns={name_col: "name"})
        if text_col:
            df = df.rename(columns={text_col: "text"})
        if "name" not in df.columns or "text" not in df.columns:
            err = (
                "Excel files must contain headers named `name` and `text`. ",
                "You can convert the names of existing headers to these with the ",
                "`name_col` and `text_col` parameters.",
            )
            raise LexosException("".join(err))
        # The helper is named `_update_data`; there is no `_update` method here.
        self._update_data(
            path,
            df,
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        )

    @validate_call(config=model_config)
    def load_json(
        self,
        path: io.StringIO | os.PathLike | Path | str,
        name_field: Optional[str] = "name",
        text_field: Optional[str] = "text",
        **kwargs,
    ) -> None:
        """Load a JSON file.

        Args:
            path (io.StringIO | os.PathLike | Path | str): The path to the file, or a buffer holding the data.
            name_field (Optional[str]): The field holding the names; it is renamed to `name`.
            text_field (Optional[str]): The field holding the texts; it is renamed to `text`.
            **kwargs: Extra keyword arguments passed to `pandas.read_json`.

        Raises:
            LexosException: If pandas cannot read the source, or if the `name`
                and `text` fields are not both present after renaming.
        """
        try:
            df = pd.read_json(path, **kwargs)
        except Exception as e:
            raise LexosException(e) from e
        if not isinstance(path, (Path, str)):
            path = "json_string"
        if name_field:
            df = df.rename(columns={name_field: "name"})
        if text_field:
            df = df.rename(columns={text_field: "text"})
        if "name" not in df.columns or "text" not in df.columns:
            err = (
                "JSON files must contain fields named `name` and `text`. ",
                "You can convert the names of existing headers to these with the ",
                "`name_field` and `text_field` parameters.",
            )
            raise LexosException("".join(err))
        self._update_data(path, df, "application/json")

    @validate_call(config=model_config)
    def load_lineated_text(
        self,
        path: io.StringIO | os.PathLike | Path | str,
        names: Optional[list[str]] = None,
        start: Optional[int] = 1,
        zero_pad: Optional[str] = "03",
    ) -> None:
        """Load texts stored one per line.

        Args:
            path (io.StringIO | os.PathLike | Path | str): A path to a file, a
                string containing the newline-separated texts, or a StringIO
                buffer containing them.
            names (Optional[list[str]]): The list of names for the texts.
            start (Optional[int]): The starting index for the names if no list is provided.
            zero_pad (Optional[str]): The zero padding for the names increments if no list is provided.

        Raises:
            LexosException: If a path-like object cannot be opened, or if
                reading fails for any other reason.
        """
        if isinstance(path, io.StringIO):
            # An in-memory buffer cannot be passed to open(); read it directly.
            texts = path.getvalue().split("\n")
        else:
            try:
                with open(path, "rb") as f:
                    texts = f.readlines()
            except OSError as e:
                # Only a plain string can be the text itself; a path-like
                # object that cannot be opened is a genuine error.
                if not isinstance(path, str):
                    raise LexosException(e) from e
                texts = path.split("\n")
            except Exception as e:
                raise LexosException(e) from e
        if names is None:
            names = [f"text{i + start:{zero_pad}d}" for i in range(len(texts))]
        # NOTE(review): unlike the other loaders, this replaces any data
        # already held instead of appending to it - confirm that is intended.
        self.paths = ["text_string"] * len(texts)
        self.names = names
        self.mime_types = ["text/plain"] * len(texts)
        self.texts = [decode(text) for text in texts]

`data: dict[str, list]` `property` ¤

Get the data as a dictionary.

Returns:

Type	Description
`dict[str, list]`	dict[str, list]: A dictionary containing the paths, mime_types, names, texts, and errors.

`df: pd.DataFrame` `property` ¤

Get a pandas DataFrame of file records.

Returns:

Type	Description
`DataFrame`	pandas.DataFrame: A DataFrame containing file metadata and content.

`errors: list = []` `pydantic-field` ¤

The list of loading errors.

`mime_types: list = []` `pydantic-field` ¤

The list of text mime types.

`names: list = []` `pydantic-field` ¤

The list of text names.

`paths: list = []` `pydantic-field` ¤

The list of paths.

`records: list[dict[str, str]]` `property` ¤

Get a list of file records.

Returns:

Type	Description
`list[dict[str, str]]`	list[dict]: List of dictionaries containing file metadata and content.
`list[dict[str, str]]`	Each dict has keys: path, mime_type, name, text

Raises:

Type	Description
`ValueError`	If the lengths of paths, mime_types, names and texts don't match.

Note

Validates that all lists have the same length before returning the records.

`texts: list = []` `pydantic-field` ¤

The list of loaded texts.

`__init__()` ¤

Initialize the DataLoader.

Source code in lexos/io/data_loader.py

def __init__(self):
    """Create an empty DataLoader by delegating to the parent initializer."""
    super().__init__()

`__iter__() -> Generator[Dataset, None, None]` ¤

Make the class iterable.

Yields:

Name	Type	Description
`Dataset`	`Dataset`	A Dataset object containing the name, path, mime_type, and text of each dataset item.

Note: Overrides the BaseLoader's `__iter__` method to yield Dataset objects.

Source code in lexos/io/data_loader.py

def __iter__(self) -> Generator[Dataset, None, None]:
    """Iterate over the loaded items as Dataset objects.

    Yields:
        Dataset: One object per entry in `self.data["paths"]`, carrying the
            name, path, mime_type, and text stored at the same index.

    Note: Overrides the BaseLoader's __iter__ method to yield Dataset objects.
    """
    total = len(self.data["paths"])
    for idx in range(total):
        # Re-read the data mapping for every item, as the lists are indexed
        # in parallel by position.
        record = self.data
        yield Dataset(
            name=record["names"][idx],
            path=record["paths"][idx],
            mime_type=record["mime_types"][idx],
            text=record["texts"][idx],
        )

`dedupe(subset: Optional[list[str]] = None) -> pd.DataFrame` ¤

Deduplicate a DataFrame.

Parameters:

Name	Type	Description	Default
`subset`	`Optional[list[str]]`	The columns to consider for deduplication.	`None`

Returns:

Type	Description
`DataFrame`	pd.DataFrame: The deduplicated DataFrame.

Source code in lexos/io/base_loader.py

@validate_call(config=model_config)
def dedupe(self, subset: Optional[list[str]] = None) -> pd.DataFrame:
    """Remove duplicate records from the loader.

    The first occurrence of each duplicate is kept, and the loader's `paths`,
    `mime_types`, `names`, and `texts` are replaced with the deduplicated
    values.

    Args:
        subset (Optional[list[str]]): The columns to consider for deduplication.

    Returns:
        pd.DataFrame: The deduplicated DataFrame (the empty DataFrame when
            nothing has been loaded).
    """
    records = self.df
    if records.empty:
        return records
    deduped = records.drop_duplicates(subset=subset, keep="first", ignore_index=True)
    self.paths = deduped["path"].tolist()
    self.mime_types = deduped["mime_type"].tolist()
    self.names = deduped["name"].tolist()
    self.texts = deduped["text"].tolist()
    # Return the result, as the signature and documentation promise; the
    # original fell through and returned None.
    return deduped

`load_csv(path: io.StringIO | os.PathLike | Path | str, name_col: Optional[str] = 'name', text_col: Optional[str] = 'text', **kwargs) -> None` ¤

Load a csv file.

Parameters:

Name	Type	Description	Default
`path`	`StringIO \| PathLike \| Path \| str`	The path to the file.	required
`name_col`	`Optional[str]`	The column name for the names.	`'name'`
`text_col`	`Optional[str]`	The column name for the texts.	`'text'`

Source code in lexos/io/data_loader.py

@validate_call(config=model_config)
def load_csv(
    self,
    path: io.StringIO | os.PathLike | Path | str,
    name_col: Optional[str] = "name",
    text_col: Optional[str] = "text",
    **kwargs,
) -> None:
    """Load a CSV or TSV file.

    Args:
        path (io.StringIO | os.PathLike | Path | str): The path to the file, or a buffer holding the data.
        name_col (Optional[str]): The column holding the names; it is renamed to `name`.
        text_col (Optional[str]): The column holding the texts; it is renamed to `text`.
        **kwargs: Extra keyword arguments passed to `pandas.read_csv`.

    Raises:
        LexosException: If pandas cannot read the source, or if the `name`
            and `text` columns are not both present after renaming.
    """
    try:
        df = pd.read_csv(path, **kwargs)
    except Exception as e:
        # Catch Exception, not BaseException, so KeyboardInterrupt and
        # SystemExit are not converted into a LexosException.
        raise LexosException(e) from e
    if not isinstance(path, (Path, str)):
        path = "csv_string"
    # pandas accepts `delimiter` as an alias for `sep`.
    if "\t" in (kwargs.get("sep"), kwargs.get("delimiter")):
        mime_type = "text/tab-separated-values"
    else:
        mime_type = "text/csv"
    if name_col:
        df = df.rename(columns={name_col: "name"})
    if text_col:
        df = df.rename(columns={text_col: "text"})
    if "name" not in df.columns or "text" not in df.columns:
        err = (
            "CSV and TSV files must contain headers named `name` and `text`. ",
            "You can convert the names of existing headers to these with the ",
            "`name_col` and `text_col` parameters.",
        )
        raise LexosException("".join(err))
    self._update_data(path, df, mime_type)

`load_dataset(dataset: Self) -> None` ¤

Load a dataset.

Parameters:

Name	Type	Description	Default
`dataset`	`DataLoader`	The dataset to load.	required

As of v2.10.5, Pydantic does not support recursive types (Self).

As a result, this method performs its own check to see if the value of `dataset` is of type `DataLoader`.

Source code in lexos/io/data_loader.py

def load_dataset(self, dataset: Self) -> None:
    """Append the contents of another DataLoader to this one.

    Args:
        dataset (DataLoader): The loader whose paths, mime types, names, and texts are appended.

    Raises:
        LexosException: If `dataset` is not a `DataLoader` instance.

    Note: As of v2.10.5, Pydantic does not support recursive types (Self).
        As a result, this method performs its own check to see if the
        value of `dataset` is of type `DataLoader`.
    """
    if isinstance(dataset, DataLoader):
        # Extend each parallel list with the other loader's entries.
        self.paths = [*self.paths, *dataset.paths]
        self.mime_types = [*self.mime_types, *dataset.mime_types]
        self.names = [*self.names, *dataset.names]
        self.texts = [*self.texts, *dataset.texts]
        return
    raise LexosException("Invalid dataset type.")

`load_excel(path: Path | str, name_col: str, text_col: str, **kwargs) -> None` ¤

Load an Excel file.

Parameters:

Name	Type	Description	Default
`path`	`Path \| str`	The path to the file.	required
`name_col`	`str`	The column name for the names.	required
`text_col`	`str`	The column name for the texts.	required

Source code in lexos/io/data_loader.py

@validate_call(config=model_config)  # pragma: no cover
def load_excel(  # pragma: no cover
    self, path: Path | str, name_col: str, text_col: str, **kwargs
) -> None:
    """Load an Excel file.

    Args:
        path (Path | str): The path to the file.
        name_col (str): The column holding the names; it is renamed to `name`.
        text_col (str): The column holding the texts; it is renamed to `text`.
        **kwargs: Extra keyword arguments passed to `pandas.read_excel`.

    Raises:
        LexosException: If pandas cannot read the file, or if the `name`
            and `text` columns are not both present after renaming.
    """
    try:
        # Excel workbooks must be parsed with read_excel, not read_csv.
        df = pd.read_excel(path, **kwargs)
    except Exception as e:
        raise LexosException(e) from e
    if not isinstance(path, (Path, str)):
        path = "buffer"
    if name_col:
        df = df.rename(columns={name_col: "name"})
    if text_col:
        df = df.rename(columns={text_col: "text"})
    if "name" not in df.columns or "text" not in df.columns:
        err = (
            "Excel files must contain headers named `name` and `text`. ",
            "You can convert the names of existing headers to these with the ",
            "`name_col` and `text_col` parameters.",
        )
        raise LexosException("".join(err))
    # The helper is named `_update_data`; there is no `_update` method here.
    self._update_data(
        path,
        df,
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )

`load_json(path: io.StringIO | os.PathLike | Path | str, name_field: Optional[str] = 'name', text_field: Optional[str] = 'text', **kwargs) -> None` ¤

Load a JSON file.

Parameters:

Name	Type	Description	Default
`path`	`StringIO \| PathLike \| Path \| str`	The path to the file.	required
`name_field`	`Optional[str]`	The field name for the names.	`'name'`
`text_field`	`Optional[str]`	The field name for the texts.	`'text'`

Source code in lexos/io/data_loader.py

@validate_call(config=model_config)
def load_json(
    self,
    path: io.StringIO | os.PathLike | Path | str,
    name_field: Optional[str] = "name",
    text_field: Optional[str] = "text",
    **kwargs,
) -> None:
    """Load a JSON file.

    Args:
        path (io.StringIO | os.PathLike | Path | str): The path to the file, or a buffer holding the data.
        name_field (Optional[str]): The field holding the names; it is renamed to `name`.
        text_field (Optional[str]): The field holding the texts; it is renamed to `text`.
        **kwargs: Extra keyword arguments passed to `pandas.read_json`.

    Raises:
        LexosException: If pandas cannot read the source, or if the `name`
            and `text` fields are not both present after renaming.
    """
    try:
        df = pd.read_json(path, **kwargs)
    except Exception as e:
        # Catch Exception, not BaseException, so KeyboardInterrupt and
        # SystemExit are not converted into a LexosException.
        raise LexosException(e) from e
    if not isinstance(path, (Path, str)):
        path = "json_string"
    if name_field:
        df = df.rename(columns={name_field: "name"})
    if text_field:
        df = df.rename(columns={text_field: "text"})
    if "name" not in df.columns or "text" not in df.columns:
        err = (
            "JSON files must contain fields named `name` and `text`. ",
            "You can convert the names of existing headers to these with the ",
            "`name_field` and `text_field` parameters.",
        )
        raise LexosException("".join(err))
    self._update_data(path, df, "application/json")

`load_lineated_text(path: io.StringIO | os.PathLike | Path | str, names: Optional[list[str]] = None, start: Optional[int] = 1, zero_pad: Optional[str] = '03') -> None` ¤

Load a list of texts.

Parameters:

Name	Type	Description	Default
`path`	`StringIO \| PathLike \| Path \| str`	The path to the file.	required
`names`	`Optional[list[str]]`	The list of names for the texts.	`None`
`start`	`Optional[int]`	The starting index for the names if no list is provided.	`1`
`zero_pad`	`Optional[str]`	The zero padding for the names increments if no list is provided.	`'03'`

Source code in lexos/io/data_loader.py

@validate_call(config=model_config)
def load_lineated_text(
    self,
    path: io.StringIO | os.PathLike | Path | str,
    names: Optional[list[str]] = None,
    start: Optional[int] = 1,
    zero_pad: Optional[str] = "03",
) -> None:
    """Load texts stored one per line.

    Args:
        path (io.StringIO | os.PathLike | Path | str): A path to a file, a
            string containing the newline-separated texts, or a StringIO
            buffer containing them.
        names (Optional[list[str]]): The list of names for the texts.
        start (Optional[int]): The starting index for the names if no list is provided.
        zero_pad (Optional[str]): The zero padding for the names increments if no list is provided.

    Raises:
        LexosException: If a path-like object cannot be opened, or if
            reading fails for any other reason.
    """
    if isinstance(path, io.StringIO):
        # An in-memory buffer cannot be passed to open(); read it directly.
        texts = path.getvalue().split("\n")
    else:
        try:
            with open(path, "rb") as f:
                texts = f.readlines()
        except OSError as e:
            # Only a plain string can be the text itself; a path-like
            # object that cannot be opened is a genuine error.
            if not isinstance(path, str):
                raise LexosException(e) from e
            texts = path.split("\n")
        except Exception as e:
            raise LexosException(e) from e
    if names is None:
        names = [f"text{i + start:{zero_pad}d}" for i in range(len(texts))]
    # NOTE(review): unlike the other loaders, this replaces any data
    # already held instead of appending to it - confirm that is intended.
    self.paths = ["text_string"] * len(texts)
    self.names = names
    self.mime_types = ["text/plain"] * len(texts)
    self.texts = [decode(text) for text in texts]

`reset() -> None` ¤

Reset the class attributes to empty lists.

Source code in lexos/io/base_loader.py

@validate_call(config=model_config)
def reset(self) -> None:
    """Clear all loaded data by replacing each list attribute with a new empty list."""
    for attribute in ("paths", "mime_types", "names", "texts", "errors"):
        # Each attribute gets its own fresh list rather than a shared one.
        setattr(self, attribute, [])

`show_duplicates(subset: Optional[list[str]] = None) -> pd.DataFrame | None` ¤

Show duplicates in a DataFrame.

Parameters:

Name	Type	Description	Default
`subset`	`Optional[list[str]]`	The columns to consider for checking duplicates.	`None`

Returns:

Type	Description
`DataFrame \| None`	pd.DataFrame: The DataFrame with duplicates.

Source code in lexos/io/base_loader.py

@validate_call(config=model_config)
def show_duplicates(
    self, subset: Optional[list[str]] = None
) -> pd.DataFrame | None:
    """Return the records that duplicate an earlier record.

    Args:
        subset (Optional[list[str]]): The columns to consider for checking duplicates.

    Returns:
        pd.DataFrame | None: The rows flagged as duplicates, or None when the
            loader's DataFrame is empty.
    """
    if self.df.empty:
        return None
    frame = self.df.copy()
    duplicate_mask = frame.duplicated(subset=subset)
    return frame[duplicate_mask]

`to_csv(path: Path | str, **kwargs) -> None` ¤

Save the data to a csv file.

Parameters:

Name	Type	Description	Default
`path`	`Path \| str`	The path to save the csv file.	required

Source code in lexos/io/base_loader.py

@validate_call(config=model_config)
def to_csv(self, path: Path | str, **kwargs) -> None:
    """Write the loader's DataFrame to a csv file.

    Args:
        path (Path | str): The destination of the csv file.
        **kwargs: Extra keyword arguments forwarded to `DataFrame.to_csv`.
    """
    frame = self.df
    frame.to_csv(path, **kwargs)

`to_excel(path: Path | str, **kwargs) -> None` ¤

Save the data to an Excel file.

Parameters:

Name	Type	Description	Default
`path`	`Path \| str`	The path to save the Excel file.	required

Source code in lexos/io/base_loader.py

@validate_call(config=model_config)
def to_excel(self, path: Path | str, **kwargs) -> None:
    """Save the data to an Excel file.

    Args:
        path (Path | str): The path to save the Excel file.
        **kwargs: Extra keyword arguments forwarded to `DataFrame.to_excel`.
    """
    # Write a real workbook; the original called to_csv and so produced
    # CSV text under an Excel file name. pandas needs an Excel writer
    # engine (for example openpyxl) to be installed for this call.
    self.df.to_excel(path, **kwargs)

`to_json(path: Path | str, **kwargs) -> None` ¤

Save the data to a json file.

Parameters:

Name	Type	Description	Default
`path`	`Path \| str`	The path to save the json file.	required

Source code in lexos/io/base_loader.py

@validate_call(config=model_config)
def to_json(self, path: Path | str, **kwargs) -> None:
    """Write the loader's DataFrame to a json file.

    Args:
        path (Path | str): The destination of the json file.
        **kwargs: Extra keyword arguments forwarded to `DataFrame.to_json`.
    """
    frame = self.df
    frame.to_json(path, **kwargs)

`__init__()` ¤

Initialize the DataLoader.

Source code in lexos/io/data_loader.py

def __init__(self):
    """Create an empty DataLoader by delegating to the parent initializer."""
    super().__init__()

`__iter__() -> Generator[Dataset, None, None]` ¤

Make the class iterable.

Yields:

Name	Type	Description
`Dataset`	`Dataset`	A Dataset object containing the name, path, mime_type, and text of each dataset item.

Note: Overrides the BaseLoader's `__iter__` method to yield Dataset objects.

Source code in lexos/io/data_loader.py

def __iter__(self) -> Generator[Dataset, None, None]:
    """Iterate over the loaded items as Dataset objects.

    Yields:
        Dataset: One object per entry in `self.data["paths"]`, carrying the
            name, path, mime_type, and text stored at the same index.

    Note: Overrides the BaseLoader's __iter__ method to yield Dataset objects.
    """
    total = len(self.data["paths"])
    for idx in range(total):
        # Re-read the data mapping for every item, as the lists are indexed
        # in parallel by position.
        record = self.data
        yield Dataset(
            name=record["names"][idx],
            path=record["paths"][idx],
            mime_type=record["mime_types"][idx],
            text=record["texts"][idx],
        )

`_update_data(path: Path | str, df: pd.DataFrame, mime_type: str = 'text/plain') -> None` ¤

Update the DataLoader.

Parameters:

Name	Type	Description	Default
`path`	`Path \| str`	The path to the file.	required
`df`	`DataFrame`	The DataFrame to update with.	required
`mime_type`	`str`	The mime type of the file.	`'text/plain'`

Source code in lexos/io/data_loader.py

def _update_data(
    self, path: Path | str, df: pd.DataFrame, mime_type: str = "text/plain"
) -> None:
    """Append the rows of a DataFrame to the loader.

    Args:
        path (Path | str): The path (or source label) recorded for every row.
        df (pd.DataFrame): The DataFrame to append; must have `name` and `text` columns.
        mime_type (str): The mime type recorded for every row.
    """
    new_names = df["name"].tolist()
    # Size the path/mime-type entries by the rows being added, not by the
    # running total, so the parallel lists stay the same length when the
    # loader already holds data.
    added = len(new_names)
    self.names = self.names + new_names
    self.paths = self.paths + [str(path)] * added
    self.mime_types = self.mime_types + [mime_type] * added
    self.texts = self.texts + [decode(text) for text in df["text"].tolist()]

`load_csv(path: io.StringIO | os.PathLike | Path | str, name_col: Optional[str] = 'name', text_col: Optional[str] = 'text', **kwargs) -> None` ¤

Load a csv file.

Parameters:

Name	Type	Description	Default
`path`	`StringIO \| PathLike \| Path \| str`	The path to the file.	required
`name_col`	`Optional[str]`	The column name for the names.	`'name'`
`text_col`	`Optional[str]`	The column name for the texts.	`'text'`

Source code in lexos/io/data_loader.py

@validate_call(config=model_config)
def load_csv(
    self,
    path: io.StringIO | os.PathLike | Path | str,
    name_col: Optional[str] = "name",
    text_col: Optional[str] = "text",
    **kwargs,
) -> None:
    """Load a CSV or TSV file.

    Args:
        path (io.StringIO | os.PathLike | Path | str): The path to the file, or a buffer holding the data.
        name_col (Optional[str]): The column holding the names; it is renamed to `name`.
        text_col (Optional[str]): The column holding the texts; it is renamed to `text`.
        **kwargs: Extra keyword arguments passed to `pandas.read_csv`.

    Raises:
        LexosException: If pandas cannot read the source, or if the `name`
            and `text` columns are not both present after renaming.
    """
    try:
        df = pd.read_csv(path, **kwargs)
    except Exception as e:
        # Catch Exception, not BaseException, so KeyboardInterrupt and
        # SystemExit are not converted into a LexosException.
        raise LexosException(e) from e
    if not isinstance(path, (Path, str)):
        path = "csv_string"
    # pandas accepts `delimiter` as an alias for `sep`.
    if "\t" in (kwargs.get("sep"), kwargs.get("delimiter")):
        mime_type = "text/tab-separated-values"
    else:
        mime_type = "text/csv"
    if name_col:
        df = df.rename(columns={name_col: "name"})
    if text_col:
        df = df.rename(columns={text_col: "text"})
    if "name" not in df.columns or "text" not in df.columns:
        err = (
            "CSV and TSV files must contain headers named `name` and `text`. ",
            "You can convert the names of existing headers to these with the ",
            "`name_col` and `text_col` parameters.",
        )
        raise LexosException("".join(err))
    self._update_data(path, df, mime_type)

`load_dataset(dataset: Self) -> None` ¤

Load a dataset.

Parameters:

Name	Type	Description	Default
`dataset`	`DataLoader`	The dataset to load.	required

As of v2.10.5, Pydantic does not support recursive types (Self).

As a result, this method performs its own check to see if the value of `dataset` is of type `DataLoader`.

Source code in lexos/io/data_loader.py

def load_dataset(self, dataset: Self) -> None:
    """Append the contents of another DataLoader to this one.

    Args:
        dataset (DataLoader): The loader whose paths, mime types, names, and texts are appended.

    Raises:
        LexosException: If `dataset` is not a `DataLoader` instance.

    Note: As of v2.10.5, Pydantic does not support recursive types (Self).
        As a result, this method performs its own check to see if the
        value of `dataset` is of type `DataLoader`.
    """
    if isinstance(dataset, DataLoader):
        # Extend each parallel list with the other loader's entries.
        self.paths = [*self.paths, *dataset.paths]
        self.mime_types = [*self.mime_types, *dataset.mime_types]
        self.names = [*self.names, *dataset.names]
        self.texts = [*self.texts, *dataset.texts]
        return
    raise LexosException("Invalid dataset type.")

`load_excel(path: Path | str, name_col: str, text_col: str, **kwargs) -> None` ¤

Load an Excel file.

Parameters:

Name	Type	Description	Default
`path`	`Path \| str`	The path to the file.	required
`name_col`	`str`	The column name for the names.	required
`text_col`	`str`	The column name for the texts.	required

Source code in lexos/io/data_loader.py

@validate_call(config=model_config)  # pragma: no cover
def load_excel(  # pragma: no cover
    self, path: Path | str, name_col: str, text_col: str, **kwargs
) -> None:
    """Load an Excel file.

    Args:
        path (Path | str): The path to the file.
        name_col (str): The column holding the names; it is renamed to `name`.
        text_col (str): The column holding the texts; it is renamed to `text`.
        **kwargs: Extra keyword arguments passed to `pandas.read_excel`.

    Raises:
        LexosException: If pandas cannot read the file, or if the `name`
            and `text` columns are not both present after renaming.
    """
    try:
        # Excel workbooks must be parsed with read_excel, not read_csv.
        df = pd.read_excel(path, **kwargs)
    except Exception as e:
        raise LexosException(e) from e
    if not isinstance(path, (Path, str)):
        path = "buffer"
    if name_col:
        df = df.rename(columns={name_col: "name"})
    if text_col:
        df = df.rename(columns={text_col: "text"})
    if "name" not in df.columns or "text" not in df.columns:
        err = (
            "Excel files must contain headers named `name` and `text`. ",
            "You can convert the names of existing headers to these with the ",
            "`name_col` and `text_col` parameters.",
        )
        raise LexosException("".join(err))
    # The helper is named `_update_data`; there is no `_update` method here.
    self._update_data(
        path,
        df,
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )

`load_json(path: io.StringIO | os.PathLike | Path | str, name_field: Optional[str] = 'name', text_field: Optional[str] = 'text', **kwargs) -> None` ¤

Load a JSON file.

Parameters:

Name	Type	Description	Default
`path`	`StringIO \| PathLike \| Path \| str`	The path to the file.	required
`name_field`	`Optional[str]`	The field name for the names.	`'name'`
`text_field`	`Optional[str]`	The field name for the texts.	`'text'`

Source code in lexos/io/data_loader.py

@validate_call(config=model_config)
def load_json(
    self,
    path: io.StringIO | os.PathLike | Path | str,
    name_field: Optional[str] = "name",
    text_field: Optional[str] = "text",
    **kwargs,
) -> None:
    """Load a JSON file.

    Args:
        path (io.StringIO | os.PathLike | Path | str): The path to the file, or a buffer holding the data.
        name_field (Optional[str]): The field holding the names; it is renamed to `name`.
        text_field (Optional[str]): The field holding the texts; it is renamed to `text`.
        **kwargs: Extra keyword arguments passed to `pandas.read_json`.

    Raises:
        LexosException: If pandas cannot read the source, or if the `name`
            and `text` fields are not both present after renaming.
    """
    try:
        df = pd.read_json(path, **kwargs)
    except Exception as e:
        # Catch Exception, not BaseException, so KeyboardInterrupt and
        # SystemExit are not converted into a LexosException.
        raise LexosException(e) from e
    if not isinstance(path, (Path, str)):
        path = "json_string"
    if name_field:
        df = df.rename(columns={name_field: "name"})
    if text_field:
        df = df.rename(columns={text_field: "text"})
    if "name" not in df.columns or "text" not in df.columns:
        err = (
            "JSON files must contain fields named `name` and `text`. ",
            "You can convert the names of existing headers to these with the ",
            "`name_field` and `text_field` parameters.",
        )
        raise LexosException("".join(err))
    self._update_data(path, df, "application/json")

`load_lineated_text(path: io.StringIO | os.PathLike | Path | str, names: Optional[list[str]] = None, start: Optional[int] = 1, zero_pad: Optional[str] = '03') -> None` ¤

Load a list of texts.

Parameters:

Name	Type	Description	Default
`path`	`StringIO \| PathLike \| Path \| str`	The path to the file.	required
`names`	`Optional[list[str]]`	The list of names for the texts.	`None`
`start`	`Optional[int]`	The starting index for the names if no list is provided.	`1`
`zero_pad`	`Optional[str]`	The zero padding for the names increments if no list is provided.	`'03'`

Source code in lexos/io/data_loader.py

@validate_call(config=model_config)
def load_lineated_text(
    self,
    path: io.StringIO | os.PathLike | Path | str,
    names: Optional[list[str]] = None,
    start: Optional[int] = 1,
    zero_pad: Optional[str] = "03",
) -> None:
    """Load texts stored one per line.

    Args:
        path (io.StringIO | os.PathLike | Path | str): A path to a file, a
            string containing the newline-separated texts, or a StringIO
            buffer containing them.
        names (Optional[list[str]]): The list of names for the texts.
        start (Optional[int]): The starting index for the names if no list is provided.
        zero_pad (Optional[str]): The zero padding for the names increments if no list is provided.

    Raises:
        LexosException: If a path-like object cannot be opened, or if
            reading fails for any other reason.
    """
    if isinstance(path, io.StringIO):
        # An in-memory buffer cannot be passed to open(); read it directly.
        texts = path.getvalue().split("\n")
    else:
        try:
            with open(path, "rb") as f:
                texts = f.readlines()
        except OSError as e:
            # Only a plain string can be the text itself; a path-like
            # object that cannot be opened is a genuine error.
            if not isinstance(path, str):
                raise LexosException(e) from e
            texts = path.split("\n")
        except Exception as e:
            raise LexosException(e) from e
    if names is None:
        names = [f"text{i + start:{zero_pad}d}" for i in range(len(texts))]
    # NOTE(review): unlike the other loaders, this replaces any data
    # already held instead of appending to it - confirm that is intended.
    self.paths = ["text_string"] * len(texts)
    self.names = names
    self.mime_types = ["text/plain"] * len(texts)
    self.texts = [decode(text) for text in texts]

`show_duplicates(subset: Optional[list[str]] = None) -> pd.DataFrame | None` ¤

Show duplicates in a DataFrame.

Parameters:

Name	Type	Description	Default
`subset`	`Optional[list[str]]`	The columns to consider for checking duplicates.	`None`

Returns:

Type	Description
`DataFrame \| None`	pd.DataFrame: The DataFrame with duplicates.

Source code in lexos/io/base_loader.py

@validate_call(config=model_config)
def show_duplicates(
    self, subset: Optional[list[str]] = None
) -> pd.DataFrame | None:
    """Return the records that duplicate an earlier record.

    Args:
        subset (Optional[list[str]]): The columns to consider for checking duplicates.

    Returns:
        pd.DataFrame | None: The rows flagged as duplicates, or None when the
            loader's DataFrame is empty.
    """
    if self.df.empty:
        return None
    frame = self.df.copy()
    duplicate_mask = frame.duplicated(subset=subset)
    return frame[duplicate_mask]

`to_csv(path: Path | str, **kwargs) -> None` ¤

Save the data to a csv file.

Parameters:

Name	Type	Description	Default
`path`	`Path \| str`	The path to save the csv file.	required

Source code in lexos/io/base_loader.py

@validate_call(config=model_config)
def to_csv(self, path: Path | str, **kwargs) -> None:
    """Write the loader's DataFrame to a csv file.

    Args:
        path (Path | str): Destination of the csv file.
        **kwargs: Extra keyword arguments forwarded unchanged to
            `DataFrame.to_csv`.
    """
    frame = self.df
    frame.to_csv(path, **kwargs)

`to_excel(path: Path | str, **kwargs) -> None` ¤

Save the data to an Excel file.

Parameters:

Name	Type	Description	Default
`path`	`Path \| str`	The path to save the Excel file.	required

Source code in lexos/io/base_loader.py

@validate_call(config=model_config)
def to_excel(self, path: Path | str, **kwargs) -> None:
    """Save the data to an Excel file.

    Args:
        path (Path | str): The path to save the Excel file.
        **kwargs: Additional keyword arguments forwarded unchanged to
            `DataFrame.to_excel`.
    """
    # Must be to_excel (not to_csv) so the output is a real workbook rather
    # than CSV text written to the given path.
    self.df.to_excel(path, **kwargs)

`to_json(path: Path | str, **kwargs) -> None` ¤

Save the data to a json file.

Parameters:

Name	Type	Description	Default
`path`	`Path \| str`	The path to save the json file.	required

Source code in lexos/io/base_loader.py

@validate_call(config=model_config)
def to_json(self, path: Path | str, **kwargs) -> None:
    """Write the loader's DataFrame to a json file.

    Args:
        path (Path | str): Destination of the json file.
        **kwargs: Extra keyword arguments forwarded unchanged to
            `DataFrame.to_json`.
    """
    frame = self.df
    frame.to_json(path, **kwargs)

Data Loader¤

The DataLoader Class¤

Dataset dataclass ¤

DataLoader pydantic-model ¤

data: dict[str, list] property ¤

df: pd.DataFrame property ¤

errors: list = [] pydantic-field ¤

mime_types: list = [] pydantic-field ¤

names: list = [] pydantic-field ¤

paths: list = [] pydantic-field ¤

records: list[dict[str, str]] property ¤

texts: list = [] pydantic-field ¤

__init__() ¤

__iter__() -> Generator[Dataset, None, None] ¤

dedupe(subset: Optional[list[str]] = None) -> pd.DataFrame ¤

load_csv(path: io.StringIO | os.PathLike | Path | str, name_col: Optional[str] = 'name', text_col: Optional[str] = 'text', **kwargs) -> None ¤

load_dataset(dataset: Self) -> None ¤

load_excel(path: Path | str, name_col: str, text_col: str, **kwargs) -> None ¤

load_json(path: io.StringIO | os.PathLike | Path | str, name_field: Optional[str] = 'name', text_field: Optional[str] = 'text', **kwargs) -> None ¤

load_lineated_text(path: io.StringIO | os.PathLike | Path | str, names: Optional[list[str]] = None, start: Optional[int] = 1, zero_pad: Optional[str] = '03') -> None ¤

reset() -> None ¤

show_duplicates(subset: Optional[list[str]] = None) -> pd.DataFrame | None ¤

to_csv(path: Path | str, **kwargs) -> None ¤

to_excel(path: Path | str, **kwargs) -> None ¤

to_json(path: Path | str, **kwargs) -> None ¤

__init__() ¤

__iter__() -> Generator[Dataset, None, None] ¤

_update_data(path: Path | str, df: pd.DataFrame, mime_type: str = 'text/plain') -> None ¤

load_csv(path: io.StringIO | os.PathLike | Path | str, name_col: Optional[str] = 'name', text_col: Optional[str] = 'text', **kwargs) -> None ¤

load_dataset(dataset: Self) -> None ¤

load_excel(path: Path | str, name_col: str, text_col: str, **kwargs) -> None ¤

load_json(path: io.StringIO | os.PathLike | Path | str, name_field: Optional[str] = 'name', text_field: Optional[str] = 'text', **kwargs) -> None ¤

load_lineated_text(path: io.StringIO | os.PathLike | Path | str, names: Optional[list[str]] = None, start: Optional[int] = 1, zero_pad: Optional[str] = '03') -> None ¤

show_duplicates(subset: Optional[list[str]] = None) -> pd.DataFrame | None ¤

to_csv(path: Path | str, **kwargs) -> None ¤

to_excel(path: Path | str, **kwargs) -> None ¤

to_json(path: Path | str, **kwargs) -> None ¤

The `DataLoader` Class¤

`Dataset` `dataclass` ¤

`DataLoader` `pydantic-model` ¤

`data: dict[str, list]` `property` ¤

`df: pd.DataFrame` `property` ¤

`errors: list = []` `pydantic-field` ¤

`mime_types: list = []` `pydantic-field` ¤

`names: list = []` `pydantic-field` ¤

`paths: list = []` `pydantic-field` ¤

`records: list[dict[str, str]]` `property` ¤

`texts: list = []` `pydantic-field` ¤

`__init__()` ¤

`__iter__() -> Generator[Dataset, None, None]` ¤

`dedupe(subset: Optional[list[str]] = None) -> pd.DataFrame` ¤

`load_csv(path: io.StringIO | os.PathLike | Path | str, name_col: Optional[str] = 'name', text_col: Optional[str] = 'text', **kwargs) -> None` ¤

`load_dataset(dataset: Self) -> None` ¤

`load_excel(path: Path | str, name_col: str, text_col: str, **kwargs) -> None` ¤

`load_json(path: io.StringIO | os.PathLike | Path | str, name_field: Optional[str] = 'name', text_field: Optional[str] = 'text', **kwargs) -> None` ¤

`load_lineated_text(path: io.StringIO | os.PathLike | Path | str, names: Optional[list[str]] = None, start: Optional[int] = 1, zero_pad: Optional[str] = '03') -> None` ¤

`reset() -> None` ¤

`show_duplicates(subset: Optional[list[str]] = None) -> pd.DataFrame | None` ¤

`to_csv(path: Path | str, **kwargs) -> None` ¤

`to_excel(path: Path | str, **kwargs) -> None` ¤

`to_json(path: Path | str, **kwargs) -> None` ¤

`__init__()` ¤

`__iter__() -> Generator[Dataset, None, None]` ¤

`_update_data(path: Path | str, df: pd.DataFrame, mime_type: str = 'text/plain') -> None` ¤

`load_csv(path: io.StringIO | os.PathLike | Path | str, name_col: Optional[str] = 'name', text_col: Optional[str] = 'text', **kwargs) -> None` ¤

`load_dataset(dataset: Self) -> None` ¤

`load_excel(path: Path | str, name_col: str, text_col: str, **kwargs) -> None` ¤

`load_json(path: io.StringIO | os.PathLike | Path | str, name_field: Optional[str] = 'name', text_field: Optional[str] = 'text', **kwargs) -> None` ¤

`load_lineated_text(path: io.StringIO | os.PathLike | Path | str, names: Optional[list[str]] = None, start: Optional[int] = 1, zero_pad: Optional[str] = '03') -> None` ¤

`show_duplicates(subset: Optional[list[str]] = None) -> pd.DataFrame | None` ¤

`to_csv(path: Path | str, **kwargs) -> None` ¤

`to_excel(path: Path | str, **kwargs) -> None` ¤

`to_json(path: Path | str, **kwargs) -> None` ¤