
base_loader¤

The BaseLoader Class¤

The BaseLoader class is the abstract base class for all loaders in the IO module. New loader classes should subclass BaseLoader and implement its abstract load_dataset method, as in the sketch below.
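
As a minimal sketch (not part of the library), the hypothetical PlainTextLoader below subclasses BaseLoader and implements load_dataset for an iterable of file paths; the class name, the assumed shape of dataset, and the hard-coded mime type are illustrative assumptions only.

from pathlib import Path

from lexos.io.base_loader import BaseLoader


class PlainTextLoader(BaseLoader):
    """Hypothetical loader that reads plain-text files from disk."""

    def load_dataset(self, dataset) -> None:
        """Load a dataset given as an iterable of file paths (an assumption of this sketch)."""
        for item in dataset:
            path = Path(item)
            try:
                text = path.read_text(encoding="utf-8")
            except OSError as e:
                self.errors.append(f"{path}: {e}")
                continue
            # Append to all four record lists together so their lengths stay in sync.
            self.paths.append(str(path))
            self.mime_types.append("text/plain")
            self.names.append(path.stem)
            self.texts.append(text)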

BaseLoader pydantic-model ¤

Bases: BaseModel, ABC

BaseLoader.

Config:

  • arbitrary_types_allowed: True

Fields:

  • errors (list)
  • mime_types (list)
  • names (list)
  • paths (list)
  • texts (list)

Source code in lexos/io/base_loader.py
class BaseLoader(BaseModel, ABC):
    """BaseLoader."""

    paths: list = Field(default=[], description="The list of paths.")
    mime_types: list = Field(default=[], description="The list of text mime types.")
    names: list = Field(default=[], description="The list of text names.")
    texts: list = Field(default=[], description="The list of loaded texts.")
    errors: list = Field(default=[], description="The list of loading errors.")

    model_config = ConfigDict(arbitrary_types_allowed=True)

    def __iter__(self) -> Generator[dict, None, None]:
        """Iterate through the records."""
        return (record for record in self.records)

    @property
    def data(self) -> dict[str, list]:
        """Get the data as a dictionary.

        Returns:
            dict[str, list]: A dictionary containing the paths, mime_types, names, texts, and errors.
        """
        return {
            "paths": self.paths,
            "mime_types": self.mime_types,
            "names": self.names,
            "texts": self.texts,
            "errors": self.errors,
        }

    @property
    def df(self) -> pd.DataFrame:
        """Get a pandas DataFrame of file records.

        Returns:
            pandas.DataFrame: A DataFrame containing file metadata and content.
        """
        return pd.DataFrame(self.records)

    @property
    def records(self) -> list[dict[str, str]]:
        """Get a list of file records.

        Returns:
            list[dict]: List of dictionaries containing file metadata and content.
            Each dict has keys: path, mime_type, name, text

        Raises:
            LexosException: If the lengths of paths, mime_types, names, and texts don't match.

        Note:
            Validates that all lists have the same length before returning the records.
        """
        if not (
            len(self.paths)
            == len(self.mime_types)
            == len(self.names)
            == len(self.texts)
        ):
            raise LexosException("Mismatched lengths in file records data")

        return [
            {"name": name, "path": path, "mime_type": mime_type, "text": text}
            for name, path, mime_type, text in zip(
                self.names, self.paths, self.mime_types, self.texts
            )
        ]

    # Abstract method, skipped for coverage
    @validate_call(config=model_config)  # pragma: no cover
    @abstractmethod  # pragma: no cover
    def load_dataset(self, dataset) -> None:  # pragma: no cover
        """Load a dataset.

        Args:
            dataset (DataLoader): The dataset to load.
        """
        ...

    @validate_call(config=model_config)
    def dedupe(self, subset: Optional[list[str]] = None) -> pd.DataFrame:
        """Deduplicate a DataFrame.

        Args:
            subset (Optional[list[str]]): The columns to consider for deduplication.

        Returns:
            pd.DataFrame: The deduplicated DataFrame.
        """
        if not self.df.empty:
            df = self.df.copy()
            df.drop_duplicates(
                subset=subset, keep="first", inplace=True, ignore_index=True
            )
            self.paths = df["path"].tolist()
            self.mime_types = df["mime_type"].tolist()
            self.names = df["name"].tolist()
            self.texts = df["text"].tolist()
        return self.df

    @validate_call(config=model_config)
    def show_duplicates(
        self, subset: Optional[list[str]] = None
    ) -> pd.DataFrame | None:
        """Show duplicates in a DataFrame.

        Args:
            subset (Optional[list[str]]): The columns to consider when checking for duplicates.

        Returns:
            pd.DataFrame | None: The rows that are duplicates, or None if no texts are loaded.
        """
        if not self.df.empty:
            df = self.df.copy()
            return df[df.duplicated(subset=subset)]
        return None

    @validate_call(config=model_config)
    def to_csv(self, path: Path | str, **kwargs) -> None:
        """Save the data to a csv file.

        Args:
            path (Path | str): The path to save the csv file.
        """
        self.df.to_csv(path, **kwargs)

    @validate_call(config=model_config)
    def to_excel(self, path: Path | str, **kwargs) -> None:
        """Save the data to an Excel file.

        Args:
            path (Path | str): The path to save the Excel file.
        """
        self.df.to_excel(path, **kwargs)

    @validate_call(config=model_config)
    def to_json(self, path: Path | str, **kwargs) -> None:
        """Save the data to a json file.

        Args:
            path (Path | str): The path to save the JSON file.
        """
        self.df.to_json(path, **kwargs)

    @validate_call(config=model_config)
    def reset(self) -> None:
        """Reset the class attributes to empty lists."""
        self.paths = []
        self.mime_types = []
        self.names = []
        self.texts = []
        self.errors = []

data: dict[str, list] property ¤

Get the data as a dictionary.

Returns:

Type Description
dict[str, list]

dict[str, list]: A dictionary containing the paths, mime_types, names, texts, and errors.

df: pd.DataFrame property ¤

Get a pandas DataFrame of file records.

Returns:

Type Description
DataFrame

pandas.DataFrame: A DataFrame containing file metadata and content.
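
A short usage sketch, reusing the hypothetical PlainTextLoader from the top of this page, showing the same loaded records read back through data and df (the file paths are made up):

loader = PlainTextLoader()
loader.load_dataset(["corpus/a.txt", "corpus/b.txt"])

loader.data["names"]   # e.g. ["a", "b"]
loader.df              # DataFrame with columns: name, path, mime_type, text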

errors: list = [] pydantic-field ¤

The list of loading errors.

mime_types: list = [] pydantic-field ¤

The list of text mime types.

names: list = [] pydantic-field ¤

The list of text names.

paths: list = [] pydantic-field ¤

The list of paths.

records: list[dict[str, str]] property ¤

Get a list of file records.

Returns:

Type Description
list[dict[str, str]]

List of dictionaries containing file metadata and content. Each dict has keys: path, mime_type, name, text.

Raises:

Type Description
LexosException

If the lengths of paths, mime_types, names, and texts don't match.

Note

Validates that all lists have the same length before returning the records.

texts: list = [] pydantic-field ¤

The list of loaded texts.

__iter__() -> Generator[dict, None, None] ¤

Iterate through the records.

Source code in lexos/io/base_loader.py
def __iter__(self) -> Generator[dict, None, None]:
    """Iterate through the records."""
    return (record for record in self.records)
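
Because __iter__ yields the same dictionaries as records, a loader can be looped over directly. A brief sketch, again assuming the hypothetical loader above has been populated:

for record in loader:
    print(record["name"], record["mime_type"], len(record["text"]))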

dedupe(subset: Optional[list[str]] = None) -> pd.DataFrame ¤

Deduplicate a DataFrame.

Parameters:

Name Type Description Default
subset Optional[list[str]]

The columns to consider for deduplication.

None

Returns:

Type Description
DataFrame

pd.DataFrame: The deduplicated DataFrame.

Source code in lexos/io/base_loader.py
@validate_call(config=model_config)
def dedupe(self, subset: Optional[list[str]] = None) -> pd.DataFrame:
    """Deduplicate a DataFrame.

    Args:
        subset (Optional[list[str]]): The columns to consider for deduplication.

    Returns:
        pd.DataFrame: The deduplicated DataFrame.
    """
    if not self.df.empty:
        df = self.df.copy()
        df.drop_duplicates(
            subset=subset, keep="first", inplace=True, ignore_index=True
        )
        self.paths = df["path"].tolist()
        self.mime_types = df["mime_type"].tolist()
        self.names = df["name"].tolist()
        self.texts = df["text"].tolist()
    return self.df
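
A brief usage sketch, assuming the hypothetical loader above has been populated with some duplicate files; deduplicating on the text column keeps the first copy of each distinct text and rewrites the loader's paths, mime_types, names, and texts lists in place:

loader.dedupe(subset=["text"])
len(loader.texts)  # now reflects the deduplicated records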

load_dataset(dataset) -> None abstractmethod ¤

Load a dataset.

Parameters:

Name Type Description Default
dataset DataLoader

The dataset to load.

required
Source code in lexos/io/base_loader.py
@validate_call(config=model_config)  # pragma: no cover
@abstractmethod  # pragma: no cover
def load_dataset(self, dataset) -> None:  # pragma: no cover
    """Load a dataset.

    Args:
        dataset (DataLoader): The dataset to load.
    """
    ...
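
Concrete loaders are expected to override this method; the hypothetical PlainTextLoader sketch near the top of this page shows one possible implementation.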

reset() -> None ¤

Reset the class attributes to empty lists.

Source code in lexos/io/base_loader.py
@validate_call(config=model_config)
def reset(self) -> None:
    """Reset the class attributes to empty lists."""
    self.paths = []
    self.mime_types = []
    self.names = []
    self.texts = []
    self.errors = []

show_duplicates(subset: Optional[list[str]] = None) -> pd.DataFrame | None ¤

Show duplicates in a DataFrame.

Parameters:

Name Type Description Default
subset Optional[list[str]]

The columns to consider for checking duplicates.

None

Returns:

Type Description
DataFrame | None

pd.DataFrame | None: The rows that are duplicates, or None if no texts are loaded.

Source code in lexos/io/base_loader.py
@validate_call(config=model_config)
def show_duplicates(
    self, subset: Optional[list[str]] = None
) -> pd.DataFrame | None:
    """Show duplicates in a DataFrame.

    Args:
        subset (Optional[list[str]]): The columns to consider when checking for duplicates.

    Returns:
        pd.DataFrame | None: The rows that are duplicates, or None if no texts are loaded.
    """
    if not self.df.empty:
        df = self.df.copy()
        return df[df.duplicated(subset=subset)]
    return None
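
To inspect duplicates without removing them, a sketch along the same lines as the dedupe example above:

dupes = loader.show_duplicates(subset=["text"])
if dupes is not None:
    print(dupes[["name", "path"]])  # rows whose text repeats an earlier row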

to_csv(path: Path | str, **kwargs) -> None ¤

Save the data to a csv file.

Parameters:

Name Type Description Default
path Path | str

The path to save the csv file.

required
Source code in lexos/io/base_loader.py
@validate_call(config=model_config)
def to_csv(self, path: Path | str, **kwargs) -> None:
    """Save the data to a csv file.

    Args:
        path (Path | str): The path to save the csv file.
    """
    self.df.to_csv(path, **kwargs)

to_excel(path: Path | str, **kwargs) -> None ¤

Save the data to an Excel file.

Parameters:

Name Type Description Default
path Path | str

The path to save the Excel file.

required
Source code in lexos/io/base_loader.py
@validate_call(config=model_config)
def to_excel(self, path: Path | str, **kwargs) -> None:
    """Save the data to an Excel file.

    Args:
        path (Path | str): The path to save the Excel file.
    """
    self.df.to_excel(path, **kwargs)

to_json(path: Path | str, **kwargs) -> None ¤

Save the data to a json file.

Parameters:

Name Type Description Default
path Path | str

The path to save the JSON file.

required
Source code in lexos/io/base_loader.py
@validate_call(config=model_config)
def to_json(self, path: Path | str, **kwargs) -> None:
    """Save the data to a json file.

    Args:
        path (Path | str): The path to save the JSON file.
    """
    self.df.to_json(path, **kwargs)
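
All three export methods forward **kwargs to the corresponding pandas writer, so the usual pandas options apply. A short sketch with made-up file names (writing .xlsx typically requires an Excel engine such as openpyxl to be installed):

loader.to_csv("corpus.csv", index=False)
loader.to_json("corpus.json", orient="records", force_ascii=False)
loader.to_excel("corpus.xlsx", index=False)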
