Skip to content

loader¤

The Loader Class¤

The Loader class is the main class for loading files in various formats. It tries to detect each file's format automatically, so that you can use a common interface to load content regardless of its source.

VALID_FILE_TYPES = {*TEXT_TYPES, *PDF_TYPES, *DOCX_TYPES, *ZIP_TYPES} module-attribute ¤

Loader pydantic-model ¤

Bases: BaseLoader

Loader.

Config:

  • arbitrary_types_allowed: True

Fields:

Source code in lexos/io/loader.py
class Loader(BaseLoader):
    """Load texts from files, directories, zip archives and in-memory strings.

    Every loaded text adds one entry to each of `paths`, `names`,
    `mime_types` and `texts`, so the four lists stay the same length.
    Problems met while reading files are appended to `errors` rather than
    raised, so one bad input does not abort the rest of a batch.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    def __init__(self) -> None:
        """Initialize the Loader by delegating to the base class initializer."""
        super().__init__()

    def _get_mime_type(
        self, path: Path | str, file_start: bytes | str
    ) -> str | None:
        """Get the mime type of a file.

        Args:
            path (Path | str): The path to the file. Its suffix is checked for ".pickle" and is used for the `mimetypes` fallback.
            file_start (bytes | str): The leading content of the file, passed to `puremagic.magic_string`.

        Returns:
            str | None: The mime type of the file, or None if it cannot be determined.
        """
        if Path(path).suffix == ".pickle":
            return "application/vnd.python.pickle"
        try:
            results = puremagic.magic_string(file_start, path)
        except (puremagic.PureError, ValueError):
            # NOTE(review): puremagic is understood to raise PureError for
            # content it cannot identify and ValueError for empty input;
            # both are treated as "no match" here. Confirm against the
            # installed puremagic version.
            results = []
        if not results:
            return None
        mime_type = results[0].mime_type
        if not mime_type:
            # The best match carries no mime type; fall back to the extension.
            mime_type, _ = mimetypes.guess_type(path)
        return mime_type

    def _load_docx_file(self, path: Path | str) -> None:
        """Load a docx file as a single text.

        The paragraphs are joined with newlines. Any error is appended to
        `errors` and nothing else is stored.

        Args:
            path (Path | str): The path to the file.

        Note:
            Consider https://github.com/ShayHill/docx2python for greater coverage.
        """
        try:
            doc = Document(path)
            text = "\n".join(decode(p.text) for p in doc.paragraphs)
            file_name = Path(path).name
        except Exception as e:
            self.errors.append(e)
            return
        # Store the file name in `paths`, as `_load_text_file` does, so the
        # four parallel lists keep the same length.
        self.paths.append(file_name)
        self.names.append(file_name)
        self.mime_types.append("application/docx")
        self.texts.append(text)

    def _load_pdf_file(self, path: Path | str) -> None:
        """Load a pdf file, storing one text per page.

        Pages extracted before an error occurs are kept; the error itself is
        appended to `errors`.

        Args:
            path (Path | str): The path to the file.
        """
        try:
            file_name = Path(path).name
            reader = PdfReader(path)
            for page in reader.pages:
                text = decode(page.extract_text())
                # Append to all four lists together so they stay aligned.
                self.paths.append(file_name)
                self.names.append(file_name)
                self.mime_types.append("application/pdf")
                self.texts.append(text)
        except Exception as e:
            self.errors.append(e)

    def _load_text_file(self, path: Path | str, mime_type: str) -> None:
        """Load a text file.

        The file is read as bytes and passed through `decode`. Any error is
        appended to `errors` and nothing else is stored.

        Args:
            path (Path | str): The path to the file.
            mime_type (str): The mime type of the file.
        """
        try:
            with open(path, "rb") as f:
                text = decode(f.read())
        except Exception as e:
            self.errors.append(e)
            return
        self.paths.append(Path(path).name)
        self.names.append(Path(path).stem)
        self.mime_types.append(mime_type)
        self.texts.append(text)

    def _load_zip_file(self, path: Path | str) -> None:
        """Load every supported member of a zip archive.

        Each stored member gets the path "<archive path>/<member name>".
        Members that cannot be read, identified or decoded are recorded in
        `errors` and skipped, as is an archive that cannot be opened.

        Args:
            path (Path | str): The path to the file.
        """
        try:
            archive = zipfile.ZipFile(path)
        except (OSError, zipfile.BadZipFile) as e:
            self.errors.append(e)
            return
        archive_path = Path(path).as_posix()
        with archive:
            for info in archive.infolist():
                if info.is_dir():
                    # Directory entries have no content to load.
                    continue
                try:
                    # Get the mime type of the file
                    file_bytes = archive.read(info.filename)
                    file_start = decode(file_bytes[:MIN_ENCODING_DETECT])
                    mime_type = self._get_mime_type(info.filename, file_start)
                except Exception as e:
                    self.errors.append(e)
                    continue
                if mime_type not in VALID_FILE_TYPES:
                    self.errors.append(
                        f"Invalid MIME type: {mime_type} for file {info.filename}."
                    )
                    continue
                # NOTE(review): VALID_FILE_TYPES also contains the pdf, docx
                # and zip types, so such members are decoded as raw text here
                # rather than parsed. Confirm whether that is intended.
                try:
                    text = decode(file_bytes)
                except Exception as e:
                    self.errors.append(e)
                    continue
                self.paths.append(archive_path + "/" + info.filename)
                self.names.append(Path(info.filename).stem)
                self.mime_types.append(mime_type)
                self.texts.append(text)

    # Not wrapped in @validate_call: see the Note in the docstring.
    def load_dataset(self, dataset: Self) -> None:
        """Load a dataset.

        Appends the dataset's paths, mime types, names and texts to this
        loader's lists.

        Args:
            dataset (DataLoader): The dataset to load.

        Raises:
            LexosException: If `dataset` is not a `DataLoader` instance.

        Note: As of v2.10.5, Pydantic does not support recursive types (Self).
            As a result, this method performs its own check to see if the
            value of `dataset` is of type `DataLoader`.
        """
        if not isinstance(dataset, DataLoader):
            raise LexosException("Invalid dataset type.")
        self.paths = self.paths + dataset.paths
        self.mime_types = self.mime_types + dataset.mime_types
        self.names = self.names + dataset.names
        self.texts = self.texts + dataset.texts

    @validate_call(config=model_config)
    def load(self, paths: Path | str | list[Path | str]) -> None:
        """Load a path or a list of paths.

        Directories are searched recursively. Each file is dispatched to the
        loader for its detected mime type; files that cannot be read or
        whose type is not supported are recorded in `errors`.

        Args:
            paths (Path | str | list[Path | str]): The path or list of paths to load.
        """
        for path in ensure_list(paths):
            if Path(path).is_dir():
                # rglob already descends into subdirectories, so pass on only
                # the files. Recursing into the subdirectories it also yields
                # would load their contents a second time.
                self.load([p for p in Path(path).rglob("*") if p.is_file()])
                continue
            # Get the mime type of the file
            try:
                with open(path, "rb") as f:
                    file_start = f.read(FILE_START)
                mime_type = self._get_mime_type(path, file_start)
            except IOError as e:
                # Record the read failure once, without a second
                # "Invalid MIME type: None" entry for the same file.
                self.errors.append(e)
                continue
            if mime_type in TEXT_TYPES:
                self._load_text_file(path, mime_type)
            elif mime_type in PDF_TYPES:
                self._load_pdf_file(path)
            elif mime_type in DOCX_TYPES:
                self._load_docx_file(path)
            elif mime_type in ZIP_TYPES:
                self._load_zip_file(path)
            else:
                self.errors.append(f"Invalid MIME type: {mime_type} for file {path}.")

    @validate_call(config=model_config)
    def loads(
        self,
        texts: Optional[list[Path | str]] = None,
        names: Optional[list[str]] = None,
        start: Optional[int] = 1,
        zero_pad: Optional[str] = "03",
    ) -> None:
        """Load a list of texts.

        Each text is stored with mime type "text/plain" and a path of None,
        because in-memory texts have no source file.

        Args:
            texts (Optional[list[Path | str]]): The list of texts to load.
            names (Optional[list[str]]): The list of names for the texts.
            start (Optional[int]): The starting index for the names if no list is provided.
            zero_pad (Optional[str]): The zero padding for the names increments if no list is provided.

        Raises:
            LexosException: If `names` is given and its length differs from that of `texts`.
        """
        texts = [] if texts is None else ensure_list(texts)
        if names is None:
            names = [f"text{i + start:{zero_pad}d}" for i in range(len(texts))]
        elif len(names) != len(texts):
            raise LexosException(
                f"Got {len(names)} names for {len(texts)} texts; the lists must be the same length."
            )
        for name, text in zip(names, texts):
            # Keep `paths` aligned with the other lists.
            self.paths.append(None)
            self.names.append(name)
            self.mime_types.append("text/plain")
            self.texts.append(text)

data: dict[str, list] property ¤

Get the data as a dictionary.

Returns:

Type Description
dict[str, list]

dict[str, list]: A dictionary containing the paths, mime_types, names, texts, and errors.

df: pd.DataFrame property ¤

Get a pandas DataFrame of file records.

Returns:

Type Description
DataFrame

pandas.DataFrame: A DataFrame containing file metadata and content.

errors: list = [] pydantic-field ¤

The list of loading errors.

mime_types: list = [] pydantic-field ¤

The list of text mime types.

names: list = [] pydantic-field ¤

The list of text names.

paths: list = [] pydantic-field ¤

The list of paths.

records: list[dict[str, str]] property ¤

Get a list of file records.

Returns:

Type Description
list[dict[str, str]]

list[dict]: List of dictionaries containing file metadata and content.

list[dict[str, str]]

Each dict has keys: path, mime_type, name, text

Raises:

Type Description
ValueError

If the lengths of paths, mime_types, names and texts don't match.

Note

Validates that all lists have the same length before returning the records.

texts: list = [] pydantic-field ¤

The list of loaded texts.

__init__() ¤

Initialize the Loader.

Source code in lexos/io/loader.py
def __init__(self) -> None:
    """Initialize the Loader by delegating to the base class initializer."""
    super().__init__()

__iter__() -> Generator[dict, None, None] ¤

Iterate through the records.

Source code in lexos/io/base_loader.py
def __iter__(self) -> Generator[dict, None, None]:
    """Return a generator that yields each record in `self.records` in order."""
    records = self.records
    return (entry for entry in records)

dedupe(subset: Optional[list[str]] = None) -> pd.DataFrame ¤

Deduplicate a DataFrame.

Parameters:

Name Type Description Default
subset Optional[list[str]]

The columns to consider for deduplication.

None

Returns:

Type Description
DataFrame

pd.DataFrame: The deduplicated DataFrame.

Source code in lexos/io/base_loader.py
@validate_call(config=model_config)
def dedupe(self, subset: Optional[list[str]] = None) -> pd.DataFrame:
    """Remove duplicate records, keeping the first occurrence of each.

    The `paths`, `mime_types`, `names` and `texts` lists are replaced with
    the deduplicated values. The `errors` list is left untouched.

    Args:
        subset (Optional[list[str]]): The columns to consider for deduplication.

    Returns:
        pd.DataFrame: The deduplicated DataFrame (the unchanged DataFrame if it is empty).
    """
    df = self.df
    if df.empty:
        return df
    # drop_duplicates returns a new frame, so no defensive copy is needed.
    df = df.drop_duplicates(subset=subset, keep="first", ignore_index=True)
    self.paths = df["path"].tolist()
    self.mime_types = df["mime_type"].tolist()
    self.names = df["name"].tolist()
    self.texts = df["text"].tolist()
    return df

load(paths: Path | str | list[Path | str]) -> None ¤

Load a list of paths.

Parameters:

Name Type Description Default
paths Path | str | list[Path | str]

The list of paths to load.

required
Source code in lexos/io/loader.py
@validate_call(config=model_config)
def load(self, paths: Path | str | list[Path | str]) -> None:
    """Load a path or a list of paths.

    Directories are searched recursively. Each file is dispatched to the
    loader for its detected mime type; files that cannot be read or whose
    type is not supported are recorded in `errors`.

    Args:
        paths (Path | str | list[Path | str]): The path or list of paths to load.
    """
    for path in ensure_list(paths):
        if Path(path).is_dir():
            # rglob already descends into subdirectories, so pass on only
            # the files. Recursing into the subdirectories it also yields
            # would load their contents a second time.
            self.load([p for p in Path(path).rglob("*") if p.is_file()])
            continue
        # Get the mime type of the file
        try:
            with open(path, "rb") as f:
                file_start = f.read(FILE_START)
            mime_type = self._get_mime_type(path, file_start)
        except IOError as e:
            # Record the read failure once, without a second
            # "Invalid MIME type: None" entry for the same file.
            self.errors.append(e)
            continue
        if mime_type in TEXT_TYPES:
            self._load_text_file(path, mime_type)
        elif mime_type in PDF_TYPES:
            self._load_pdf_file(path)
        elif mime_type in DOCX_TYPES:
            self._load_docx_file(path)
        elif mime_type in ZIP_TYPES:
            self._load_zip_file(path)
        else:
            self.errors.append(f"Invalid MIME type: {mime_type} for file {path}.")

load_dataset(dataset: Self) -> None ¤

Load a dataset.

Parameters:

Name Type Description Default
dataset DataLoader

The dataset to load.

required
As of v2.10.5, Pydantic does not support recursive types (Self).

As a result, this method performs its own check to see if the value of dataset is of type DataLoader.

Source code in lexos/io/loader.py
def load_dataset(self, dataset: Self) -> None:
    """Append the contents of another dataset to this loader.

    Args:
        dataset (DataLoader): The dataset whose paths, mime types, names and texts are appended.

    Raises:
        LexosException: If `dataset` is not a `DataLoader` instance.

    Note: As of v2.10.5, Pydantic does not support recursive types (Self).
        As a result, this method performs its own check to see if the
        value of `dataset` is of type `DataLoader`.
    """
    if isinstance(dataset, DataLoader):
        # Concatenate each list in turn, in the same order every time.
        for field in ("paths", "mime_types", "names", "texts"):
            combined = getattr(self, field) + getattr(dataset, field)
            setattr(self, field, combined)
        return
    raise LexosException("Invalid dataset type.")

loads(texts: Optional[list[Path | str]] = None, names: Optional[list[str]] = None, start: Optional[int] = 1, zero_pad: Optional[str] = '03') -> None ¤

Load a list of texts.

Parameters:

Name Type Description Default
texts Optional[list[Path | str]]

The list of texts to load.

None
names Optional[list[str]]

The list of names for the texts.

None
start Optional[int]

The starting index for the names if no list is provided.

1
zero_pad Optional[str]

The zero padding for the names increments if no list is provided.

'03'
Source code in lexos/io/loader.py
@validate_call(config=model_config)
def loads(
    self,
    texts: Optional[list[Path | str]] = None,
    names: Optional[list[str]] = None,
    start: Optional[int] = 1,
    zero_pad: Optional[str] = "03",
) -> None:
    """Load a list of texts.

    Each text is stored with mime type "text/plain" and a path of None,
    because in-memory texts have no source file.

    Args:
        texts (Optional[list[Path | str]]): The list of texts to load.
        names (Optional[list[str]]): The list of names for the texts.
        start (Optional[int]): The starting index for the names if no list is provided.
        zero_pad (Optional[str]): The zero padding for the names increments if no list is provided.

    Raises:
        LexosException: If `names` is given and its length differs from that of `texts`.
    """
    texts = [] if texts is None else ensure_list(texts)
    if names is None:
        names = [f"text{i + start:{zero_pad}d}" for i in range(len(texts))]
    elif len(names) != len(texts):
        raise LexosException(
            f"Got {len(names)} names for {len(texts)} texts; the lists must be the same length."
        )
    for name, text in zip(names, texts):
        # Keep `paths` aligned with the other lists.
        self.paths.append(None)
        self.names.append(name)
        self.mime_types.append("text/plain")
        self.texts.append(text)

reset() -> None ¤

Reset the class attributes to empty lists.

Source code in lexos/io/base_loader.py
@validate_call(config=model_config)
def reset(self) -> None:
    """Replace the paths, mime types, names, texts and errors with new empty lists."""
    for field in ("paths", "mime_types", "names", "texts", "errors"):
        setattr(self, field, [])

show_duplicates(subset: Optional[list[str]] = None) -> pd.DataFrame | None ¤

Show duplicates in a DataFrame.

Parameters:

Name Type Description Default
subset Optional[list[str]]

The columns to consider for checking duplicates.

None

Returns:

Type Description
DataFrame | None

pd.DataFrame: The DataFrame with duplicates.

Source code in lexos/io/base_loader.py
@validate_call(config=model_config)
def show_duplicates(
    self, subset: Optional[list[str]] = None
) -> pd.DataFrame | None:
    """Return the rows that `DataFrame.duplicated` marks as duplicates.

    Args:
        subset (Optional[list[str]]): The columns to consider for checking duplicates.

    Returns:
        pd.DataFrame | None: The rows marked as duplicates, or None when the DataFrame is empty.
    """
    if self.df.empty:
        return None
    frame = self.df.copy()
    is_duplicate = frame.duplicated(subset=subset)
    return frame[is_duplicate]

to_csv(path: Path | str, **kwargs) -> None ¤

Save the data to a csv file.

Parameters:

Name Type Description Default
path Path | str

The path to save the csv file.

required
Source code in lexos/io/base_loader.py
@validate_call(config=model_config)
def to_csv(self, path: Path | str, **kwargs) -> None:
    """Write the records DataFrame to a csv file.

    Args:
        path (Path | str): The path to save the csv file.
        **kwargs: Extra keyword arguments forwarded to `DataFrame.to_csv`.
    """
    frame = self.df
    frame.to_csv(path, **kwargs)

to_excel(path: Path | str, **kwargs) -> None ¤

Save the data to an Excel file.

Parameters:

Name Type Description Default
path Path | str

The path to save the Excel file.

required
Source code in lexos/io/base_loader.py
@validate_call(config=model_config)
def to_excel(self, path: Path | str, **kwargs) -> None:
    """Save the data to an Excel file.

    Args:
        path (Path | str): The path to save the Excel file.
        **kwargs: Extra keyword arguments forwarded to `DataFrame.to_excel`.
    """
    # Previously this called to_csv, which wrote csv text to the Excel path.
    self.df.to_excel(path, **kwargs)

to_json(path: Path | str, **kwargs) -> None ¤

Save the data to a json file.

Parameters:

Name Type Description Default
path Path | str

The path to save the json file.

required
Source code in lexos/io/base_loader.py
@validate_call(config=model_config)
def to_json(self, path: Path | str, **kwargs) -> None:
    """Write the records DataFrame to a json file.

    Args:
        path (Path | str): The path to save the json file.
        **kwargs: Extra keyword arguments forwarded to `DataFrame.to_json`.
    """
    frame = self.df
    frame.to_json(path, **kwargs)

__init__() ¤

Initialize the Loader.

Source code in lexos/io/loader.py
def __init__(self) -> None:
    """Initialize the Loader by delegating to the base class initializer."""
    super().__init__()

_get_mime_type(path: Path | str, file_start: str) -> str ¤

Get the mime type of a file.

Parameters:

Name Type Description Default
path Path | str

The path to the file.

required

Returns:

Name Type Description
str str

The mime type of the file.

Source code in lexos/io/loader.py
def _get_mime_type(self, path: Path | str, file_start: str) -> str:
    """Get the mime type of a file.

    Args:
        path (Path | str): The path to the file.

    Returns:
        str: The mime type of the file.
    """
    if Path(path).suffix == ".pickle":
        return "application/vnd.python.pickle"
    results = puremagic.magic_string(file_start, path)
    if not results:
        return None
    else:
        mime_type = results[0].mime_type
        if mime_type == "":
            mime_type, _ = mimetypes.guess_type(path)
        return mime_type

_load_docx_file(path: Path | str) -> None ¤

Load a docx file.

Parameters:

Name Type Description Default
path Path | str

The path to the file.

required
Note

Consider https://github.com/ShayHill/docx2python for greater coverage.

Source code in lexos/io/loader.py
def _load_docx_file(self, path: Path | str) -> None:
    """Load a docx file as a single text.

    The paragraphs are joined with newlines. Any error is appended to
    `errors` and nothing else is stored.

    Args:
        path (Path | str): The path to the file.

    Note:
        Consider https://github.com/ShayHill/docx2python for greater coverage.
    """
    try:
        doc = Document(path)
        text = "\n".join(decode(p.text) for p in doc.paragraphs)
        file_name = Path(path).name
    except Exception as e:
        self.errors.append(e)
        return
    # Store the file name in `paths`, as `_load_text_file` does, so the four
    # parallel lists keep the same length.
    self.paths.append(file_name)
    self.names.append(file_name)
    self.mime_types.append("application/docx")
    self.texts.append(text)

_load_pdf_file(path: Path | str) -> None ¤

Load a pdf file.

Parameters:

Name Type Description Default
path Path | str

The path to the file.

required
Source code in lexos/io/loader.py
def _load_pdf_file(self, path: Path | str) -> None:
    """Load a pdf file, storing one text per page.

    Pages extracted before an error occurs are kept; the error itself is
    appended to `errors`.

    Args:
        path (Path | str): The path to the file.
    """
    try:
        file_name = Path(path).name
        reader = PdfReader(path)
        for page in reader.pages:
            text = decode(page.extract_text())
            # Append to all four lists together so they stay aligned.
            self.paths.append(file_name)
            self.names.append(file_name)
            self.mime_types.append("application/pdf")
            self.texts.append(text)
    except Exception as e:
        self.errors.append(e)

_load_text_file(path: Path | str, mime_type: str) -> None ¤

Load a text file.

Parameters:

Name Type Description Default
path Path | str

The path to the file.

required
mime_type str

The mime type of the file.

required
Source code in lexos/io/loader.py
def _load_text_file(self, path: Path | str, mime_type: str) -> None:
    """Load a text file.

    Args:
        path (Path | str): The path to the file.
        mime_type (str): The mime type of the file.
    """
    try:
        with open(path, "rb") as f:
            text = decode(f.read())
            self.paths.append(Path(path).name)
            self.names.append(Path(path).stem)
            self.mime_types.append(mime_type)
            self.texts.append(text)
    except BaseException as e:
        self.errors.append(e)

_load_zip_file(path: Path | str) -> None ¤

Handle a zip file.

Parameters:

Name Type Description Default
path Path | str

The path to the file.

required
Source code in lexos/io/loader.py
def _load_zip_file(self, path: Path | str) -> None:
    """Load every supported member of a zip archive.

    Each stored member gets the path "<archive path>/<member name>".
    Members that cannot be read, identified or decoded are recorded in
    `errors` and skipped, as is an archive that cannot be opened.

    Args:
        path (Path | str): The path to the file.
    """
    try:
        archive = zipfile.ZipFile(path)
    except (OSError, zipfile.BadZipFile) as e:
        self.errors.append(e)
        return
    archive_path = Path(path).as_posix()
    with archive:
        for info in archive.infolist():
            if info.is_dir():
                # Directory entries have no content to load.
                continue
            try:
                # Get the mime type of the file
                file_bytes = archive.read(info.filename)
                file_start = decode(file_bytes[:MIN_ENCODING_DETECT])
                mime_type = self._get_mime_type(info.filename, file_start)
            except Exception as e:
                self.errors.append(e)
                continue
            if mime_type not in VALID_FILE_TYPES:
                self.errors.append(
                    f"Invalid MIME type: {mime_type} for file {info.filename}."
                )
                continue
            # NOTE(review): VALID_FILE_TYPES also contains the pdf, docx and
            # zip types, so such members are decoded as raw text here rather
            # than parsed. Confirm whether that is intended.
            try:
                text = decode(file_bytes)
            except Exception as e:
                self.errors.append(e)
                continue
            self.paths.append(archive_path + "/" + info.filename)
            self.names.append(Path(info.filename).stem)
            self.mime_types.append(mime_type)
            self.texts.append(text)

load_dataset(dataset: Self) -> None ¤

Load a dataset.

Parameters:

Name Type Description Default
dataset DataLoader

The dataset to load.

required
As of v2.10.5, Pydantic does not support recursive types (Self).

As a result, this method performs its own check to see if the value of dataset is of type DataLoader.

Source code in lexos/io/loader.py
def load_dataset(self, dataset: Self) -> None:
    """Append the contents of another dataset to this loader.

    Args:
        dataset (DataLoader): The dataset whose paths, mime types, names and texts are appended.

    Raises:
        LexosException: If `dataset` is not a `DataLoader` instance.

    Note: As of v2.10.5, Pydantic does not support recursive types (Self).
        As a result, this method performs its own check to see if the
        value of `dataset` is of type `DataLoader`.
    """
    if isinstance(dataset, DataLoader):
        # Concatenate each list in turn, in the same order every time.
        for field in ("paths", "mime_types", "names", "texts"):
            combined = getattr(self, field) + getattr(dataset, field)
            setattr(self, field, combined)
        return
    raise LexosException("Invalid dataset type.")

load(paths: Path | str | list[Path | str]) -> None ¤

Load a list of paths.

Parameters:

Name Type Description Default
paths Path | str | list[Path | str]

The list of paths to load.

required
Source code in lexos/io/loader.py
@validate_call(config=model_config)
def load(self, paths: Path | str | list[Path | str]) -> None:
    """Load a path or a list of paths.

    Directories are searched recursively. Each file is dispatched to the
    loader for its detected mime type; files that cannot be read or whose
    type is not supported are recorded in `errors`.

    Args:
        paths (Path | str | list[Path | str]): The path or list of paths to load.
    """
    for path in ensure_list(paths):
        if Path(path).is_dir():
            # rglob already descends into subdirectories, so pass on only
            # the files. Recursing into the subdirectories it also yields
            # would load their contents a second time.
            self.load([p for p in Path(path).rglob("*") if p.is_file()])
            continue
        # Get the mime type of the file
        try:
            with open(path, "rb") as f:
                file_start = f.read(FILE_START)
            mime_type = self._get_mime_type(path, file_start)
        except IOError as e:
            # Record the read failure once, without a second
            # "Invalid MIME type: None" entry for the same file.
            self.errors.append(e)
            continue
        if mime_type in TEXT_TYPES:
            self._load_text_file(path, mime_type)
        elif mime_type in PDF_TYPES:
            self._load_pdf_file(path)
        elif mime_type in DOCX_TYPES:
            self._load_docx_file(path)
        elif mime_type in ZIP_TYPES:
            self._load_zip_file(path)
        else:
            self.errors.append(f"Invalid MIME type: {mime_type} for file {path}.")

loads(texts: Optional[list[Path | str]] = None, names: Optional[list[str]] = None, start: Optional[int] = 1, zero_pad: Optional[str] = '03') -> None ¤

Load a list of texts.

Parameters:

Name Type Description Default
texts Optional[list[Path | str]]

The list of texts to load.

None
names Optional[list[str]]

The list of names for the texts.

None
start Optional[int]

The starting index for the names if no list is provided.

1
zero_pad Optional[str]

The zero padding for the names increments if no list is provided.

'03'
Source code in lexos/io/loader.py
@validate_call(config=model_config)
def loads(
    self,
    texts: Optional[list[Path | str]] = None,
    names: Optional[list[str]] = None,
    start: Optional[int] = 1,
    zero_pad: Optional[str] = "03",
) -> None:
    """Load a list of texts.

    Each text is stored with mime type "text/plain" and a path of None,
    because in-memory texts have no source file.

    Args:
        texts (Optional[list[Path | str]]): The list of texts to load.
        names (Optional[list[str]]): The list of names for the texts.
        start (Optional[int]): The starting index for the names if no list is provided.
        zero_pad (Optional[str]): The zero padding for the names increments if no list is provided.

    Raises:
        LexosException: If `names` is given and its length differs from that of `texts`.
    """
    texts = [] if texts is None else ensure_list(texts)
    if names is None:
        names = [f"text{i + start:{zero_pad}d}" for i in range(len(texts))]
    elif len(names) != len(texts):
        raise LexosException(
            f"Got {len(names)} names for {len(texts)} texts; the lists must be the same length."
        )
    for name, text in zip(names, texts):
        # Keep `paths` aligned with the other lists.
        self.paths.append(None)
        self.names.append(name)
        self.mime_types.append("text/plain")
        self.texts.append(text)