loader¤

The `Loader` Class¤

The `Loader` class is the main class for loading files in various formats. It tries to detect each file's format automatically, so that you can use a common interface to load content regardless of its source.

`VALID_FILE_TYPES = {*TEXT_TYPES, *PDF_TYPES, *DOCX_TYPES, *ZIP_TYPES}` `module-attribute` ¤

`Loader` `pydantic-model` ¤

Bases: BaseLoader

Loader.

Config:

arbitrary_types_allowed: True

Fields:

paths (list)
mime_types (list)
names (list)
texts (list)
errors (list)

Source code in lexos/io/loader.py

class Loader(BaseLoader):
    """Load texts from files, directories, zip archives, and raw strings.

    Every successfully loaded text adds one entry to each of the parallel
    lists `paths`, `names`, `mime_types`, and `texts`. Problems encountered
    while loading are collected in `errors` instead of being raised.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    def __init__(self):
        """Initialize the Loader."""
        super().__init__()

    def _get_mime_type(
        self, path: Path | str, file_start: bytes | str
    ) -> str | None:
        """Get the mime type of a file.

        Args:
            path (Path | str): The path to the file.
            file_start (bytes | str): The leading content of the file, which is
                handed to `puremagic` for signature detection.

        Returns:
            str | None: The mime type of the file, or None if it could not be
                determined.
        """
        if Path(path).suffix == ".pickle":
            return "application/vnd.python.pickle"
        try:
            # Pass the name as a plain string so detection does not depend on
            # puremagic accepting path objects.
            results = puremagic.magic_string(file_start, str(path))
        except (ValueError, LookupError):
            # NOTE(review): puremagic is understood to raise ValueError for
            # empty input and PureError (a LookupError) for unrecognised
            # content -- confirm against the installed version. Either way the
            # file is reported as unidentified rather than aborting the load.
            return None
        if not results:
            return None
        mime_type = results[0].mime_type
        if mime_type == "":
            # puremagic matched a signature that carries no mime type, so
            # fall back to guessing from the file name.
            mime_type, _ = mimetypes.guess_type(path)
        return mime_type

    def _load_docx_file(self, path: Path | str) -> None:
        """Load a docx file.

        The text of all paragraphs is joined with newlines and stored as a
        single text. Any failure is recorded in `self.errors`.

        Args:
            path (Path | str): The path to the file.

        Note:
            Consider https://github.com/ShayHill/docx2python for greater coverage.
        """
        try:
            doc = Document(path)
            text = "\n".join(decode(p.text) for p in doc.paragraphs)
        except Exception as e:
            self.errors.append(e)
            return
        # Record a path as well so the parallel lists stay the same length
        # (same convention as _load_text_file).
        self.paths.append(Path(path).name)
        # NOTE(review): the text and zip loaders store the stem as the name;
        # the full file name is kept here to preserve existing output.
        self.names.append(Path(path).name)
        self.mime_types.append("application/docx")
        self.texts.append(text)

    def _load_pdf_file(self, path: Path | str) -> None:
        """Load a pdf file.

        Each page becomes a separate text entry. Any failure is recorded in
        `self.errors`; pages extracted before the failure are kept.

        Args:
            path (Path | str): The path to the file.
        """
        file_name = Path(path).name
        try:
            reader = PdfReader(path)
            for page in reader.pages:
                text = decode(page.extract_text())
                # Record a path as well so the parallel lists stay the same
                # length (same convention as _load_text_file).
                self.paths.append(file_name)
                self.names.append(file_name)
                self.mime_types.append("application/pdf")
                self.texts.append(text)
        except Exception as e:
            self.errors.append(e)

    def _load_text_file(self, path: Path | str, mime_type: str) -> None:
        """Load a text file.

        The file is read as bytes and passed through `decode`. Any failure is
        recorded in `self.errors`.

        Args:
            path (Path | str): The path to the file.
            mime_type (str): The mime type of the file.
        """
        try:
            with open(path, "rb") as f:
                text = decode(f.read())
        except Exception as e:
            self.errors.append(e)
            return
        self.paths.append(Path(path).name)
        self.names.append(Path(path).stem)
        self.mime_types.append(mime_type)
        self.texts.append(text)

    def _load_zip_file(self, path: Path | str) -> None:
        """Handle a zip file.

        Every file entry whose detected mime type is in `VALID_FILE_TYPES` is
        decoded and stored; directory entries are skipped. Problems with the
        archive or with individual entries are recorded in `self.errors`.

        Args:
            path (Path | str): The path to the file.
        """
        try:
            with open(path, "rb") as fin, zipfile.ZipFile(fin) as archive:
                for info in archive.infolist():
                    if info.is_dir():
                        # Directory entries carry no content to load.
                        continue
                    try:
                        # Get the mime type of the file
                        file_bytes = archive.read(info.filename)
                        file_start = decode(file_bytes[:MIN_ENCODING_DETECT])
                        mime_type = self._get_mime_type(info.filename, file_start)
                    except (OSError, UnicodeDecodeError, zipfile.BadZipFile) as e:
                        self.errors.append(e)
                        mime_type = None
                    if mime_type not in VALID_FILE_TYPES:
                        self.errors.append(
                            f"Invalid MIME type: {mime_type} for file {info.filename}."
                        )
                        continue
                    # NOTE(review): entries are always decoded as text, even
                    # when the mime type is a pdf/docx/zip type -- confirm
                    # this is intended.
                    try:
                        text = decode(file_bytes)
                    except Exception as e:
                        self.errors.append(e)
                        continue
                    self.paths.append(Path(path).as_posix() + "/" + info.filename)
                    self.names.append(Path(info.filename).stem)
                    self.mime_types.append(mime_type)
                    self.texts.append(text)
        except (OSError, zipfile.BadZipFile) as e:
            # An unreadable or corrupt archive is recorded like any other
            # loading failure instead of aborting the whole load.
            self.errors.append(e)

    # @validate_call(config=model_config)
    def load_dataset(self, dataset: Self) -> None:
        """Load a dataset.

        The dataset's paths, mime types, names, and texts are appended to
        those already held by this loader.

        Args:
            dataset (DataLoader): The dataset to load.

        Raises:
            LexosException: If `dataset` is not a `DataLoader` instance.

        Note: As of v2.10.5, Pydantic does not support recursive types (Self).
            As a result, this method performs its own check to see if the
            value of `dataset` is of type `DataLoader`.
        """
        if not isinstance(dataset, DataLoader):
            raise LexosException("Invalid dataset type.")
        self.paths = self.paths + dataset.paths
        self.mime_types = self.mime_types + dataset.mime_types
        self.names = self.names + dataset.names
        self.texts = self.texts + dataset.texts

    @validate_call(config=model_config)
    def load(self, paths: Path | str | list[Path | str]) -> None:
        """Load a list of paths.

        Directories are searched recursively for files. Each file is
        dispatched to a loader based on its detected mime type; files that
        cannot be read or have an unsupported type are recorded in
        `self.errors`.

        Args:
            paths (Path | str | list[Path | str]): The list of paths to load.
        """
        for path in ensure_list(paths):
            directory = Path(path)
            if directory.is_dir():
                # rglob already descends into sub-directories, so recurse with
                # files only (otherwise nested files are loaded repeatedly)
                # and do not try to open the directory itself afterwards.
                self.load(sorted(p for p in directory.rglob("*") if p.is_file()))
                continue
            # Get the mime type of the file
            try:
                with open(path, "rb") as f:
                    file_start = f.read(FILE_START)
                mime_type = self._get_mime_type(path, file_start)
            except IOError as e:
                self.errors.append(e)
                mime_type = None
            if mime_type in TEXT_TYPES:
                self._load_text_file(path, mime_type)
            elif mime_type in PDF_TYPES:
                self._load_pdf_file(path)
            elif mime_type in DOCX_TYPES:
                self._load_docx_file(path)
            elif mime_type in ZIP_TYPES:
                self._load_zip_file(path)
            else:
                self.errors.append(f"Invalid MIME type: {mime_type} for file {path}.")

    @validate_call(config=model_config)
    def loads(
        self,
        texts: Optional[list[Path | str]] = None,
        names: Optional[list[str]] = None,
        start: Optional[int] = 1,
        zero_pad: Optional[str] = "03",
    ) -> None:
        """Load a list of texts.

        Args:
            texts (Optional[list[Path | str]]): The list of texts to load.
            names (Optional[list[str]]): The list of names for the texts.
            start (Optional[int]): The starting index for the names if no list is provided.
            zero_pad (Optional[str]): The zero padding for the names increments if no list is provided.

        Raises:
            ValueError: If fewer names than texts are provided.
        """
        # Nothing to load when no texts are given.
        texts = [] if texts is None else ensure_list(texts)
        if names is None:
            offset = 1 if start is None else start
            pad = "03" if zero_pad is None else zero_pad
            names = [
                f"text{index:{pad}d}" for index in range(offset, offset + len(texts))
            ]
        elif len(names) < len(texts):
            # Check up front so a short list cannot leave the loader half-updated.
            raise ValueError(
                f"Expected at least {len(texts)} names but got {len(names)}."
            )
        for name, text in zip(names, texts):
            # Raw strings have no source file; a None placeholder keeps
            # `paths` the same length as the other lists.
            self.paths.append(None)
            self.names.append(name)
            self.mime_types.append("text/plain")
            self.texts.append(text)

`data: dict[str, list]` `property` ¤

Get the data as a dictionary.

Returns:

Type	Description
`dict[str, list]`	dict[str, list]: A dictionary containing the paths, mime_types, names, texts, and errors.

`df: pd.DataFrame` `property` ¤

Get a pandas DataFrame of file records.

Returns:

Type	Description
`DataFrame`	pandas.DataFrame: A DataFrame containing file metadata and content.

`errors: list = []` `pydantic-field` ¤

The list of loading errors.

`mime_types: list = []` `pydantic-field` ¤

The list of text mime types.

`names: list = []` `pydantic-field` ¤

The list of text names.

`paths: list = []` `pydantic-field` ¤

The list of paths.

`records: list[dict[str, str]]` `property` ¤

Get a list of file records.

Returns:

Type	Description
`list[dict[str, str]]`	list[dict]: List of dictionaries containing file metadata and content.
`list[dict[str, str]]`	Each dict has keys: path, mime_type, name, text

Raises:

Type	Description
`ValueError`	If the lengths of paths, mime_types, names and texts don't match.

Note

Validates that all lists have the same length before returning the records.

`texts: list = []` `pydantic-field` ¤

The list of loaded texts.

`__init__()` ¤

Initialize the Loader.

Source code in lexos/io/loader.py

def __init__(self):
    """Initialize the Loader.

    Takes no arguments and delegates to the parent class initializer.
    """
    super().__init__()

`__iter__() -> Generator[dict, None, None]` ¤

Iterate through the records.

Source code in lexos/io/base_loader.py

def __iter__(self) -> Generator[dict, None, None]:
    """Return a generator that yields each record in turn.

    The records are read from `self.records` once, when iteration is
    requested.
    """
    all_records = self.records
    return (entry for entry in all_records)

`dedupe(subset: Optional[list[str]] = None) -> pd.DataFrame` ¤

Deduplicate a DataFrame.

Parameters:

Name	Type	Description	Default
`subset`	`Optional[list[str]]`	The columns to consider for deduplication.	`None`

Returns:

Type	Description
`DataFrame`	pd.DataFrame: The deduplicated DataFrame.

Source code in lexos/io/base_loader.py

@validate_call(config=model_config)
def dedupe(self, subset: Optional[list[str]] = None) -> pd.DataFrame:
    """Remove duplicate records, keeping the first occurrence of each.

    The loader's `paths`, `mime_types`, `names`, and `texts` lists are
    replaced with the deduplicated values. `errors` is left untouched.

    Args:
        subset (Optional[list[str]]): The columns to consider for deduplication.

    Returns:
        pd.DataFrame: The deduplicated DataFrame. If there is no data, an
            empty DataFrame is returned and the loader is left unchanged.
    """
    df = self.df.copy()
    if df.empty:
        return df
    df.drop_duplicates(subset=subset, keep="first", inplace=True, ignore_index=True)
    self.paths = df["path"].tolist()
    self.mime_types = df["mime_type"].tolist()
    self.names = df["name"].tolist()
    self.texts = df["text"].tolist()
    # The signature and docstring promise the DataFrame; return it.
    return df

`load(paths: Path | str | list[Path | str]) -> None` ¤

Load a list of paths.

Parameters:

Name	Type	Description	Default
`paths`	`Path \| str \| list[Path \| str]`	The list of paths to load.	required

Source code in lexos/io/loader.py

@validate_call(config=model_config)
def load(self, paths: Path | str | list[Path | str]) -> None:
    """Load a list of paths.

    Directories are searched recursively for files. Each file is dispatched
    to a loader based on its detected mime type; files that cannot be read or
    have an unsupported type are recorded in `self.errors`.

    Args:
        paths (Path | str | list[Path | str]): The list of paths to load.
    """
    for path in ensure_list(paths):
        directory = Path(path)
        if directory.is_dir():
            # rglob already descends into sub-directories, so recurse with
            # files only (otherwise nested files are loaded repeatedly) and
            # do not try to open the directory itself afterwards.
            self.load(sorted(p for p in directory.rglob("*") if p.is_file()))
            continue
        # Get the mime type of the file
        try:
            with open(path, "rb") as f:
                file_start = f.read(FILE_START)
            mime_type = self._get_mime_type(path, file_start)
        except IOError as e:
            self.errors.append(e)
            mime_type = None
        if mime_type in TEXT_TYPES:
            self._load_text_file(path, mime_type)
        elif mime_type in PDF_TYPES:
            self._load_pdf_file(path)
        elif mime_type in DOCX_TYPES:
            self._load_docx_file(path)
        elif mime_type in ZIP_TYPES:
            self._load_zip_file(path)
        else:
            self.errors.append(f"Invalid MIME type: {mime_type} for file {path}.")

`load_dataset(dataset: Self) -> None` ¤

Load a dataset.

Parameters:

Name	Type	Description	Default
`dataset`	`DataLoader`	The dataset to load.	required

As of v2.10.5, Pydantic does not support recursive types (Self).

As a result, this method performs its own check to see if the value of dataset is of type DataLoader.

Source code in lexos/io/loader.py

def load_dataset(self, dataset: Self) -> None:
    """Append the contents of another dataset to this loader.

    The dataset's paths, mime types, names, and texts are concatenated onto
    the lists already held by this loader.

    Args:
        dataset (DataLoader): The dataset to load.

    Raises:
        LexosException: If `dataset` is not a `DataLoader` instance.

    Note: As of v2.10.5, Pydantic does not support recursive types (Self),
        so the type of `dataset` is checked by hand here instead of by
        `validate_call`.
    """
    if isinstance(dataset, DataLoader):
        for field_name in ("paths", "mime_types", "names", "texts"):
            combined = getattr(self, field_name) + getattr(dataset, field_name)
            setattr(self, field_name, combined)
        return
    raise LexosException("Invalid dataset type.")

`loads(texts: Optional[list[Path | str]] = None, names: Optional[list[str]] = None, start: Optional[int] = 1, zero_pad: Optional[str] = '03') -> None` ¤

Load a list of texts.

Parameters:

Name	Type	Description	Default
`texts`	`Optional[list[Path \| str]]`	The list of texts to load.	`None`
`names`	`Optional[list[str]]`	The list of names for the texts.	`None`
`start`	`Optional[int]`	The starting index for the names if no list is provided.	`1`
`zero_pad`	`Optional[str]`	The zero padding for the names increments if no list is provided.	`'03'`

Source code in lexos/io/loader.py

@validate_call(config=model_config)
def loads(
    self,
    texts: Optional[list[Path | str]] = None,
    names: Optional[list[str]] = None,
    start: Optional[int] = 1,
    zero_pad: Optional[str] = "03",
) -> None:
    """Load a list of texts.

    Args:
        texts (Optional[list[Path | str]]): The list of texts to load.
        names (Optional[list[str]]): The list of names for the texts.
        start (Optional[int]): The starting index for the names if no list is provided.
        zero_pad (Optional[str]): The zero padding for the names increments if no list is provided.

    Raises:
        ValueError: If fewer names than texts are provided.
    """
    # Nothing to load when no texts are given.
    texts = [] if texts is None else ensure_list(texts)
    if names is None:
        offset = 1 if start is None else start
        pad = "03" if zero_pad is None else zero_pad
        names = [f"text{index:{pad}d}" for index in range(offset, offset + len(texts))]
    elif len(names) < len(texts):
        # Check up front so a short list cannot leave the loader half-updated.
        raise ValueError(f"Expected at least {len(texts)} names but got {len(names)}.")
    for name, text in zip(names, texts):
        # Raw strings have no source file; a None placeholder keeps `paths`
        # the same length as the other lists.
        self.paths.append(None)
        self.names.append(name)
        self.mime_types.append("text/plain")
        self.texts.append(text)

`reset() -> None` ¤

Reset the class attributes to empty lists.

Source code in lexos/io/base_loader.py

@validate_call(config=model_config)
def reset(self) -> None:
    """Clear all loaded data by giving every list attribute a new empty list."""
    for field_name in ("paths", "mime_types", "names", "texts", "errors"):
        setattr(self, field_name, [])

`show_duplicates(subset: Optional[list[str]] = None) -> pd.DataFrame | None` ¤

Show duplicates in a DataFrame.

Parameters:

Name	Type	Description	Default
`subset`	`Optional[list[str]]`	The columns to consider for checking duplicates.	`None`

Returns:

Type	Description
`DataFrame \| None`	pd.DataFrame \| None: The duplicated rows, or None if there is no data.

Source code in lexos/io/base_loader.py

@validate_call(config=model_config)
def show_duplicates(
    self, subset: Optional[list[str]] = None
) -> pd.DataFrame | None:
    """Return the rows that duplicate an earlier row.

    Args:
        subset (Optional[list[str]]): The columns to consider for checking duplicates.

    Returns:
        pd.DataFrame | None: The duplicated rows, or None if there is no data.
    """
    if self.df.empty:
        return None
    snapshot = self.df.copy()
    is_duplicate = snapshot.duplicated(subset=subset)
    return snapshot[is_duplicate]

`to_csv(path: Path | str, **kwargs) -> None` ¤

Save the data to a csv file.

Parameters:

Name	Type	Description	Default
`path`	`Path \| str`	The path to save the csv file.	required

Source code in lexos/io/base_loader.py

@validate_call(config=model_config)
def to_csv(self, path: Path | str, **kwargs) -> None:
    """Write the loaded data to a csv file.

    Args:
        path (Path | str): The path to save the csv file.
        **kwargs: Extra keyword arguments forwarded to `DataFrame.to_csv`.
    """
    frame = self.df
    frame.to_csv(path, **kwargs)

`to_excel(path: Path | str, **kwargs) -> None` ¤

Save the data to an Excel file.

Parameters:

Name	Type	Description	Default
`path`	`Path \| str`	The path to save the Excel file.	required

Source code in lexos/io/base_loader.py

@validate_call(config=model_config)
def to_excel(self, path: Path | str, **kwargs) -> None:
    """Save the data to an Excel file.

    Args:
        path (Path | str): The path to save the Excel file.
        **kwargs: Extra keyword arguments forwarded to `DataFrame.to_excel`.
    """
    # Previously this called `to_csv`, which wrote CSV text to the target path.
    self.df.to_excel(path, **kwargs)

`to_json(path: Path | str, **kwargs) -> None` ¤

Save the data to a json file.

Parameters:

Name	Type	Description	Default
`path`	`Path \| str`	The path to save the json file.	required

Source code in lexos/io/base_loader.py

@validate_call(config=model_config)
def to_json(self, path: Path | str, **kwargs) -> None:
    """Write the loaded data to a json file.

    Args:
        path (Path | str): The path to save the json file.
        **kwargs: Extra keyword arguments forwarded to `DataFrame.to_json`.
    """
    frame = self.df
    frame.to_json(path, **kwargs)

`__init__()` ¤

Initialize the Loader.

Source code in lexos/io/loader.py

def __init__(self):
    """Initialize the Loader.

    Takes no arguments and delegates to the parent class initializer.
    """
    super().__init__()

`_get_mime_type(path: Path | str, file_start: str) -> str` ¤

Get the mime type of a file.

Parameters:

Name	Type	Description	Default
`path`	`Path \| str`	The path to the file.	required
`file_start`	`str`	The leading content of the file, used to detect its type.	required

Returns:

Name	Type	Description
`str`	`str`	The mime type of the file.

Source code in lexos/io/loader.py

def _get_mime_type(self, path: Path | str, file_start: str) -> str:
    """Get the mime type of a file.

    Args:
        path (Path | str): The path to the file.

    Returns:
        str: The mime type of the file.
    """
    if Path(path).suffix == ".pickle":
        return "application/vnd.python.pickle"
    results = puremagic.magic_string(file_start, path)
    if not results:
        return None
    else:
        mime_type = results[0].mime_type
        if mime_type == "":
            mime_type, _ = mimetypes.guess_type(path)
        return mime_type

`_load_docx_file(path: Path | str) -> None` ¤

Load a docx file.

Parameters:

Name	Type	Description	Default
`path`	`Path \| str`	The path to the file.	required

Note

Consider https://github.com/ShayHill/docx2python for greater coverage.

Source code in lexos/io/loader.py

def _load_docx_file(self, path: Path | str) -> None:
    """Load a docx file.

    The text of all paragraphs is joined with newlines and stored as a single
    text. Any failure is recorded in `self.errors`.

    Args:
        path (Path | str): The path to the file.

    Note:
        Consider https://github.com/ShayHill/docx2python for greater coverage.
    """
    try:
        doc = Document(path)
        text = "\n".join(decode(p.text) for p in doc.paragraphs)
    except Exception as e:
        self.errors.append(e)
        return
    # Record a path as well so the parallel lists stay the same length
    # (same convention as _load_text_file).
    self.paths.append(Path(path).name)
    # NOTE(review): the text and zip loaders store the stem as the name; the
    # full file name is kept here to preserve existing output.
    self.names.append(Path(path).name)
    self.mime_types.append("application/docx")
    self.texts.append(text)

`_load_pdf_file(path: Path | str) -> None` ¤

Load a pdf file.

Parameters:

Name	Type	Description	Default
`path`	`Path \| str`	The path to the file.	required

Source code in lexos/io/loader.py

def _load_pdf_file(self, path: Path | str) -> None:
    """Load a pdf file.

    Each page becomes a separate text entry. Any failure is recorded in
    `self.errors`; pages extracted before the failure are kept.

    Args:
        path (Path | str): The path to the file.
    """
    file_name = Path(path).name
    try:
        reader = PdfReader(path)
        for page in reader.pages:
            text = decode(page.extract_text())
            # Record a path as well so the parallel lists stay the same
            # length (same convention as _load_text_file).
            self.paths.append(file_name)
            self.names.append(file_name)
            self.mime_types.append("application/pdf")
            self.texts.append(text)
    except Exception as e:
        self.errors.append(e)

`_load_text_file(path: Path | str, mime_type: str) -> None` ¤

Load a text file.

Parameters:

Name	Type	Description	Default
`path`	`Path \| str`	The path to the file.	required
`mime_type`	`str`	The mime type of the file.	required

Source code in lexos/io/loader.py

def _load_text_file(self, path: Path | str, mime_type: str) -> None:
    """Load a text file.

    The file is read as bytes and passed through `decode`. Any failure is
    recorded in `self.errors`.

    Args:
        path (Path | str): The path to the file.
        mime_type (str): The mime type of the file.
    """
    try:
        with open(path, "rb") as f:
            text = decode(f.read())
    except Exception as e:
        # Exception rather than BaseException, so KeyboardInterrupt and
        # SystemExit are not swallowed.
        self.errors.append(e)
        return
    self.paths.append(Path(path).name)
    self.names.append(Path(path).stem)
    self.mime_types.append(mime_type)
    self.texts.append(text)

`_load_zip_file(path: Path | str) -> None` ¤

Handle a zip file.

Parameters:

Name	Type	Description	Default
`path`	`Path \| str`	The path to the file.	required

Source code in lexos/io/loader.py

def _load_zip_file(self, path: Path | str) -> None:
    """Handle a zip file.

    Every file entry whose detected mime type is in `VALID_FILE_TYPES` is
    decoded and stored; directory entries are skipped. Problems with the
    archive or with individual entries are recorded in `self.errors`.

    Args:
        path (Path | str): The path to the file.
    """
    try:
        with open(path, "rb") as fin, zipfile.ZipFile(fin) as archive:
            for info in archive.infolist():
                if info.is_dir():
                    # Directory entries carry no content to load.
                    continue
                try:
                    # Get the mime type of the file
                    file_bytes = archive.read(info.filename)
                    file_start = decode(file_bytes[:MIN_ENCODING_DETECT])
                    mime_type = self._get_mime_type(info.filename, file_start)
                except (OSError, UnicodeDecodeError, zipfile.BadZipFile) as e:
                    self.errors.append(e)
                    mime_type = None
                if mime_type not in VALID_FILE_TYPES:
                    self.errors.append(
                        f"Invalid MIME type: {mime_type} for file {info.filename}."
                    )
                    continue
                # NOTE(review): entries are always decoded as text, even when
                # the mime type is a pdf/docx/zip type -- confirm this is
                # intended.
                try:
                    text = decode(file_bytes)
                except Exception as e:
                    self.errors.append(e)
                    continue
                self.paths.append(Path(path).as_posix() + "/" + info.filename)
                self.names.append(Path(info.filename).stem)
                self.mime_types.append(mime_type)
                self.texts.append(text)
    except (OSError, zipfile.BadZipFile) as e:
        # An unreadable or corrupt archive is recorded like any other loading
        # failure instead of aborting the whole load.
        self.errors.append(e)

`load_dataset(dataset: Self) -> None` ¤

Load a dataset.

Parameters:

Name	Type	Description	Default
`dataset`	`DataLoader`	The dataset to load.	required

As of v2.10.5, Pydantic does not support recursive types (Self).

As a result, this method performs its own check to see if the value of dataset is of type DataLoader.

Source code in lexos/io/loader.py

def load_dataset(self, dataset: Self) -> None:
    """Append the contents of another dataset to this loader.

    The dataset's paths, mime types, names, and texts are concatenated onto
    the lists already held by this loader.

    Args:
        dataset (DataLoader): The dataset to load.

    Raises:
        LexosException: If `dataset` is not a `DataLoader` instance.

    Note: As of v2.10.5, Pydantic does not support recursive types (Self),
        so the type of `dataset` is checked by hand here instead of by
        `validate_call`.
    """
    if isinstance(dataset, DataLoader):
        for field_name in ("paths", "mime_types", "names", "texts"):
            combined = getattr(self, field_name) + getattr(dataset, field_name)
            setattr(self, field_name, combined)
        return
    raise LexosException("Invalid dataset type.")

`load(paths: Path | str | list[Path | str]) -> None` ¤

Load a list of paths.

Parameters:

Name	Type	Description	Default
`paths`	`Path \| str \| list[Path \| str]`	The list of paths to load.	required

Source code in lexos/io/loader.py

@validate_call(config=model_config)
def load(self, paths: Path | str | list[Path | str]) -> None:
    """Load a list of paths.

    Directories are searched recursively for files. Each file is dispatched
    to a loader based on its detected mime type; files that cannot be read or
    have an unsupported type are recorded in `self.errors`.

    Args:
        paths (Path | str | list[Path | str]): The list of paths to load.
    """
    for path in ensure_list(paths):
        directory = Path(path)
        if directory.is_dir():
            # rglob already descends into sub-directories, so recurse with
            # files only (otherwise nested files are loaded repeatedly) and
            # do not try to open the directory itself afterwards.
            self.load(sorted(p for p in directory.rglob("*") if p.is_file()))
            continue
        # Get the mime type of the file
        try:
            with open(path, "rb") as f:
                file_start = f.read(FILE_START)
            mime_type = self._get_mime_type(path, file_start)
        except IOError as e:
            self.errors.append(e)
            mime_type = None
        if mime_type in TEXT_TYPES:
            self._load_text_file(path, mime_type)
        elif mime_type in PDF_TYPES:
            self._load_pdf_file(path)
        elif mime_type in DOCX_TYPES:
            self._load_docx_file(path)
        elif mime_type in ZIP_TYPES:
            self._load_zip_file(path)
        else:
            self.errors.append(f"Invalid MIME type: {mime_type} for file {path}.")

`loads(texts: Optional[list[Path | str]] = None, names: Optional[list[str]] = None, start: Optional[int] = 1, zero_pad: Optional[str] = '03') -> None` ¤

Load a list of texts.

Parameters:

Name	Type	Description	Default
`texts`	`Optional[list[Path \| str]]`	The list of texts to load.	`None`
`names`	`Optional[list[str]]`	The list of names for the texts.	`None`
`start`	`Optional[int]`	The starting index for the names if no list is provided.	`1`
`zero_pad`	`Optional[str]`	The zero padding for the names increments if no list is provided.	`'03'`

Source code in lexos/io/loader.py

@validate_call(config=model_config)
def loads(
    self,
    texts: Optional[list[Path | str]] = None,
    names: Optional[list[str]] = None,
    start: Optional[int] = 1,
    zero_pad: Optional[str] = "03",
) -> None:
    """Load a list of texts.

    Args:
        texts (Optional[list[Path | str]]): The list of texts to load.
        names (Optional[list[str]]): The list of names for the texts.
        start (Optional[int]): The starting index for the names if no list is provided.
        zero_pad (Optional[str]): The zero padding for the names increments if no list is provided.

    Raises:
        ValueError: If fewer names than texts are provided.
    """
    # Nothing to load when no texts are given.
    texts = [] if texts is None else ensure_list(texts)
    if names is None:
        offset = 1 if start is None else start
        pad = "03" if zero_pad is None else zero_pad
        names = [f"text{index:{pad}d}" for index in range(offset, offset + len(texts))]
    elif len(names) < len(texts):
        # Check up front so a short list cannot leave the loader half-updated.
        raise ValueError(f"Expected at least {len(texts)} names but got {len(names)}.")
    for name, text in zip(names, texts):
        # Raw strings have no source file; a None placeholder keeps `paths`
        # the same length as the other lists.
        self.paths.append(None)
        self.names.append(name)
        self.mime_types.append("text/plain")
        self.texts.append(text)

loader¤

The Loader Class¤

VALID_FILE_TYPES = {*TEXT_TYPES, *PDF_TYPES, *DOCX_TYPES, *ZIP_TYPES} module-attribute ¤

Loader pydantic-model ¤

data: dict[str, list] property ¤

df: pd.DataFrame property ¤

errors: list = [] pydantic-field ¤

mime_types: list = [] pydantic-field ¤

names: list = [] pydantic-field ¤

paths: list = [] pydantic-field ¤

records: list[dict[str, str]] property ¤

texts: list = [] pydantic-field ¤

__init__() ¤

__iter__() -> Generator[dict, None, None] ¤

dedupe(subset: Optional[list[str]] = None) -> pd.DataFrame ¤

load(paths: Path | str | list[Path | str]) -> None ¤

load_dataset(dataset: Self) -> None ¤

loads(texts: Optional[list[Path | str]] = None, names: Optional[list[str]] = None, start: Optional[int] = 1, zero_pad: Optional[str] = '03') -> None ¤

reset() -> None ¤

show_duplicates(subset: Optional[list[str]] = None) -> pd.DataFrame | None ¤

to_csv(path: Path | str, **kwargs) -> None ¤

to_excel(path: Path | str, **kwargs) -> None ¤

to_json(path: Path | str, **kwargs) -> None ¤

__init__() ¤

_get_mime_type(path: Path | str, file_start: str) -> str ¤

_load_docx_file(path: Path | str) -> None ¤

_load_pdf_file(path: Path | str) -> None ¤

_load_text_file(path: Path | str, mime_type: str) -> None ¤

_load_zip_file(path: Path | str) -> None ¤

load_dataset(dataset: Self) -> None ¤

load(paths: Path | str | list[Path | str]) -> None ¤

loads(texts: Optional[list[Path | str]] = None, names: Optional[list[str]] = None, start: Optional[int] = 1, zero_pad: Optional[str] = '03') -> None ¤

The `Loader` Class¤

`VALID_FILE_TYPES = {*TEXT_TYPES, *PDF_TYPES, *DOCX_TYPES, *ZIP_TYPES}` `module-attribute` ¤

`Loader` `pydantic-model` ¤

`data: dict[str, list]` `property` ¤

`df: pd.DataFrame` `property` ¤

`errors: list = []` `pydantic-field` ¤

`mime_types: list = []` `pydantic-field` ¤

`names: list = []` `pydantic-field` ¤

`paths: list = []` `pydantic-field` ¤

`records: list[dict[str, str]]` `property` ¤

`texts: list = []` `pydantic-field` ¤

`__init__()` ¤

`__iter__() -> Generator[dict, None, None]` ¤

`dedupe(subset: Optional[list[str]] = None) -> pd.DataFrame` ¤

`load(paths: Path | str | list[Path | str]) -> None` ¤

`load_dataset(dataset: Self) -> None` ¤

`loads(texts: Optional[list[Path | str]] = None, names: Optional[list[str]] = None, start: Optional[int] = 1, zero_pad: Optional[str] = '03') -> None` ¤

`reset() -> None` ¤

`show_duplicates(subset: Optional[list[str]] = None) -> pd.DataFrame | None` ¤

`to_csv(path: Path | str, **kwargs) -> None` ¤

`to_excel(path: Path | str, **kwargs) -> None` ¤

`to_json(path: Path | str, **kwargs) -> None` ¤

`__init__()` ¤

`_get_mime_type(path: Path | str, file_start: str) -> str` ¤

`_load_docx_file(path: Path | str) -> None` ¤

`_load_pdf_file(path: Path | str) -> None` ¤

`_load_text_file(path: Path | str, mime_type: str) -> None` ¤

`_load_zip_file(path: Path | str) -> None` ¤

`load_dataset(dataset: Self) -> None` ¤

`load(paths: Path | str | list[Path | str]) -> None` ¤

`loads(texts: Optional[list[Path | str]] = None, names: Optional[list[str]] = None, start: Optional[int] = 1, zero_pad: Optional[str] = '03') -> None` ¤