Skip to content

Smart¤

The smart Loader class is the primary component of the IO module. It supersedes the earlier basic and advanced Loader classes.

lexos.io.smart.Loader ¤

Loader class.

Handles the queue for assets to be pipelined from their sources to text processing tools.

Source code in lexos\io\smart.py
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
class Loader:
    """Loader class.

    Handles the queue for assets to be pipelined from their sources to
    text processing tools.

    Every text that loads successfully adds one entry to each of the
    parallel lists ``names``, ``locations`` and ``texts``. Problems are
    recorded as ``{"path": ..., "message": ...}`` dicts in ``errors``.
    """

    def __init__(self):
        """Create an empty loader with decoding switched on."""
        self.source = None
        self.names = []
        self.locations = []
        self.texts = []
        self.errors = []
        self.decode = True

    def __iter__(self) -> Iterable:
        """Iterate over the loaded texts.

        Returns:
            Iterable: One ``Text`` per loaded item, built from the loader's
                source and the item's name, location and text.
        """
        for name, location, text in zip(self.names, self.locations, self.texts):
            yield Text(self.source, name, location, text)

    def _decode(self, text: Union[bytes, str]) -> str:
        """Decode a text by delegating to ``utils._decode_bytes``.

        Args:
            text (Union[bytes, str]): The text to decode.

        Returns:
            str: The decoded text.
        """
        return utils._decode_bytes(text)

    def _add_text(self, path: str, text: Union[bytes, str]) -> None:
        """Decode (when enabled) and record a text.

        On ``LexosException`` nothing is added to the result lists and an
        ``encoding_error`` entry is appended to ``errors`` instead.

        Args:
            path (str): The path to the text file.
            text (Union[bytes, str]): The raw or already extracted text.
        """
        try:
            # Compute everything that may fail before touching the result
            # lists, so names/locations/texts can never get out of step.
            content = self._decode(text) if self.decode else text
            name = utils.ensure_path(path).stem
        except LexosException:
            self.errors.append({"path": path, "message": LANG["encoding_error"]})
            return
        self.texts.append(content)
        self.names.append(name)
        self.locations.append(path)

    def _ensure_source(self, source: Union[List[Union[Path, str]], Path, str]) -> None:
        """Ensure that either the object or the method supplies a source value.

        A truthy ``source`` replaces ``self.source``; otherwise the value
        already stored on the loader is kept.

        Args:
            source (Union[List[Union[Path, str]], Path, str]): The source.

        Returns:
            None: None.

        Raises:
            LexosException: If no source is provided.
        """
        if source:
            self.source = source
        elif self.source is None:
            # Explicit check rather than ``assert``: assertions are removed
            # when Python runs with -O, which would disable this validation.
            raise LexosException(LANG["no_source"])

    def _handle_source(self, path: Union[Path, str]) -> None:
        """Add a text based on source type.

        ``.zip`` files are unpacked, ``.docx`` and ``.pdf`` files go through
        their text extractors, and anything else is read as raw bytes.

        Args:
            path (Union[Path, str]): The path to the text file.
        """
        ext = utils.ensure_path(path).suffix
        path = str(path)
        # NOTE(review): load() also routes URLs here, yet every branch uses
        # the builtin open(); confirm how remote sources are meant to be read.
        if ext == ".zip":
            self._handle_zip(path)
        elif ext in (".docx", ".pdf"):
            with open(path, "rb") as f:
                buffer = io.BytesIO(f.read())
            if ext == ".docx":
                self._add_text(path, docx2txt.process(buffer))
            else:
                self._add_text(path, extract_text(buffer))
        else:
            with open(path, "rb") as f:
                self._add_text(path, f.read())

    def _handle_zip(self, path: str) -> None:
        """Extract a zip file and add each text inside.

        Every member is recorded under the archive's own path, so all texts
        from one archive share the same name and location.

        Args:
            path (str): The path to the zip file.
        """
        with open(path, "rb") as f, zipfile.ZipFile(f) as archive:
            for member in archive.namelist():
                # Entries without a suffix (e.g. directory entries) are skipped.
                if Path(member).suffix == "":
                    continue
                # NOTE(review): the ".ds_store" prefix is lowercase while
                # macOS writes ".DS_Store"; confirm whether this is intended.
                if member.startswith(("__MACOSX", ".ds_store")):
                    continue
                self._add_text(path, archive.read(member))

    def _validate_source(self, source: Any, is_valid: bool = True) -> bool:
        """Validate that the source is a string or Path.

        Args:
            source (Any): A source.
            is_valid (bool): The value returned when the type check passes.

        Returns:
            bool: Whether the source is valid.
        """
        if not isinstance(source, (str, Path)):
            return False
        return is_valid

    def load(
        self,
        source: Union[List[Union[Path, str]], Path, str] = None,
        decode: bool = True,
    ) -> None:
        """Load the source into a list of bytes and strings.

        Sources that are not ``str``/``Path`` are logged as ``format_error``;
        sources that are neither a file, a URL nor a directory are logged as
        ``io_error``.

        Args:
            source (Union[List[Union[Path, str]], Path, str]): A source or
                list of sources.
            decode (bool): Whether to decode the source.

        Raises:
            LexosException: If no source is given here or stored on the loader.
        """
        self._ensure_source(source)

        # Assign unconditionally so that decode=False is actually honoured.
        self.decode = decode

        if not isinstance(self.source, list):
            self.source = [self.source]

        for path in self.source:
            if not self._validate_source(path):
                self.errors.append({"path": path, "message": LANG["format_error"]})
                continue
            if "github.com" in str(path):
                for filepath in utils.get_github_raw_paths(path):
                    self._handle_source(filepath)
            elif utils.is_file(path) or utils.is_url(path):
                self._handle_source(path)
            elif utils.is_dir(path):
                for filepath in utils.ensure_path(path).rglob("*"):
                    # rglob("*") also yields sub-directories, which cannot
                    # be opened as files.
                    if filepath.is_file():
                        self._handle_source(filepath)
            else:
                self.errors.append({"path": path, "message": LANG["io_error"]})

__init__() ¤

init method.

Source code in lexos\io\smart.py
40
41
42
43
44
45
46
47
def __init__(self):
    """Initialise an empty loader: no source, empty lists, decoding on."""
    self.source = None
    # The three result lists and the error log all start out empty.
    self.names, self.locations, self.texts, self.errors = [], [], [], []
    self.decode = True

__iter__() ¤

Iterate over the loader.

Returns:

Name Type Description
Iterable Iterable

The loader.

Source code in lexos\io\smart.py
49
50
51
52
53
54
55
56
def __iter__(self) -> Iterable:
    """Yield one ``Text`` per loaded item.

    Returns:
        Iterable: ``Text`` objects built from the loader's source and the
            name, location and text stored at each position.
    """
    for position, name in enumerate(self.names):
        location = self.locations[position]
        content = self.texts[position]
        yield Text(self.source, name, location, content)

load(source=None, decode=True) ¤

Load the source into a list of bytes and strings.

Parameters:

Name Type Description Default
source Union[List[Union[Path, str]], Path, str]

A source or list of sources.

None
decode bool

Whether to decode the source.

True
Source code in lexos\io\smart.py
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
def load(
    self,
    source: Union[List[Union[Path, str]], Path, str] = None,
    decode: bool = True,
) -> None:
    """Load the source into a list of bytes and strings.

    Sources that are not accepted by ``_validate_source`` are logged as
    ``format_error``; sources that are neither a file, a URL nor a
    directory are logged as ``io_error``.

    Args:
        source (Union[List[Union[Path, str]], Path, str]): A source or list
            of sources.
        decode (bool): Whether to decode the source.
    """
    self._ensure_source(source)

    # Assign unconditionally so that decode=False is actually honoured.
    self.decode = decode

    if not isinstance(self.source, list):
        self.source = [self.source]

    for path in self.source:
        if not self._validate_source(path):
            self.errors.append({"path": path, "message": LANG["format_error"]})
            continue
        if "github.com" in str(path):
            for filepath in utils.get_github_raw_paths(path):
                self._handle_source(filepath)
        elif utils.is_file(path) or utils.is_url(path):
            self._handle_source(path)
        elif utils.is_dir(path):
            for filepath in utils.ensure_path(path).rglob("*"):
                # rglob("*") also yields sub-directories, which cannot be
                # opened as files.
                if filepath.is_file():
                    self._handle_source(filepath)
        else:
            self.errors.append({"path": path, "message": LANG["io_error"]})

lexos.io.smart.Text ¤

Class for accessing a text from an iterator.

Source code in lexos\io\smart.py
194
195
196
197
198
199
200
201
202
203
204
class Text:
    """A single text as handed out when iterating over a loader."""

    def __init__(
        self, source: str = None, name: str = "", location: str = "", text: str = ""
    ) -> None:
        """Store the four constructor arguments as same-named attributes.

        Args:
            source (str): The source the text was loaded from.
            name (str): The name of the text.
            location (str): The location of the text.
            text (str): The content of the text.
        """
        self.source, self.name = source, name
        self.location, self.text = location, text

__init__(source=None, name='', location='', text='') ¤

init method.

Source code in lexos\io\smart.py
197
198
199
200
201
202
203
204
def __init__(
    self, source: str = None, name: str = "", location: str = "", text: str = ""
) -> None:
    """Store each argument on the instance under the same name.

    Args:
        source (str): The source the text was loaded from.
        name (str): The name of the text.
        location (str): The location of the text.
        text (str): The content of the text.
    """
    self.source, self.name, self.location, self.text = (
        source,
        name,
        location,
        text,
    )