
Dataset ¤

The DatasetLoader class is used to load multiple texts stored in a single file; it also handles directories, zip archives, and lists of such sources.
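
A minimal sketch of the typical workflow (the file path and column names here are hypothetical):

from lexos.io.dataset import DatasetLoader

# Load a CSV whose title/text columns need to be mapped to the required names
loader = DatasetLoader("corpus.csv", title_col="label", text_col="content")
dataset = loader.data  # the resulting Dataset
print(loader.names)    # the titles of the loaded texts
print(loader.texts)    # the texts themselves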

lexos.io.dataset.Dataset ¤

Bases: BaseModel

Dataset class.

Source code in lexos\io\dataset.py
class Dataset(BaseModel):
    """Dataset class."""

    data: Optional[List[Dict[str, str]]] = None

    class Config:
        """Config class."""

        arbitrary_types_allowed = True

    @property
    def locations(self) -> List[str]:
        """Return the locations of the object data.

        Returns:
            List[str]: The locations of the object data.
        """
        if any("location" in item for item in self.data):
            return [item["locations"] for item in self.data]
        else:
            return None

    @property
    def names(self) -> List[str]:
        """Return the names of the object data.

        Returns:
            List[str]: The names of the object data.
        """
        return [item["title"] for item in self.data]

    @property
    def texts(self) -> List[str]:
        """Return the texts of the object data.

        Returns:
            List[str]: The texts of the object data.
        """
        return [item["text"] for item in self.data]

    def __iter__(self) -> Iterable:
        """Iterate over the dataset.

        Returns:
            Iterable: The dataset.
        """
        for item in iter(self.data):
            yield item

    def __getitem__(self, item: int) -> Dict[str, str]:
        """Get an item from dataset.

        Args:
            item: The index of the item to get.

        Returns:
            Dict[str, str]: The item at the given index.
        """
        return self.data[item]

    def df(self) -> pd.DataFrame:
        """Return the dataframe of the object data.

        Returns:
            pd.DataFrame: The dataframe of the object data.
        """
        return pd.DataFrame(self.data)

    @classmethod
    def parse_csv(
        cls: Type["Model"],
        source: str,
        title_col: Optional[str] = None,
        text_col: Optional[str] = None,
        **kwargs: Dict[str, str],
    ) -> "Model":
        """Parse CSV/TSV texts into the Dataset object.

        Args:
            source (str): The string or path to file containing the texts to parse.
            title_col (Optional[str]): The column name to convert to "title".
            text_col (Optional[str]): The column name to convert to "text".

        Returns:
            Model: A dataset object.
        """
        source = cls._get_file_like(source)
        df = pd.read_csv(source, **kwargs)
        if title_col:
            df = df.rename(columns={title_col: "title"})
        if text_col:
            df = df.rename(columns={text_col: "text"})
        if "title" not in df.columns or "text" not in df.columns:
            err = (
                "CSV and TSV files must contain headers named `title` and `text`. ",
                "You can convert the names of existing headers to these with the ",
                "`title_col` and `text_col` parameters.",
            )
            raise LexosException("".join(err))
        return cls.parse_obj({"data": df.to_dict(orient="records")})

    @classmethod
    def parse_dict(cls: Type["Model"], source: dict,) -> "Model":
        """Alias for cls.parse_obj().

        Args:
            source (dict): The dict to parse.

        Returns:
            Model: A dataset object.
        """
        return cls.parse_obj({"data": source})

    @classmethod
    def parse_excel(
        cls: Type["Model"],
        source: str,
        title_col: Optional[str] = None,
        text_col: Optional[str] = None,
        **kwargs: Dict[str, str],
    ) -> "Model":
        """Parse Excel files into the Dataset object.

        Args:
            source (str): The path to the Excel file containing the texts to parse.
            title_col (Optional[str]): The column name to convert to "title".
            text_col (Optional[str]): The column name to convert to "text".

        Returns:
            Model: A dataset object.
        """
        try:
            df = pd.read_excel(source, **kwargs)
        except Exception as e:
            raise LexosException(f"Could not read {source}: {e}")
        if title_col:
            df = df.rename(columns={title_col: "title"})
        if text_col:
            df = df.rename(columns={text_col: "text"})
        if "title" not in df.columns or "text" not in df.columns:
            err = (
                "Excel files must contain headers named `title` and `text`. ",
                "You can convert the names of existing headers to these with the ",
                "`title_col` and `text_col` parameters.",
            )
            raise LexosException("".join(err))
        return cls.parse_obj({"data": df.to_dict(orient="records")})

    @classmethod
    def parse_json(
        cls: Type["Model"],
        source: str,
        title_field: Optional[str] = None,
        text_field: Optional[str] = None,
        **kwargs: Dict[str, str],
    ) -> "Model":
        """Parse JSON files or strings.

        Args:
            source (str): The JSON string or path to file to parse.
            title_field (Optional[str]): The field name to convert to "title".
            text_field (Optional[str]): The field name to convert to "text".

        Returns:
            Model: A dataset object.
        """
        try:
            with open(source) as f:
                df = pd.read_json(f, **kwargs)
        except Exception:
            df = pd.read_json(io.StringIO(source), **kwargs)
        if title_field:
            df = df.rename(columns={title_field: "title"})
        if text_field:
            df = df.rename(columns={text_field: "text"})
        if "title" not in df.columns or "text" not in df.columns:
            err = (
                "JSON files must contain fields named `title` and `text`. ",
                "You can convert the names of existing fields to these with the ",
                "`title_field` and `text_field` parameters.",
            )
            raise LexosException("".join(err))
        return cls.parse_obj({"data": df.to_dict(orient="records")})

    @classmethod
    def parse_jsonl(
        cls: Type["Model"],
        source: str,
        title_field: Optional[str] = None,
        text_field: Optional[str] = None,
        **kwargs: Dict[str, str],
    ) -> "Model":
        """Parse lineated texts into the Dataset object.

        Args:
            source (str): The string or path to file containing the lines to parse.
            title_field (Optional[str]): The field name to convert to "title".
            text_field (Optional[str]): The field name to convert to "text".

        Returns:
            Model: A dataset object.
        """
        source = cls._get_file_like(source)
        df = pd.read_json(source, lines=True, **kwargs)
        if title_field:
            df = df.rename(columns={title_field: "title"})
        if text_field:
            df = df.rename(columns={text_field: "text"})
        if "title" not in df.columns or "text" not in df.columns:
            err = (
                "JSON and JSONL files must contain fields named `title` and `text`. ",
                "You can convert the names of existing fields to these with the ",
                "`title_field` and `text_field` parameters.",
            )
            raise LexosException("".join(err))
        return cls.parse_obj({"data": df.to_dict(orient="records")})

    @classmethod
    def parse_string(
        cls: Type["Model"],
        source: str,
        labels: Optional[List[str]] = None,
        locations: Optional[List[str]] = None,
    ) -> "Model":
        """Parse lineated texts into the Dataset object.

        Args:
            source (str): The string containing the lines to parse.
            labels (Optional[List[str]]): The names of the texts.
            locations (Optional[List[str]]): The locations of the texts.

        Returns:
            Model: A dataset object.
        """
        if not labels:
            raise LexosException(
                "Please use the `labels` argument to provide a list of labels for each row in your data."
            )
        # Handle files
        try:
            with open(source, "r", encoding="utf-8") as f:
                source = f.readlines()
        # Handle strings
        except Exception:
            source = source.split("\n")
        if len(labels) != len(source):
            raise LexosException(
                f"The number of labels ({len(labels)}) does not match the number of lines ({len(source)}) in your data."
            )
        else:
            data = [{"title": labels[i], "text": line} for i, line in enumerate(source)]
            if locations:
                if len(locations) == len(source):
                    for i, _ in enumerate(data):
                        data[i]["locations"] = locations[i]
                else:
                    raise LexosException(
                        f"The number of locations ({len(locations)}) does not match the number of lines ({len(source)}) in your data."
                    )
            return cls.parse_obj({"data": data})

    @staticmethod
    def _get_file_like(source: str) -> IO[AnyStr]:
        """Read the source into a buffer.

        Args:
            source: str: A path or string containing the source.

        Returns:
            IO[AnyStr]: A file-like object containing the source.
        """
        if utils.is_file(source) or not utils.is_github_dir(source):
            try:
                with open(source, "r", encoding="utf-8") as f:
                    source = f.read()
            except Exception:
                pass
            return io.StringIO(source)
        else:
            raise LexosException(f"{source} is not a valid file path or input string.")

Config ¤

Config class.

Source code in lexos\io\dataset.py
class Config:
    """Config class."""

    arbitrary_types_allowed = True

__getitem__(item) ¤

Get an item from dataset.

Parameters:

Name   Type   Description                     Default
item   int    The index of the item to get.   required

Returns:

Type             Description
Dict[str, str]   The item at the given index.

Source code in lexos\io\dataset.py
def __getitem__(self, item: int) -> Dict[str, str]:
    """Get an item from dataset.

    Args:
        item: The index of the item to get.

    Returns:
        Dict[str, str]: The item at the given index.
    """
    return self.data[item]
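
Since both __getitem__ and __iter__ simply delegate to self.data, a Dataset behaves like a list of record dicts. A small sketch with hypothetical records:

from lexos.io.dataset import Dataset

dataset = Dataset(data=[
    {"title": "Doc1", "text": "First text."},
    {"title": "Doc2", "text": "Second text."},
])
assert dataset[0]["title"] == "Doc1"  # indexing delegates to self.data
for record in dataset:                # iteration yields each record dict
    print(record["title"])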

__iter__() ¤

Iterate over the dataset.

Returns:

Type       Description
Iterable   The dataset.

Source code in lexos\io\dataset.py
def __iter__(self) -> Iterable:
    """Iterate over the dataset.

    Returns:
        Iterable: The dataset.
    """
    for item in iter(self.data):
        yield item

df() ¤

Return the dataframe of the object data.

Returns:

Type           Description
pd.DataFrame   The dataframe of the object data.

Source code in lexos\io\dataset.py
def df(self) -> pd.DataFrame:
    """Return the dataframe of the object data.

    Returns:
        pd.DataFrame: The dataframe of the object data.
    """
    return pd.DataFrame(self.data)
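
Each record is a flat dict, so the frame has one row per text; a sketch with a hypothetical record:

from lexos.io.dataset import Dataset

dataset = Dataset(data=[{"title": "Doc1", "text": "First text."}])
df = dataset.df()
print(df.columns.tolist())  # ['title', 'text']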

locations() property ¤

Return the locations of the object data.

Returns:

Type        Description
List[str]   The locations of the object data.

Source code in lexos\io\dataset.py
@property
def locations(self) -> List[str]:
    """Return the locations of the object data.

    Returns:
        List[str]: The locations of the object data.
    """
    if any("location" in item for item in self.data):
        return [item["locations"] for item in self.data]
    else:
        return None

names() property ¤

Return the names of the object data.

Returns:

Type        Description
List[str]   The names of the object data.

Source code in lexos\io\dataset.py
@property
def names(self) -> List[str]:
    """Return the names of the object data.

    Returns:
        List[str]: The names of the object data.
    """
    return [item["title"] for item in self.data]

parse_csv(source, title_col=None, text_col=None, **kwargs) classmethod ¤

Parse CSV/TSV texts into the Dataset object.

Parameters:

Name        Type            Description                                                  Default
source      str             The string or path to file containing the texts to parse.   required
title_col   Optional[str]   The column name to convert to "title".                      None
text_col    Optional[str]   The column name to convert to "text".                       None

Returns:

Type    Description
Model   A dataset object.

Source code in lexos\io\dataset.py
@classmethod
def parse_csv(
    cls: Type["Model"],
    source: str,
    title_col: Optional[str] = None,
    text_col: Optional[str] = None,
    **kwargs: Dict[str, str],
) -> "Model":
    """Parse CSV/TSV texts into the Dataset object.

    Args:
        source (str): The string or path to file containing the texts to parse.
        title_col (Optional[str]): The column name to convert to "title".
        text_col (Optional[str]): The column name to convert to "text".

    Returns:
        Model: A dataset object.
    """
    source = cls._get_file_like(source)
    df = pd.read_csv(source, **kwargs)
    if title_col:
        df = df.rename(columns={title_col: "title"})
    if text_col:
        df = df.rename(columns={text_col: "text"})
    if "title" not in df.columns or "text" not in df.columns:
        err = (
            "CSV and TSV files must contain headers named `title` and `text`. ",
            "You can convert the names of existing headers to these with the ",
            "`title_col` and `text_col` parameters.",
        )
        raise LexosException("".join(err))
    return cls.parse_obj({"data": df.to_dict(orient="records")})
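
For example, a CSV whose columns are named label and content can be mapped onto the required headers (the path and column names are hypothetical; extra keyword arguments are forwarded to pandas.read_csv):

from lexos.io.dataset import Dataset

dataset = Dataset.parse_csv(
    "corpus.csv",        # hypothetical path; a raw CSV string also works
    title_col="label",   # renamed to "title"
    text_col="content",  # renamed to "text"
)
# For TSV files, pass sep="\t", which is forwarded to pandas.read_csv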

parse_dict(source) classmethod ¤

Alias for cls.parse_obj().

Parameters:

Name     Type   Description          Default
source   dict   The dict to parse.   required

Returns:

Type    Description
Model   A dataset object.

Source code in lexos\io\dataset.py
@classmethod
def parse_dict(cls: Type["Model"], source: dict,) -> "Model":
    """Alias for cls.parse_obj().

    Args:
        source (dict): The dict to parse.

    Returns:
        Model: A dataset object.
    """
    return cls.parse_obj({"data": source})
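
Note that, despite the dict annotation, source becomes the data list of the resulting Dataset, so it is normally a list of record dicts. A minimal sketch with hypothetical records:

from lexos.io.dataset import Dataset

dataset = Dataset.parse_dict([
    {"title": "Doc1", "text": "First text."},
    {"title": "Doc2", "text": "Second text."},
])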

parse_excel(source, title_col=None, text_col=None, **kwargs) classmethod ¤

Parse Excel files into the Dataset object.

Parameters:

Name        Type            Description                                                  Default
source      str             The path to the Excel file containing the texts to parse.   required
title_col   Optional[str]   The column name to convert to "title".                      None
text_col    Optional[str]   The column name to convert to "text".                       None

Returns:

Type    Description
Model   A dataset object.

Source code in lexos\io\dataset.py
@classmethod
def parse_excel(
    cls: Type["Model"],
    source: str,
    title_col: Optional[str] = None,
    text_col: Optional[str] = None,
    **kwargs: Dict[str, str],
) -> "Model":
    """Parse Excel files into the Dataset object.

    Args:
        source (str): The path to the Excel file containing the texts to parse.
        title_col (Optional[str]): The column name to convert to "title".
        text_col (Optional[str]): The column name to convert to "text".

    Returns:
        Model: A dataset object.
    """
    try:
        df = pd.read_excel(source, **kwargs)
    except Exception as e:
        raise LexosException(f"Could not read {source}: {e}")
    if title_col:
        df = df.rename(columns={title_col: "title"})
    if text_col:
        df = df.rename(columns={text_col: "text"})
    if "title" not in df.columns or "text" not in df.columns:
        err = (
            "Excel files must contain headers named `title` and `text`. ",
            "You can convert the names of existing headers to these with the ",
            "`title_col` and `text_col` parameters.",
        )
        raise LexosException("".join(err))
    return cls.parse_obj({"data": df.to_dict(orient="records")})
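
A minimal sketch (the path, column names, and sheet are hypothetical; extra keyword arguments are forwarded to pandas.read_excel):

from lexos.io.dataset import Dataset

dataset = Dataset.parse_excel(
    "corpus.xlsx",   # hypothetical path
    title_col="Label",
    text_col="Content",
    sheet_name=0,    # forwarded to pandas.read_excel
)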

parse_json(source, title_field=None, text_field=None, **kwargs) classmethod ¤

Parse JSON files or strings.

Parameters:

Name          Type            Description                                  Default
source        str             The JSON string or path to file to parse.   required
title_field   Optional[str]   The field name to convert to "title".       None
text_field    Optional[str]   The field name to convert to "text".        None

Returns:

Type    Description
Model   A dataset object.

Source code in lexos\io\dataset.py
@classmethod
def parse_json(
    cls: Type["Model"],
    source: str,
    title_field: Optional[str] = None,
    text_field: Optional[str] = None,
    **kwargs: Dict[str, str],
) -> "Model":
    """Parse JSON files or strings.

    Args:
        source (str): The JSON string or path to file to parse.
        title_field (Optional[str]): The field name to convert to "title".
        text_field (Optional[str]): The field name to convert to "text".

    Returns:
        Model: A dataset object.
    """
    try:
        with open(source) as f:
            df = pd.read_json(f, **kwargs)
    except Exception:
        df = pd.read_json(io.StringIO(source), **kwargs)
    if title_field:
        df = df.rename(columns={title_field: "title"})
    if text_field:
        df = df.rename(columns={text_field: "text"})
    if "title" not in df.columns or "text" not in df.columns:
        err = (
            "JSON files must contain fields named `title` and `text`. ",
            "You can convert the names of existing fields to these with the ",
            "`title_field` and `text_field` parameters.",
        )
        raise LexosException("".join(err))
    return cls.parse_obj({"data": df.to_dict(orient="records")})
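
Because the method first tries to open source as a path and falls back to treating it as a raw string, both forms work. A sketch with hypothetical field names:

from lexos.io.dataset import Dataset

json_string = '[{"label": "Doc1", "body": "First text."}]'
dataset = Dataset.parse_json(
    json_string,          # a path to a .json file also works
    title_field="label",  # renamed to "title"
    text_field="body",    # renamed to "text"
)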

parse_jsonl(source, title_field=None, text_field=None, **kwargs) classmethod ¤

Parse lineated texts into the Dataset object.

Parameters:

Name          Type            Description                                                  Default
source        str             The string or path to file containing the lines to parse.   required
title_field   Optional[str]   The field name to convert to "title".                       None
text_field    Optional[str]   The field name to convert to "text".                        None

Returns:

Type    Description
Model   A dataset object.

Source code in lexos\io\dataset.py
@classmethod
def parse_jsonl(
    cls: Type["Model"],
    source: str,
    title_field: Optional[str] = None,
    text_field: Optional[str] = None,
    **kwargs: Dict[str, str],
) -> "Model":
    """Parse lineated texts into the Dataset object.

    Args:
        source (str): The string or path to file containing the lines to parse.
        title_field (Optional[str]): The field name to convert to "title".
        text_field (Optional[str]): The field name to convert to "text".

    Returns:
        Model: A dataset object.
    """
    source = cls._get_file_like(source)
    df = pd.read_json(source, lines=True, **kwargs)
    if title_field:
        df = df.rename(columns={title_field: "title"})
    if text_field:
        df = df.rename(columns={text_field: "text"})
    if "title" not in df.columns or "text" not in df.columns:
        err = (
            "JSON and JSONL files must contain fields named `title` and `text`. ",
            "You can convert the names of existing fields to these with the ",
            "`title_field` and `text_field` parameters.",
        )
        raise LexosException("".join(err))
    return cls.parse_obj({"data": df.to_dict(orient="records")})
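
A sketch with hypothetical records (one JSON object per line):

from lexos.io.dataset import Dataset

jsonl_string = '{"title": "Doc1", "text": "First text."}\n{"title": "Doc2", "text": "Second text."}'
dataset = Dataset.parse_jsonl(jsonl_string)  # a path to a .jsonl file also works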

parse_string(source, labels=None, locations=None) classmethod ¤

Parse lineated texts into the Dataset object.

Parameters:

Name        Type                  Description                                  Default
source      str                   The string containing the lines to parse.   required
labels      Optional[List[str]]   The names of the texts.                     None
locations   Optional[List[str]]   The locations of the texts.                 None

Returns:

Type    Description
Model   A dataset object.

Source code in lexos\io\dataset.py
@classmethod
def parse_string(
    cls: Type["Model"],
    source: str,
    labels: Optional[List[str]] = None,
    locations: Optional[List[str]] = None,
) -> "Model":
    """Parse lineated texts into the Dataset object.

    Args:
        source (str): The string containing the lines to parse.
        labels (Optional[List[str]]): The names of the texts.
        locations (Optional[List[str]]): The locations of the texts.

    Returns:
        Model: A dataset object.
    """
    if not labels:
        raise LexosException(
            "Please use the `labels` argument to provide a list of labels for each row in your data."
        )
    # Handle files
    try:
        with open(source, "r", encoding="utf-8") as f:
            source = f.readlines()
    # Handle strings
    except Exception:
        source = source.split("\n")
    if len(labels) != len(source):
        raise LexosException(
            f"The number of labels ({len(labels)}) does not match the number of lines ({len(source)}) in your data."
        )
    else:
        data = [{"title": labels[i], "text": line} for i, line in enumerate(source)]
        if locations:
            if len(locations) == len(source):
                for i, _ in enumerate(data):
                    data[i]["locations"] = locations[i]
            else:
                raise LexosException(
                    f"The number of locations ({len(locations)}) does not match the number of lines ({len(source)}) in your data."
                )
        return cls.parse_obj({"data": data})
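
One line becomes one text, and labels must supply a title for each line. A minimal sketch:

from lexos.io.dataset import Dataset

source = "First text.\nSecond text."
dataset = Dataset.parse_string(source, labels=["Doc1", "Doc2"])
# locations, if given, must also supply one entry per line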

texts() property ¤

Return the texts of the object data.

Returns:

Type        Description
List[str]   The texts of the object data.

Source code in lexos\io\dataset.py
@property
def texts(self) -> List[str]:
    """Return the texts of the object data.

    Returns:
        List[str]: The texts of the object data.
    """
    return [item["text"] for item in self.data]

lexos.io.dataset.DatasetLoader ¤

Loads a dataset.

Usage

loader = DatasetLoader(source)
dataset = loader.data

Notes
  • Different types of data may require different keyword parameters. Error messages provide some help in identifying what keywords are required.
  • The class will handle lists of sources, but errors may occur if the sources are of different formats or require different arguments or argument values.
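
Beyond the minimal usage above, format-specific keywords are passed straight through to the underlying parse_* methods; a sketch with hypothetical paths and column names:

from lexos.io.dataset import DatasetLoader

# A CSV with custom headers
loader = DatasetLoader("corpus.csv", title_col="label", text_col="content")

# A plain-text file needs one label per line
loader = DatasetLoader("poems.txt", labels=["Poem1", "Poem2", "Poem3"])

dataset = loader.data  # the loaded Dataset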
Source code in lexos\io\dataset.py
class DatasetLoader:
    """Loads a dataset.

    Usage:
        loader = DatasetLoader(source)
        dataset = loader.data

    Notes:
      - Different types of data may require different keyword parameters. Error messages
        provide some help in identifying what keywords are required.
      - The class will handle lists of sources, but errors may occur if the sources are
        of different formats or require different arguments or argument values.
    """

    def __init__(
        self,
        source: Any,
        labels: List[str] = None,
        locations: Optional[List[str]] = None,
        title_col: Optional[str] = None,
        text_col: Optional[str] = None,
        title_field: Optional[str] = None,
        text_field: Optional[str] = None,
        location_col: Optional[str] = None,
        location_field: Optional[str] = None,
        **kwargs: Dict[str, str],
    ) -> Union[Dataset, List[Dataset]]:
        """Initialise the loader.

        Args:
            source (Any): The source type to detect.
            labels (List[str]): The labels to use.
            locations (Optional[List[str]]): The locations of the texts.
            title_col (str): The name of the column containing the titles.
            text_col (str): The name of the column containing the texts.
            title_field (str): The name of the field containing the titles.
            text_field (str): The name of the field containing the texts.
            location_col (str): The name of the column containing the locations.
            location_field (str): The name of the field containing the locations.

        Returns:
            Dataset: A Dataset or list of Dataset objects.
        """
        if isinstance(source, list):
            new_data = [
                self.load(
                    item,
                    labels,
                    locations,
                    title_col,
                    text_col,
                    title_field,
                    text_field,
                    location_col,
                    location_field,
                    **kwargs,
                ).data
                for item in source
            ]
            # Build a Dataset with the flattened list of dicts
            self.data = Dataset(data=list(itertools.chain(*new_data)))
        else:
            self.data = self.load(
                source,
                labels,
                locations,
                title_col,
                text_col,
                title_field,
                text_field,
                location_col,
                location_field,
                **kwargs,
            )

    @property
    def locations(self) -> List[str]:
        """Return the locations of the object data.

        Returns:
            List[str]: The locations of the object data.
        """
        if any("location" in item for item in self.data):
            return [item["locations"] for item in self.data]
        else:
            return None

    @property
    def names(self) -> List[str]:
        """Return the names of the object data.

        Returns:
            List[str]: The names of the object data.
        """
        return [item["title"] for item in self.data]

    @property
    def texts(self) -> List[str]:
        """Return the texts of the object data.

        Returns:
            List[str]: The texts of the object data.
        """
        return [item["text"] for item in self.data]

    def __iter__(self) -> Iterable:
        """Iterate over the dataset.

        Returns:
            Iterable: The dataset.
        """
        for item in iter(self.data):
            yield item

    def __getitem__(self, item: int) -> Dict[str, str]:
        """Get an item from dataset.

        Args:
            item: The index of the item to get.

        Returns:
            Dict[str, str]: The item at the given index.
        """
        return self.data[item]

    def load(
        self,
        source: Any,
        labels: List[str] = None,
        locations: Optional[List[str]] = None,
        title_col: Optional[str] = None,
        text_col: Optional[str] = None,
        title_field: Optional[str] = None,
        text_field: Optional[str] = None,
        location_col: Optional[str] = None,
        location_field: Optional[str] = None,
        **kwargs: Dict[str, str],
    ) -> Dataset:
        """Load the given file.

        Args:
            source (Any): The source of the data to load.
            labels (List[str]): The labels to use.
            locations (Optional[List[str]]): The locations of the texts.
            title_col (str): The name of the column containing the titles.
            text_col (str): The name of the column containing the texts.
            title_field (str): The name of the field containing the titles.
            text_field (str): The name of the field containing the texts.
            location_col (str): The name of the column containing the locations.
            location_field (str): The name of the field containing the locations.

        Returns:
            Dataset: A Dataset object.
        """
        if not utils.is_dir(source) and not utils.is_github_dir(
            source
        ):  # and not utils.is_url(source):
            ext = Path(source).suffix
            if ext == "" or ext == ".txt":
                return Dataset.parse_string(source, labels, locations)
            elif ext == ".csv":
                return Dataset.parse_csv(source, title_col, text_col, **kwargs)
            elif ext == ".tsv":
                return Dataset.parse_csv(source, title_col, text_col, **kwargs)
            elif ext == ".xlsx":
                return Dataset.parse_excel(source, title_col, text_col, **kwargs)
            elif ext == ".json":
                return Dataset.parse_json(source, title_field, text_field, **kwargs)
            elif ext == ".jsonl":
                return Dataset.parse_jsonl(source, title_field, text_field, **kwargs)
            elif ext == ".zip":
                return self._load_zip(
                    source,
                    labels,
                    locations,
                    title_col,
                    text_col,
                    title_field,
                    text_field,
                    location_col,
                    location_field,
                    **kwargs,
                )
        elif utils.is_dir(source) or utils.is_github_dir(source):
            new_data = []
            if utils.is_github_dir(source):
                paths = utils.get_github_raw_paths(source)
            else:
                paths = utils.get_paths(source)
            for path in paths:
                new_data.append(
                    self.load(
                        path,
                        labels,
                        locations,
                        title_col,
                        text_col,
                        title_field,
                        text_field,
                        location_col,
                        location_field,
                        **kwargs,
                    )
                )
            # Return a Dataset with the flattened list of dicts
            return Dataset(data=list(itertools.chain(*new_data)))
        else:
            raise LexosException(
                f"{source} is an unknown source type or requires different arguments than the other sources in the directory."
            )

    def _load_zip(
        self,
        file_path: str,
        labels: List[str] = None,
        locations: Optional[List[str]] = None,
        title_col: Optional[str] = None,
        text_col: Optional[str] = None,
        title_field: Optional[str] = None,
        text_field: Optional[str] = None,
        location_col: Optional[str] = None,
        location_field: Optional[str] = None,
        **kwargs: Dict[str, str],
    ) -> Dataset:
        """
        Load a zip file.

        Args:
            file_path (str): The path to the file to load.
            labels (List[str]): The labels to use.
            locations (Optional[List[str]]): The locations of the texts.
            title_col (str): The name of the column containing the titles.
            text_col (str): The name of the column containing the texts.
            title_field (str): The name of the field containing the titles.
            text_field (str): The name of the field containing the texts.
            location_col (str): The name of the column containing the locations.
            location_field (str): The name of the field containing the locations.

        Returns:
            Dataset: A Dataset object.
        """
        new_data = []
        with open(file_path, "rb") as f:
            with zipfile.ZipFile(f) as zip_file:
                with tempfile.TemporaryDirectory() as tempdir:
                    zip_file.extractall(tempdir)
                    for tmp_path in Path(tempdir).glob("**/*"):
                        if (
                            tmp_path.is_file()
                            and tmp_path.suffix != ""
                            and "__MACOSX" not in tmp_path.parts
                            and tmp_path.name.lower() != ".ds_store"
                        ):
                            new_data.append(
                                self.load(
                                    tmp_path,
                                    labels,
                                    locations,
                                    title_col,
                                    text_col,
                                    title_field,
                                    text_field,
                                    location_col,
                                    location_field,
                                    **kwargs,
                                ).data
                            )
        # Return a Dataset with the flattened list of dicts
        return Dataset(data=list(itertools.chain(*new_data)))

__getitem__(item) ¤

Get an item from dataset.

Parameters:

Name   Type   Description                     Default
item   int    The index of the item to get.   required

Returns:

Type             Description
Dict[str, str]   The item at the given index.

Source code in lexos\io\dataset.py
def __getitem__(self, item: int) -> Dict[str, str]:
    """Get an item from dataset.

    Args:
        item: The index of the item to get.

    Returns:
        Dict[str, str]: The item at the given index.
    """
    return self.data[item]

__init__(source, labels=None, locations=None, title_col=None, text_col=None, title_field=None, text_field=None, location_col=None, location_field=None, **kwargs) ¤

Initialise the loader.

Parameters:

Name             Type                  Description                                         Default
source           Any                   The source type to detect.                         required
labels           List[str]             The labels to use.                                 None
locations        Optional[List[str]]   The locations of the texts.                        None
title_col        str                   The name of the column containing the titles.      None
text_col         str                   The name of the column containing the texts.       None
title_field      str                   The name of the field containing the titles.       None
text_field       str                   The name of the field containing the texts.        None
location_col     str                   The name of the column containing the locations.   None
location_field   str                   The name of the field containing the locations.    None

Returns:

Type                            Description
Union[Dataset, List[Dataset]]   A Dataset or list of Dataset objects.

Source code in lexos\io\dataset.py
def __init__(
    self,
    source: Any,
    labels: List[str] = None,
    locations: Optional[List[str]] = None,
    title_col: Optional[str] = None,
    text_col: Optional[str] = None,
    title_field: Optional[str] = None,
    text_field: Optional[str] = None,
    location_col: Optional[str] = None,
    location_field: Optional[str] = None,
    **kwargs: Dict[str, str],
) -> Union[Dataset, List[Dataset]]:
    """Initialise the loader.

    Args:
        source (Any): The source type to detect.
        labels (List[str]): The labels to use.
        locations (Optional[List[str]]): The locations of the texts.
        title_col (str): The name of the column containing the titles.
        text_col (str): The name of the column containing the texts.
        title_field (str): The name of the field containing the titles.
        text_field (str): The name of the field containing the texts.
        location_col (str): The name of the column containing the locations.
        location_field (str): The name of the field containing the locations.

    Returns:
        Dataset: A Dataset or list of Dataset objects.
    """
    if isinstance(source, list):
        new_data = [
            self.load(
                item,
                labels,
                locations,
                title_col,
                text_col,
                title_field,
                text_field,
                location_col,
                location_field,
                **kwargs,
            ).data
            for item in source
        ]
        # Build a Dataset with the flattened list of dicts
        self.data = Dataset(data=list(itertools.chain(*new_data)))
    else:
        self.data = self.load(
            source,
            labels,
            locations,
            title_col,
            text_col,
            title_field,
            text_field,
            location_col,
            location_field,
            **kwargs,
        )

__iter__() ¤

Iterate over the dataset.

Returns:

Type       Description
Iterable   The dataset.

Source code in lexos\io\dataset.py
def __iter__(self) -> Iterable:
    """Iterate over the dataset.

    Returns:
        Iterable: The dataset.
    """
    for item in iter(self.data):
        yield item

load(source, labels=None, locations=None, title_col=None, text_col=None, title_field=None, text_field=None, location_col=None, location_field=None, **kwargs) ¤

Load the given file.

Parameters:

Name             Type                  Description                                         Default
source           Any                   The source of the data to load.                    required
labels           List[str]             The labels to use.                                 None
locations        Optional[List[str]]   The locations of the texts.                        None
title_col        str                   The name of the column containing the titles.      None
text_col         str                   The name of the column containing the texts.       None
title_field      str                   The name of the field containing the titles.       None
text_field       str                   The name of the field containing the texts.        None
location_col     str                   The name of the column containing the locations.   None
location_field   str                   The name of the field containing the locations.    None

Returns:

Type      Description
Dataset   A Dataset object.

Source code in lexos\io\dataset.py
def load(
    self,
    source: Any,
    labels: List[str] = None,
    locations: Optional[List[str]] = None,
    title_col: Optional[str] = None,
    text_col: Optional[str] = None,
    title_field: Optional[str] = None,
    text_field: Optional[str] = None,
    location_col: Optional[str] = None,
    location_field: Optional[str] = None,
    **kwargs: Dict[str, str],
) -> Dataset:
    """Load the given file.

    Args:
        source (Any): The source of the data to load.
        labels (List[str]): The labels to use.
        locations (Optional[List[str]]): The locations of the texts.
        title_col (str): The name of the column containing the titles.
        text_col (str): The name of the column containing the texts.
        title_field (str): The name of the field containing the titles.
        text_field (str): The name of the field containing the texts.
        location_col (str): The name of the column containing the locations.
        location_field (str): The name of the field containing the locations.

    Returns:
        Dataset: A Dataset object.
    """
    if not utils.is_dir(source) and not utils.is_github_dir(
        source
    ):  # and not utils.is_url(source):
        ext = Path(source).suffix
        if ext == "" or ext == ".txt":
            return Dataset.parse_string(source, labels, locations)
        elif ext == ".csv":
            return Dataset.parse_csv(source, title_col, text_col, **kwargs)
        elif ext == ".tsv":
            return Dataset.parse_csv(source, title_col, text_col, **kwargs)
        elif ext == ".xlsx":
            return Dataset.parse_excel(source, title_col, text_col, **kwargs)
        elif ext == ".json":
            return Dataset.parse_json(source, title_field, text_field, **kwargs)
        elif ext == ".jsonl":
            return Dataset.parse_jsonl(source, title_field, text_field, **kwargs)
        elif ext == ".zip":
            return self._load_zip(
                source,
                labels,
                locations,
                title_col,
                text_col,
                title_field,
                text_field,
                location_col,
                location_field,
                **kwargs,
            )
    elif utils.is_dir(source) or utils.is_github_dir(source):
        new_data = []
        if utils.is_github_dir(source):
            paths = utils.get_github_raw_paths(source)
        else:
            paths = utils.get_paths(source)
        for path in paths:
            new_data.append(
                self.load(
                    path,
                    labels,
                    locations,
                    title_col,
                    text_col,
                    title_field,
                    text_field,
                    location_col,
                    location_field,
                    **kwargs,
                )
            )
        # Return a Dataset with the flattened list of dicts
        return Dataset(data=list(itertools.chain(*new_data)))
    else:
        raise LexosException(
            f"{source} is an unknown source type or requires different arguments than the other sources in the directory."
        )
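
Because load dispatches on the file extension, it can also be called directly to pull in further sources of the same shape; a sketch with hypothetical paths:

loader = DatasetLoader("corpus.csv", title_col="label", text_col="content")
more = loader.load("more_texts.jsonl", title_field="label", text_field="body")
combined = loader.data.data + more.data  # both are Dataset objects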

locations() property ¤

Return the locations of the object data.

Returns:

Type        Description
List[str]   The locations of the object data.

Source code in lexos\io\dataset.py
@property
def locations(self) -> List[str]:
    """Return the locations of the object data.

    Returns:
        List[str]: The locations of the object data.
    """
    if any("location" in item for item in self.data):
        return [item["locations"] for item in self.data]
    else:
        return None

names() property ¤

Return the names of the object data.

Returns:

Type        Description
List[str]   The names of the object data.

Source code in lexos\io\dataset.py
@property
def names(self) -> List[str]:
    """Return the names of the object data.

    Returns:
        List[str]: The names of the object data.
    """
    return [item["title"] for item in self.data]

texts() property ¤

Return the texts of the object data.

Returns:

Type        Description
List[str]   The texts of the object data.

Source code in lexos\io\dataset.py
@property
def texts(self) -> List[str]:
    """Return the texts of the object data.

    Returns:
        List[str]: The texts of the object data.
    """
    return [item["text"] for item in self.data]