LexosDoc

A wrapper class for a spaCy doc which allows for extra methods.

A convenience that allows you to use Doc extensions without the underscore prefix.

Note

There is probably no need for this class. We can just keep a library of functions in a file called tokenizer.py and import them. If certain functions get used commonly, they can be turned into Doc extensions.
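
A minimal usage sketch, assuming spaCy and the en_core_web_sm pipeline are installed (any pipeline and any sample text will do):

import spacy
from lexos.tokenizer.lexosdoc import LexosDoc

# Build a spaCy Doc with an installed pipeline (en_core_web_sm is assumed here).
nlp = spacy.load("en_core_web_sm")
doc = nlp("The quick brown fox jumps over the lazy dog.")

# Wrap the Doc. Anything other than a spacy.tokens.doc.Doc raises LexosException.
lexos_doc = LexosDoc(doc)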

lexos.tokenizer.lexosdoc.LexosDoc

Source code in lexos\tokenizer\lexosdoc.py
class LexosDoc:
    """A wrapper class for a spaCy doc which allows for extra methods.

    A convenience that allows you to use Doc extensions without the
    underscore prefix.

    Note: There is probably no need for this class. We can just keep a
    library of functions in a file called `tokenizer.py` and import them.
    If certain functions get used commonly, they can be turned into Doc
    extensions.
    """

    def __init__(self, doc: object):
        """Initialize a LexosDoc object."""
        if isinstance(doc, spacy.tokens.doc.Doc):
            self.doc = doc
        else:
            raise LexosException("The input must be a spaCy doc.")

    def get_term_counts(
        self,
        limit: int = None,
        start: Any = 0,
        end: Any = None,
        filters: List[Union[Dict[str, str], str]] = None,
        regex: bool = False,
        normalize: bool = False,
        normalize_with_filters: bool = False,
        as_df=False,
    ) -> Union[List, pd.DataFrame]:
        """Get a list of word counts for each token in the doc.

        Args:
            self (object): A spaCy doc.
            limit (int): The maximum number of tokens to count.
            start (Any): The index of the first token to count.
            end (Any): The index of the last token to count after limit is applied.
            filters (List[Union[Dict[str, str], str]]): A list of Doc attributes to ignore.
            regex (bool): Whether to match the dictionary value using regex.
            normalize (bool): Whether to return raw counts or relative frequencies.
            normalize_with_filters (bool): Whether to normalize based on the number
                of tokens after filters are applied.
            as_df (bool): Whether to return a pandas dataframe.

        Returns:
            Union[List, pd.DataFrame]: A list of word count tuples for
            each token in the doc. Alternatively, a pandas dataframe.
        """
        tokens = []
        bool_filters = []
        dict_filters = {}
        if filters:
            self._validate_filters(filters)
            for filter in filters:
                if isinstance(filter, dict):
                    dict_filters[list(filter.keys())[0]] = list(filter.values())[0]
                else:
                    bool_filters.append(filter)
        tokens = [
            token.text
            for token in self.doc
            if self._bool_filter(token, bool_filters)
            and self._dict_filter(token, dict_filters, regex=regex)
        ]
        term_counts = Counter(tokens).most_common(limit)[start:end]
        columns = ["term", "count"]
        if normalize_with_filters:
            normalize = True
            num_tokens = len(tokens)
        else:
            num_tokens = len(self.doc)
        if normalize:
            term_counts = [(x[0], x[1] / num_tokens) for x in term_counts]
            columns[1] = "frequency"
        if as_df:
            return self._dataframe(term_counts, columns)
        else:
            return term_counts

    def get_tokens(self):
        """Return a list of tokens in the doc."""
        return [token.text for token in self.doc]

    def get_token_attrs(self):
        """Get a list of attributes for each token in the doc.

        Returns a dict with "spacy_attributes" and "extensions".

        Note: This function relies on sampling the first token in a doc
        to compile the list of attributes. It does not check for consistency.
        Currently, it is up to the user to reconcile inconsistencies between
        docs.
        """
        sample = self.doc[0]
        attrs = sorted([x for x in dir(sample) if not x.startswith("__") and x != "_"])
        exts = sorted(
            [f"_{x}" for x in dir(sample._) if x not in ["get", "has", "set"]]
        )
        return {"spacy_attributes": attrs, "extensions": exts}

    def to_dataframe(
        self, cols: List[str] = ["text"], show_ranges: bool = True
    ) -> pd.DataFrame:
        """Get a pandas dataframe of the doc attributes.

        Args:
            cols: A list of columns to include in the dataframe.
            show_ranges: Whether to include the token start and end positions in the dataframe.

        Returns a pandas dataframe of the doc attributes.

        Note: It is a good idea to call `LexosDoc.get_token_attrs()` first
        to check which attributes are available for the doc.
        """
        rows = []
        for i, token in enumerate(self.doc):
            t = []
            for col in cols:
                t.append(getattr(token, col))
            if show_ranges:
                ranges = self.doc.to_json()["tokens"][i]
                t.append(ranges["start"])
                t.append(ranges["end"])
            rows.append(t)
        if show_ranges:
            cols = cols + ["start", "end"]
        return self._dataframe(rows, cols)

    def _bool_filter(self, token: object, filters: List[str]) -> bool:
        """Filter a token based on a list of boolean filters.

        Args:
            token (object): A spaCy token.
            filters (List[str]): A list of boolean filters (the names of spaCy token attributes).

        Returns:
            bool: Whether the token passes the filters.
        """
        if filters:
            for filter in filters:
                # Exclude the token if any of the boolean attributes is True.
                if getattr(token, filter):
                    return False
        return True

    def _dataframe(self, rows: List[dict], columns: List[str]) -> pd.DataFrame:
        """Return a pandas dataframe of the doc attributes.

        Args:
            rows (List[dict]): A list of dicts with the doc attributes.
            columns (List[str]): A list of column names.

        Returns:
            pd.DataFrame: A pandas dataframe of the doc attributes.

        Raises:
            LexosException: If a pandas exception occurs.
        """
        try:
            return pd.DataFrame(rows, columns=columns)
        except Exception as e:
            raise LexosException(e)

    def _dict_filter(
        self, token: object, filters: Dict[str, str], regex: bool = False
    ) -> bool:
        """Filter a token based on a dictionary of filters.

        Args:
            token (object): A spaCy token.
            filters (Dict[str, str]): A dictionary with spaCy token attributes as keys
                and the values to filter out as values.
            regex (bool): Whether to match the dictionary value using regex.

        Returns:
            bool: Whether the token passes the filters.
        """
        if not isinstance(token, spacy.tokens.Token):
            raise LexosException("The input must be a spaCy token.")
        if not isinstance(regex, bool):
            raise LexosException("The regex flag must be a boolean.")
        if filters:
            for filter, value in filters.items():
                # Exclude the token if the attribute matches the filter value.
                if (
                    regex
                    and re.search(re.compile(value), getattr(token, filter)) is not None
                ):
                    return False
                if getattr(token, filter) == value:
                    return False
        return True

    def _validate_filters(self, filters: List[str]) -> None:
        """Ensure that filters are in the correct format.

        Args:
            filters (Union[List[Dict[str, str]], List[str]]): A list of filter dictionaries
                with spaCy token attributes as keys, or a list of boolean filters (the names
                of spaCy token attributes).

        Returns:
            None

        Raises:
            LexosException: If the format for the filter is not correct.
        """
        message = "The filter must be a list of filter dictionaries with keys or a list of boolean filters (the names of spaCy token attributes)"
        if not isinstance(filters, list) or any(
            not isinstance(x, (dict, str)) for x in filters
        ):
            raise LexosException(message)

__init__(doc)

Initialize a LexosDoc object.

Source code in lexos\tokenizer\lexosdoc.py
def __init__(self, doc: object):
    """Initialize a LexosDoc object."""
    if isinstance(doc, spacy.tokens.doc.Doc):
        self.doc = doc
    else:
        raise LexosException("The input must be a spaCy doc.")

get_term_counts(limit=None, start=0, end=None, filters=None, regex=False, normalize=False, normalize_with_filters=False, as_df=False)

Get a list of word counts for each token in the doc.

Parameters:

    self (object): A spaCy doc. Required.
    limit (int): The maximum number of tokens to count. Default: None
    start (Any): The index of the first token to count. Default: 0
    end (Any): The index of the last token to count after limit is applied. Default: None
    filters (List[Union[Dict[str, str], str]]): A list of Doc attributes to ignore. Default: None
    regex (bool): Whether to match the dictionary value using regex. Default: False
    normalize (bool): Whether to return raw counts or relative frequencies. Default: False
    normalize_with_filters (bool): Whether to normalize based on the number of tokens after filters are applied. Default: False
    as_df (bool): Whether to return a pandas dataframe. Default: False

Returns:

    Union[List, pd.DataFrame]: A list of word count tuples for each token in the doc. Alternatively, a pandas dataframe.

Source code in lexos\tokenizer\lexosdoc.py
def get_term_counts(
    self,
    limit: int = None,
    start: Any = 0,
    end: Any = None,
    filters: List[Union[Dict[str, str], str]] = None,
    regex: bool = False,
    normalize: bool = False,
    normalize_with_filters: bool = False,
    as_df=False,
) -> Union[List, pd.DataFrame]:
    """Get a list of word counts for each token in the doc.

    Args:
        self (object): A spaCy doc.
        limit (int): The maximum number of tokens to count.
        start (Any): The index of the first token to count.
        end (Any): The index of the last token to count after limit is applied.
        filters (List[Union[Dict[str, str], str]]): A list of Doc attributes to ignore.
        regex (bool): Whether to match the dictionary value using regex.
        normalize (bool): Whether to return raw counts or relative frequencies.
        normalize_with_filters (bool): Whether to normalize based on the number
            of tokens after filters are applied.
        as_df (bool): Whether to return a pandas dataframe.

    Returns:
        Union[List, pd.DataFrame]: A list of word count tuples for
        each token in the doc. Alternatively, a pandas dataframe.
    """
    tokens = []
    bool_filters = []
    dict_filters = {}
    if filters:
        self._validate_filters(filters)
        for filter in filters:
            if isinstance(filter, dict):
                dict_filters[list(filter.keys())[0]] = list(filter.values())[0]
            else:
                bool_filters.append(filter)
    tokens = [
        token.text
        for token in self.doc
        if self._bool_filter(token, bool_filters)
        and self._dict_filter(token, dict_filters, regex=regex)
    ]
    term_counts = Counter(tokens).most_common(limit)[start:end]
    columns = ["term", "count"]
    if normalize_with_filters:
        normalize = True
        num_tokens = len(tokens)
    else:
        num_tokens = len(self.doc)
    if normalize:
        term_counts = [(x[0], x[1] / num_tokens) for x in term_counts]
        columns[1] = "frequency"
    if as_df:
        return self._dataframe(term_counts, columns)
    else:
        return term_counts
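
A usage sketch continuing the example above. The boolean filters is_punct and is_stop and the dictionary filter key pos_ are standard spaCy token attributes; the exact counts depend on the text and pipeline.

# Raw counts of the five most common terms.
print(lexos_doc.get_term_counts(limit=5))

# Relative frequencies with punctuation, stop words, and numerals filtered out,
# normalized by the number of tokens that survive the filters, and returned as a
# pandas dataframe with "term" and "frequency" columns.
df = lexos_doc.get_term_counts(
    filters=["is_punct", "is_stop", {"pos_": "NUM"}],
    normalize_with_filters=True,
    as_df=True,
)
print(df)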

get_token_attrs()

Get a list of attributes for each token in the doc.

Returns a dict with "spacy_attributes" and "extensions".

Note: This function relies on sampling the first token in a doc to compile the list of attributes. It does not check for consistency. Currently, it is up to the user to reconcile inconsistencies between docs.

Source code in lexos\tokenizer\lexosdoc.py
def get_token_attrs(self):
    """Get a list of attributes for each token in the doc.

    Returns a dict with "spacy_attributes" and "extensions".

    Note: This function relies on sampling the first token in a doc
    to compile the list of attributes. It does not check for consistency.
    Currently, it is up to the user to reconcile inconsistencies between
    docs.
    """
    sample = self.doc[0]
    attrs = sorted([x for x in dir(sample) if not x.startswith("__") and x != "_"])
    exts = sorted(
        [f"_{x}" for x in dir(sample._) if x not in ["get", "has", "set"]]
    )
    return {"spacy_attributes": attrs, "extensions": exts}

get_tokens()

Return a list of tokens in the doc.

Source code in lexos\tokenizer\lexosdoc.py
def get_tokens(self):
    """Return a list of tokens in the doc."""
    return [token.text for token in self.doc]
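
For example, with the doc created above:

print(lexos_doc.get_tokens())
# ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.']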

to_dataframe(cols=['text'], show_ranges=True)

Get a pandas dataframe of the doc attributes.

Parameters:

    cols (List[str]): A list of columns to include in the dataframe. Default: ['text']
    show_ranges (bool): Whether to include the token start and end positions in the dataframe. Default: True

Returns a pandas dataframe of the doc attributes.

Note: It is a good idea to call LexosDoc.get_token_attrs() first to check which attributes are available for the doc.

Source code in lexos\tokenizer\lexosdoc.py
def to_dataframe(
    self, cols: List[str] = ["text"], show_ranges: bool = True
) -> pd.DataFrame:
    """Get a pandas dataframe of the doc attributes.

    Args:
        cols: A list of columns to include in the dataframe.
        show_ranges: Whether to include the token start and end positions in the dataframe.

    Returns a pandas dataframe of the doc attributes.

    Note: It is a good idea to call `LexosDoc.get_token_attrs()` first
    to check which attributes are available for the doc.
    """
    rows = []
    for i, token in enumerate(self.doc):
        t = []
        for col in cols:
            t.append(getattr(token, col))
        if show_ranges:
            ranges = self.doc.to_json()["tokens"][i]
            t.append(ranges["start"])
            t.append(ranges["end"])
        rows.append(t)
    if show_ranges:
        cols = cols + ["start", "end"]
    return self._dataframe(rows, cols)
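
A usage sketch continuing the example above. The columns pos_ and lemma_ are standard spaCy token attributes and require a pipeline with a tagger and lemmatizer, which en_core_web_sm provides.

# Check which attributes are available before choosing columns.
lexos_doc.get_token_attrs()

# One row per token; show_ranges=True adds start and end character offsets.
df = lexos_doc.to_dataframe(cols=["text", "pos_", "lemma_"], show_ranges=True)
print(df.head())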