Skip to content

DTM¤

The DTM module contains a basic DTM class.

lexos.dtm.DTM ¤

Class for a document-term matrix.

Source code in lexos\dtm\__init__.py
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
class DTM:
    """Class for a document-term matrix.

    The matrix is produced by a textacy `Vectorizer` (see `set_vectorizer`)
    and can be exported as pandas dataframes with raw values, relative
    frequencies, or summary statistics.
    """

    def __init__(
        self,
        docs: List[Union[List[str], spacy.tokens.doc.Doc]],
        labels: List[str] = None,
    ) -> None:
        """Initialise the DTM.

        Args:
            docs (List[Union[List[str], spacy.tokens.doc.Doc]]): A list of spaCy docs or a list of token lists.
            labels (List[str]): A list of labels for the documents. If omitted
                or empty, the labels "doc0", "doc1", ... are generated.

        Returns:
            None
        """
        self.docs = docs
        self.table = None
        if not labels:
            self.labels = [f"doc{i}" for i in range(len(docs))]
        else:
            self.labels = labels
        self.vectorizer_settings = {}
        self.vectorizer = self.set_vectorizer(new=True)
        self.build()

    def build(self):
        """Build a new DTM matrix based on the current vectorizer."""
        doc_token_lists = DtmData(docs=self.docs).docs
        self.matrix = self.vectorizer.fit_transform(doc_token_lists)
        # Require explicit calling of get_table after each build to ensure table is up to date.
        # Ensures that the two processes can be kept separate if desired.
        self.table = None

    def get_table(self, transpose: bool = False) -> pd.DataFrame:
        """Get the document-term matrix as a pandas dataframe.

        The untransposed table (one row per term, a "terms" column followed by
        one column per document label) is cached on `self.table` until the
        next call to `build()`.

        Args:
            transpose (bool): If True, terms are columns and docs are rows.

        Returns:
            pd.DataFrame: The cached table, or a transposed copy of it when
            `transpose` is True.
        """
        if self.table is None:
            vocabulary = self.vectorizer.vocabulary_terms
            rows = []
            for term in self.vectorizer.terms_list:
                # Column of the matrix holding this term's value in every document.
                column = self.matrix[0:, vocabulary[term]].toarray()
                rows.append([term] + [item[0] for item in column])
            self.table = pd.DataFrame(rows, columns=["terms"] + self.labels)
        if transpose:
            # Never cache the transposed frame: the other table methods rely on
            # the cached table having a "terms" column.
            return self.table.rename({"terms": "docs"}, axis=1).T
        return self.table

    def get_freq_table(
        self, rounding: int = 3, as_percent: bool = False
    ) -> pd.DataFrame:
        """Get a table of relative frequencies.

        Each value is divided by the sum of its row, i.e. by the term's total
        across all documents.

        Args:
            rounding (int): The number of digits to round floats.
            as_percent (bool): Whether to return the frequencies as percentages.

        Returns:
            pd.DataFrame: A dataframe with the relative frequencies.
        """
        df = self.get_table().copy()
        df.set_index("terms", inplace=True)
        freq = df.div(df.sum(axis=1), axis=0)
        if as_percent:
            freq = freq * 100
        # Round the resulting frequencies (not the divisor).
        return freq.round(rounding).reset_index()

    def get_stats_table(
        self, stats: Union[List[str], str] = "sum", rounding: int = 3
    ) -> pd.DataFrame:
        """Get a table with the sum, mean, and/or median calculated for each row.

        Args:
            stats (Union[List[str], str]): One or more of "sum", "mean", and/or "median".
            rounding (int): The number of digits to round floats.

        Returns:
            pd.DataFrame: A copy of the table with one extra column per
            requested statistic.
        """
        df = self.get_table()
        tmp = df.copy()
        if "sum" in stats:
            tmp["sum"] = df.sum(axis=1, numeric_only=True)
        if "mean" in stats:
            tmp["mean"] = df.mean(axis=1, numeric_only=True).round(rounding)
        if "median" in stats:
            median = df.median(axis=1, numeric_only=True)
            tmp["median"] = median.round(rounding)
        return tmp

    def get_terms(self):
        """Get the vectorizer's vocabulary.

        Returns:
            The vectorizer's `vocabulary_terms` object, unchanged. `get_table`
            indexes it by term to obtain the term's matrix column, so it is a
            term-to-column mapping rather than a sorted list.
        """
        return self.vectorizer.vocabulary_terms

    def get_term_counts(
        self,
        sort_by: Union[list, List[str]] = ["terms", "sum"],
        ascending: Union[bool, List[bool]] = True,
        alg=SORTING_ALGORITHM,
    ) -> List[tuple]:
        """Get a list of term counts with optional sorting.

        Args:
            sort_by (Union[list, List[str]]): The column(s) to sort by in order of preference.
            ascending (Union[bool, List[bool]]): Whether to sort values in ascending or descending order.
            alg: The sorting algorithm. Anything other than the default is
                validated as a natsort `ns` member.

        Returns:
            List[tuple]: A list of tuples containing terms and counts.

        Raises:
            LexosException: If `alg` is not the default and is not a valid
                natsort algorithm.
        """
        if alg != SORTING_ALGORITHM:
            self._validate_sorting_algorithm(alg)
        sort_key = alg
        if not callable(sort_key):
            # pandas requires `key` to be callable; a bare natsort `ns` flag is
            # not, so turn it into a key function first.
            from natsort import natsort_keygen

            sort_key = natsort_keygen(alg=alg)
        df = self.get_stats_table("sum").sort_values(
            by=sort_by, ascending=ascending, key=sort_key
        )
        terms = df["terms"].values.tolist()
        sums = df["sum"].values.tolist()
        return list(zip(terms, sums))

    def least_frequent(self, max_n_terms: int = 100, start: int = 0) -> pd.DataFrame:
        """Get the least frequent terms in the DTM.

        Args:
            max_n_terms (int): The number of terms to return.
            start (int): The start index in the table sorted by ascending sum.

        Returns:
            pd.DataFrame: The reduced DTM table.

        Note: This function should not be used if `min_df` or `max_df` is set in
        the vectorizer because the table will be cut twice.
        """
        df = self.get_stats_table("sum").sort_values(by="sum", ascending=True)
        # Take rows from the low end of the ascending sort.
        return df.iloc[start : start + max_n_terms]

    def most_frequent(self, max_n_terms: int = 100, start: int = 0) -> pd.DataFrame:
        """Get the most frequent terms in the DTM.

        Args:
            max_n_terms (int): The number of terms to return.
            start (int): The start index in the table sorted by descending sum.

        Returns:
            pd.DataFrame: The reduced DTM table.

        Note: This function should not be used if `min_df` or `max_df` is set in
        the vectorizer because the table will be cut twice.
        """
        df = self.get_stats_table("sum").sort_values(by="sum", ascending=False)
        # The window is `max_n_terms` rows long regardless of where it starts.
        return df.iloc[start : start + max_n_terms]

    def set_vectorizer(
        self,
        tf_type: str = "linear",
        idf_type: str = None,
        dl_type: str = None,
        norm: Union[list, str] = None,
        min_df: Union[float, int] = 1,
        max_df: Union[float, int] = 1.0,
        max_n_terms: int = None,
        vocabulary_terms: Union[list, str] = None,
        new: bool = False,
    ):
        """Create a textacy vectorizer and record its settings.

        All arguments except `new` are passed unchanged to textacy's
        `Vectorizer` and stored in `self.vectorizer_settings`. The original
        documentation states that the default settings yield raw counts.

        Args:
            tf_type (str): Passed to `Vectorizer`.
            idf_type (str): Passed to `Vectorizer`.
            dl_type (str): Passed to `Vectorizer`.
            norm (Union[list, str]): Passed to `Vectorizer`.
            min_df (Union[float, int]): Passed to `Vectorizer`.
            max_df (Union[float, int]): Passed to `Vectorizer`.
            max_n_terms (int): Passed to `Vectorizer`.
            vocabulary_terms (Union[list, str]): Passed to `Vectorizer`.
            new (bool): If True, return the vectorizer instead of assigning it
                to `self.vectorizer`.

        Returns:
            The new vectorizer if `new` is True, otherwise None.
        """
        from textacy.representations.vectorizers import Vectorizer

        vectorizer = Vectorizer(
            tf_type=tf_type,
            idf_type=idf_type,
            dl_type=dl_type,
            norm=norm,
            min_df=min_df,
            max_df=max_df,
            max_n_terms=max_n_terms,
            vocabulary_terms=vocabulary_terms,
        )
        # Record every option handed to the vectorizer so the settings fully
        # describe how the matrix was produced.
        self.vectorizer_settings = {
            "tf_type": tf_type,
            "idf_type": idf_type,
            "dl_type": dl_type,
            "norm": norm,
            "min_df": min_df,
            "max_df": max_df,
            "max_n_terms": max_n_terms,
            "vocabulary_terms": vocabulary_terms,
        }
        if new:
            return vectorizer
        else:
            self.vectorizer = vectorizer

    def _validate_sorting_algorithm(self, alg: Any) -> bool:
        """Ensure that the specified sorting algorithm is a valid natsort locale.

        Args:
            alg: The sorting algorithm to validate.

        Returns:
            bool: True when the sorting algorithm is valid.

        Raises:
            LexosException: If `alg` is not a member of natsort's `ns`.
        """
        if alg not in list(ns):
            locales = ", ".join([f"ns.{e.name}" for e in ns])
            err = (
                f"Invalid sorting algorithm: {alg}.",
                f"Valid algorithms for `alg` are: {locales}.",
                "See https://natsort.readthedocs.io/en/stable/api.html#natsort.ns.",
            )
            raise LexosException(" ".join(err))
        return True

__init__(docs=List[Union[List[str], spacy.tokens.doc.Doc]], labels=None) ¤

Initialise the DTM.

Parameters:

Name Type Description Default
docs List[Union[List[str], spacy.tokens.doc.Doc]]

A list of spaCy docs or a list of token lists.

List[Union[List[str], spacy.tokens.doc.Doc]]
labels List[str]

A list of labels for the documents.

None

Returns:

Type Description
None

None

Source code in lexos\dtm\__init__.py
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
def __init__(
    self,
    docs: List[Union[List[str], spacy.tokens.doc.Doc]],
    labels: List[str] = None,
) -> None:
    """Initialise the DTM.

    Args:
        docs (List[Union[List[str], spacy.tokens.doc.Doc]]): A list of spaCy docs or a list of token lists.
        labels (List[str]): A list of labels for the documents. If omitted or
            empty, the labels "doc0", "doc1", ... are generated.

    Returns:
        None
    """
    # `docs` is annotated, not defaulted: the former `docs=List[...]` made a
    # typing object the default value, which could never be processed.
    self.docs = docs
    self.table = None
    if not labels:
        self.labels = [f"doc{i}" for i in range(len(docs))]
    else:
        self.labels = labels
    self.vectorizer_settings = {}
    self.vectorizer = self.set_vectorizer(new=True)
    self.build()

build() ¤

Build a new DTM matrix based on the current vectorizer.

Source code in lexos\dtm\__init__.py
73
74
75
76
77
78
79
def build(self):
    """Fit the current vectorizer to the documents and store the matrix.

    The documents are first passed through `DtmData`, whose `docs` field is
    what gets handed to the vectorizer's `fit_transform`.
    """
    validated = DtmData(docs=self.docs)
    self.matrix = self.vectorizer.fit_transform(validated.docs)
    # Drop any cached table: get_table must be called again after each build
    # so the table always reflects the current matrix, and so building and
    # table generation can be kept separate if desired.
    self.table = None

get_freq_table(rounding=3, as_percent=False) ¤

Get a table with the relative frequencies of terms in each document.

Parameters:

Name Type Description Default
rounding int

The number of digits to round floats.

3
as_percent bool

Whether to return the frequencies as percentages.

False

Returns:

Type Description
pd.DataFrame

pd.DataFrame: A dataframe with the relative frequencies.

Source code in lexos\dtm\__init__.py
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
def get_freq_table(
    self, rounding: int = 3, as_percent: bool = False
) -> pd.DataFrame:
    """Get a table of relative frequencies.

    Each value is divided by the sum of its row, i.e. by the term's total
    across all documents.

    Args:
        rounding (int): The number of digits to round floats.
        as_percent (bool): Whether to return the frequencies as percentages.

    Returns:
        pd.DataFrame: A dataframe with the relative frequencies.
    """
    df = self.get_table().copy()
    df.set_index("terms", inplace=True)
    freq = df.div(df.sum(axis=1), axis=0)
    if as_percent:
        freq = freq * 100
    # Round the resulting frequencies; previously the non-percent branch
    # rounded the divisor and returned unrounded values.
    return freq.round(rounding).reset_index()

get_stats_table(stats='sum', rounding=3) ¤

Get a table with the sum, mean, and/or median calculated for each row.

Parameters:

Name Type Description Default
stats Union[List[str], str]

One or more of "sum", "mean", and/or "median".

'sum'
rounding int

The number of digits to round floats.

3

Returns:

Type Description
pd.DataFrame

pd.DataFrame: A dataframe with the calculated statistics.

Source code in lexos\dtm\__init__.py
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
def get_stats_table(
    self, stats: Union[List[str], str] = "sum", rounding: int = 3
) -> pd.DataFrame:
    """Return a copy of the table extended with per-row statistics.

    Args:
        stats (Union[List[str], str]): One or more of "sum", "mean", and/or "median".
        rounding (int): The number of digits to round the mean and median to.

    Returns:
        pd.DataFrame: A copy of the table with one extra column per requested
        statistic, added in the order sum, mean, median.
    """
    table = self.get_table()
    result = table.copy()
    for name in ("sum", "mean", "median"):
        if name not in stats:
            continue
        # Statistics are always computed from the unmodified table so that a
        # column added earlier never feeds into a later statistic.
        values = getattr(table, name)(axis=1, numeric_only=True)
        result[name] = values if name == "sum" else values.round(rounding)
    return result

get_table(transpose=False) ¤

Get a Textacy document-term matrix as a pandas dataframe.

Parameters:

Name Type Description Default
transpose bool

If True, terms are columns and docs are rows.

False

Returns:

Type Description
pd.DataFrame

The document-term matrix as a pandas dataframe.

Source code in lexos\dtm\__init__.py
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
def get_table(self, transpose: bool = False) -> pd.DataFrame:
    """Get the document-term matrix as a pandas dataframe.

    The untransposed table (one row per term, a "terms" column followed by
    one column per document label) is cached on `self.table` and reused on
    later calls.

    Args:
        transpose (bool): If True, terms are columns and docs are rows.

    Returns:
        pd.DataFrame: The cached table, or a transposed copy of it when
        `transpose` is True.
    """
    if self.table is None:
        vocabulary = self.vectorizer.vocabulary_terms
        rows = []
        for term in self.vectorizer.terms_list:
            # Column of the matrix holding this term's value in every document.
            column = self.matrix[0:, vocabulary[term]].toarray()
            rows.append([term] + [item[0] for item in column])
        self.table = pd.DataFrame(rows, columns=["terms"] + self.labels)
    if transpose:
        # Never cache the transposed frame: callers that expect a "terms"
        # column would otherwise receive the transposed layout.
        return self.table.rename({"terms": "docs"}, axis=1).T
    return self.table

get_term_counts(sort_by=['terms', 'sum'], ascending=True, alg=SORTING_ALGORITHM) ¤

Get a list of term counts with optional sorting.

Parameters:

Name Type Description Default
sort_by Union[list, List[str]]

The column(s) to sort by in order of preference.

['terms', 'sum']
ascending Union[bool, List[bool]]

Whether to sort values in ascending or descending order.

True

Returns:

Name Type Description
List tuple

A list of tuples containing terms and counts.

Source code in lexos\dtm\__init__.py
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
def get_term_counts(
    self,
    sort_by: Union[list, List[str]] = ["terms", "sum"],
    ascending: Union[bool, List[bool]] = True,
    alg=SORTING_ALGORITHM,
) -> List[tuple]:
    """Get a list of term counts with optional sorting.

    Args:
        sort_by (Union[list, List[str]]): The column(s) to sort by in order of preference.
        ascending (Union[bool, List[bool]]): Whether to sort values in ascending or descending order.
        alg: The sorting algorithm. Anything other than the default is
            validated as a natsort `ns` member.

    Returns:
        List[tuple]: A list of tuples containing terms and counts.

    Raises:
        LexosException: If `alg` is not the default and is not a valid
            natsort algorithm.
    """
    if alg != SORTING_ALGORITHM:
        self._validate_sorting_algorithm(alg)
    sort_key = alg
    if not callable(sort_key):
        # pandas requires `key` to be callable; a bare natsort `ns` flag is
        # not, so turn it into a key function first.
        from natsort import natsort_keygen

        sort_key = natsort_keygen(alg=alg)
    df = self.get_stats_table("sum").sort_values(
        by=sort_by, ascending=ascending, key=sort_key
    )
    terms = df["terms"].values.tolist()
    sums = df["sum"].values.tolist()
    return list(zip(terms, sums))

get_terms() ¤

Get an alphabetical list of terms.

Source code in lexos\dtm\__init__.py
153
154
155
def get_terms(self):
    """Return the vectorizer's vocabulary.

    The value is the vectorizer's `vocabulary_terms` object, returned
    unchanged; no sorting or conversion to a list is performed here.
    """
    vocabulary = self.vectorizer.vocabulary_terms
    return vocabulary

least_frequent(max_n_terms=100, start=0) ¤

Get the least frequent terms in the DTM.

Parameters:

Name Type Description Default
max_n_terms int

The number of terms to return.

100
start int

The start index in the DTM table.

0

Returns:

Type Description
pd.DataFrame

pd.DataFrame: The reduced DTM table.

Note: This function should not be used if `min_df` or `max_df` is set in the vectorizer because the table will be cut twice.

Source code in lexos\dtm\__init__.py
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
def least_frequent(self, max_n_terms: int = 100, start: int = 0) -> pd.DataFrame:
    """Get the least frequent terms in the DTM.

    Args:
        max_n_terms (int): The number of terms to return.
        start (int): The start index in the table sorted by ascending sum.

    Returns:
        pd.DataFrame: The reduced DTM table.

    Note: This function should not be used if `min_df` or `max_df` is set in
    the vectorizer because the table will be cut twice.
    """
    df = self.get_stats_table("sum").sort_values(by="sum", ascending=True)
    # Take rows from the low end of the ascending sort; taking the tail would
    # return the most frequent terms instead.
    return df.iloc[start : start + max_n_terms]

most_frequent(max_n_terms=100, start=0) ¤

Get the most frequent terms in the DTM.

Parameters:

Name Type Description Default
max_n_terms int

The number of terms to return.

100
start int

The start index in the DTM table.

0

Returns:

Type Description
pd.DataFrame

pd.DataFrame: The reduced DTM table.

Note: This function should not be used if `min_df` or `max_df` is set in the vectorizer because the table will be cut twice.

Source code in lexos\dtm\__init__.py
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
def most_frequent(self, max_n_terms: int = 100, start: int = 0) -> pd.DataFrame:
    """Get the most frequent terms in the DTM.

    Args:
        max_n_terms (int): The number of terms to return.
        start (int): The start index in the table sorted by descending sum.

    Returns:
        pd.DataFrame: The reduced DTM table.

    Note: This function should not be used if `min_df` or `max_df` is set in
    the vectorizer because the table will be cut twice.
    """
    df = self.get_stats_table("sum").sort_values(by="sum", ascending=False)
    # The window is `max_n_terms` rows long regardless of where it starts;
    # slicing to `max_n_terms` alone would shrink it by `start` rows.
    return df.iloc[start : start + max_n_terms]

set_vectorizer(tf_type='linear', idf_type=None, dl_type=None, norm=None, min_df=1, max_df=1.0, max_n_terms=None, vocabulary_terms=None, new=False) ¤

Set the vectorizer.

By default, returns a vectorizer that gets raw counts.

Source code in lexos\dtm\__init__.py
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
def set_vectorizer(
    self,
    tf_type: str = "linear",
    idf_type: str = None,
    dl_type: str = None,
    norm: Union[list, str] = None,
    min_df: Union[float, int] = 1,
    max_df: Union[float, int] = 1.0,
    max_n_terms: int = None,
    vocabulary_terms: Union[list, str] = None,
    new: bool = False,
):
    """Create a textacy vectorizer and record its settings.

    All arguments except `new` are passed unchanged to textacy's `Vectorizer`
    and stored in `self.vectorizer_settings`. The original documentation
    states that the default settings yield raw counts.

    Args:
        tf_type (str): Passed to `Vectorizer`.
        idf_type (str): Passed to `Vectorizer`.
        dl_type (str): Passed to `Vectorizer`.
        norm (Union[list, str]): Passed to `Vectorizer`.
        min_df (Union[float, int]): Passed to `Vectorizer`.
        max_df (Union[float, int]): Passed to `Vectorizer`.
        max_n_terms (int): Passed to `Vectorizer`.
        vocabulary_terms (Union[list, str]): Passed to `Vectorizer`.
        new (bool): If True, return the vectorizer instead of assigning it to
            `self.vectorizer`.

    Returns:
        The new vectorizer if `new` is True, otherwise None.
    """
    from textacy.representations.vectorizers import Vectorizer

    vectorizer = Vectorizer(
        tf_type=tf_type,
        idf_type=idf_type,
        dl_type=dl_type,
        norm=norm,
        min_df=min_df,
        max_df=max_df,
        max_n_terms=max_n_terms,
        vocabulary_terms=vocabulary_terms,
    )
    # Record every option handed to the vectorizer (previously `dl_type` and
    # `vocabulary_terms` were left out) so the settings fully describe how
    # the matrix was produced.
    self.vectorizer_settings = {
        "tf_type": tf_type,
        "idf_type": idf_type,
        "dl_type": dl_type,
        "norm": norm,
        "min_df": min_df,
        "max_df": max_df,
        "max_n_terms": max_n_terms,
        "vocabulary_terms": vocabulary_terms,
    }
    if new:
        return vectorizer
    else:
        self.vectorizer = vectorizer

lexos.dtm.DtmData ¤

Bases: BaseModel

DtmData class.

This model validates the input data for the DTM and, if necessary, coerces it to a list of token lists.

Source code in lexos\dtm\__init__.py
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
class DtmData(BaseModel):
    """Validated input data for a DTM.

    The `docs` field accepts spaCy docs or lists of strings; a validator
    normalises every entry to a list of token strings before the field is set.
    """

    docs: List[Union[List[str], spacy.tokens.doc.Doc]]

    class Config:
        """Model configuration."""

        arbitrary_types_allowed = True

    @validator("docs", pre=True, always=True)
    def ensure_token_lists(cls, v):
        """Convert each document to a list of token strings.

        Raises:
            LexosException: If an entry is neither a spaCy doc nor a list, or
                if a list entry contains a non-string item.
        """
        token_lists = []
        for item in v:
            if isinstance(item, spacy.tokens.doc.Doc):
                # A spaCy doc contributes the text of each of its tokens.
                token_lists.append([tok.text for tok in item])
                continue
            if not isinstance(item, list):
                raise LexosException("Could not parse the document list.")
            if not all(isinstance(entry, str) for entry in item):
                raise LexosException("Each list item must be a string.")
            token_lists.append(item)
        return token_lists

Config ¤

Config class.

Source code in lexos\dtm\__init__.py
24
25
26
27
class Config:
    """Config class.

    Holds configuration options for the enclosing model.
    """

    # Presumably required so the model can declare a field typed with
    # `spacy.tokens.doc.Doc`, which is not a type the validation framework
    # knows natively — TODO confirm against the pydantic documentation.
    arbitrary_types_allowed = True

ensure_token_lists(v) ¤

Coerces input to a list of token lists where each token is a string.

Source code in lexos\dtm\__init__.py
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
@validator("docs", pre=True, always=True)
def ensure_token_lists(cls, v):
    """Convert each document to a list of token strings.

    Raises:
        LexosException: If an entry is neither a spaCy doc nor a list, or if
            a list entry contains a non-string item.
    """
    token_lists = []
    for item in v:
        if isinstance(item, spacy.tokens.doc.Doc):
            # A spaCy doc contributes the text of each of its tokens.
            token_lists.append([tok.text for tok in item])
            continue
        if not isinstance(item, list):
            raise LexosException("Could not parse the document list.")
        if not all(isinstance(entry, str) for entry in item):
            raise LexosException("Each list item must be a string.")
        token_lists.append(item)
    return token_lists