Topic Model¤

The topic_model module is used to train and visualize topic models. It currently trains models with MALLET, which must be installed separately, and generates visualizations with dfr-browser.
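
A minimal end-to-end sketch of the workflow (illustrative paths; assumes MALLET is installed, that mallet_path points to the directory containing the mallet binary, and that the spaCy model en_core_web_sm is available):

import spacy
from pathlib import Path
from lexos.topic_model.mallet import Mallet

nlp = spacy.load("en_core_web_sm")  # assumed to be installed
docs = list(nlp.pipe(["First document text.", "Second document text."]))

Path("my_model").mkdir(exist_ok=True)  # the model directory must exist
model = Mallet(model_dir="my_model", mallet_path="mallet-2.0.8/bin")
model.import_data(docs)     # writes data.txt and import.mallet
model.train(num_topics=20)  # writes state.gz, keys.txt, composition.txt, ...
model.scale()               # writes topic_scaled.csv for use by dfr-browser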

lexos.topic_model.mallet.Mallet ¤

A wrapper for the MALLET command line tool.

Source code in lexos\topic_model\mallet\__init__.py
class Mallet:
    """A wrapper for the MALLET command line tool."""

    def __init__(self, model_dir: str, mallet_path: str = "mallet"):
        """Initialize the MALLET object.

        Args:
            model_dir (str): The directory to store the model.
            mallet_path (str): The path to the MALLET binary.
        """
        self.model_dir = model_dir
        self.mallet_path = mallet_path

    def import_data(self,
        docs: List[object],
        allowed: List[str] = None,
        remove_stops: bool = True,
        remove_punct: bool = True,
        use_lemmas: bool = False,
        **kwargs):
        """Import data into MALLET.

        Args:
            docs (List[object]): A list of spaCy documents.
            allowed (List[str]): A list of POS tags that are allowed.
            remove_stops (bool): Whether to remove stop words.
            remove_punct (bool): Whether to remove punctuation.
            use_lemmas (bool): Whether to replace tokens with lemmas.

        Notes:
            Creates a file containing one doc per line with each doc
            consisting of space-separated terms repeated however many
            times they occurred in the source doc. This file is then
            over-written by the MALLET import-file command, potentially
            using any MALLET command flags that are passed in (although
            most of the work is done by the first step in the process).
        """
        msg = Printer()
        if not Path(f"{self.model_dir}/data_skip.txt").is_file():
            msg.text("Bagifying data...")
            # Set the allowable tokens
            if allowed:
                is_allowed_getter = lambda token: token.pos_ in allowed
                Token.set_extension("is_allowed", getter=is_allowed_getter, force=True)
            else:
                Token.set_extension("is_allowed", default=True, force=True)
            bags = []
            # Get the token text for each doc
            for doc in docs:
                if use_lemmas:
                    tokens = [
                        token.lemma_ for token in doc
                        if token._.is_allowed
                        and token.is_stop != remove_stops
                        and token.is_punct != remove_punct
                    ]
                else:
                    tokens = [
                        token.text for token in doc
                        if token._.is_allowed
                        and token.is_stop != remove_stops
                        and token.is_punct != remove_punct
                    ]
                # Get the token counts
                counts = dict(Counter(tokens))
                # Create a bag with copies of each token occurring multiple times
                bag = []
                for k, v in counts.items():
                    repeated = f"{k} " * v
                    bag.append(repeated.strip())
                bags.append(" ".join(bag))
            # Write the data file with a bag for each document
            self.data_file = f"{self.model_dir}/data.txt"
            with open(self.data_file, "w", encoding="utf-8") as f:
                f.write("\n".join(bags))
        else:
            self.data_file = f"{self.model_dir}/data.txt"
        self.mallet_file = f"{self.model_dir}/import.mallet"
        # Build the MALLET import command
        opts = {
            "keep-sequence": True,
            "preserve-case": True,
            "remove-stopwords": False,
            "extra-stopwords": False,
            "token-regex": '"\S+"',
            "stoplist-file": None,
            }
        opts.update(kwargs)
        cmd_opts = []
        for k, v in opts.items():
            if v is not None:
                if v == True:
                    cmd_opts.append(f"--{k}")
                elif isinstance(v, str):
                    cmd_opts.append(f"--{k} {v}")
        mallet_cmd = f"{self.mallet_path}/mallet import-file --input {self.data_file} --output {self.mallet_file} "
        mallet_cmd += " ".join(cmd_opts)
        msg.text(f"Running {mallet_cmd}")
        mallet_cmd = shlex.split(mallet_cmd)
        # Perform the import
        try:
            # shell=True required to handle backslashes in token-regex
            output = check_output(mallet_cmd, stderr=STDOUT, shell=True, universal_newlines=True)
            msg.good("Import complete.")
        except CalledProcessError as e:
            output = e.output#.decode()
            msg.fail(output)

    def train(self,
                mallet_file: str = None,
                num_topics: int = 20,
                num_iterations: int = 1000,
                optimize_interval: int = 10,
                random_seed: int = None,
                **kwargs):
        """Train a model.

        Args:
            num_topics (int): The number of topics to train.
            num_iterations (int): The number of iterations to train.
            optimize_interval (int): The number of iterations between optimization.
            random_seed (int): The random seed to use.
        """
        msg = Printer()
        # Set the options
        try:
            if not mallet_file:
                mallet_file = self.mallet_file
        except AttributeError:
            msg.fail("Please supply an `input` argument with the path to your MALLET import file.")
        opts = {
            "input": mallet_file,
            "num-topics": str(num_topics),
            "num-iterations": str(num_iterations),
            "optimize-interval": str(optimize_interval),
            "random-seed": random_seed,
            "output-state": f"{self.model_dir}/state.gz",
            "output-topic-keys": f"{self.model_dir}/keys.txt",
            "output-doc-topics": f"{self.model_dir}/composition.txt",
            "word-topic-counts-file": f"{self.model_dir}/counts.txt",
            "output-topic-docs": f"{self.model_dir}/topic-docs.txt",
            "diagnostics-file": f"{self.model_dir}/diagnostics.xml"
        }
        opts.update(kwargs)
        cmd_opts = []
        for k, v in opts.items():
            if v is not None:
                if k == "random-seed":
                    v = str(v)
                if v == True:
                    cmd_opts.append(f"--{k}")
                elif isinstance(v, str):
                    cmd_opts.append(f"--{k} {v}")
        cmd_opts = " ".join(cmd_opts)
        mallet_cmd = f"{self.mallet_path}/mallet train-topics {cmd_opts}"
        msg.text(f"Running {mallet_cmd}\n")
        p = Popen(mallet_cmd, stdout=PIPE, stderr=STDOUT, shell=True)
        ll = []
        prog = re.compile(u'\<([^\)]+)\>')
        while p.poll() is None:
            l = p.stdout.readline().decode()
            print(l, end='')
            # Keep track of LL/topic.
            try:
                this_ll = float(re.findall('([-+]\d+\.\d+)', l)[0])
                ll.append(this_ll)
            except IndexError:  # Not every line will match.
                pass
            # Keep track of modeling progress
            try:
                this_iter = float(prog.match(l).groups()[0])
                progress = int(100. * this_iter/num_iterations)
                if progress % 10 == 0:
                    print('Modeling progress: {0}%.\r'.format(progress)),
            except AttributeError:  # Not every line will match.
                pass

    def scale(self, model_state_file: str = None, output_file: str = None):
        """Scale a model.

        Args:
            model_state_file (str): The path to a state_file.
            output_file (str): The path to an output file.
        """
        msg = Printer()
        msg.text("Processing...")
        if not model_state_file:
            model_state_file = f"{self.model_dir}/state.gz"
        if not output_file:
            output_file = f"{self.model_dir}/topic_scaled.csv"
        # try:
        # Convert the mallet output_state file to a pyLDAvis data object
        converted_data = scale_model.convert_mallet_data(model_state_file)
        # Get the topic coordinates in a dataframe
        topic_coordinates = scale_model.get_topic_coordinates(**converted_data)
        # Save the topic coordinates to a CSV file
        topic_coordinates.to_csv(output_file, index=False, header=False)
        msg.good("Done!")

__init__(model_dir, mallet_path='mallet') ¤

Initialize the MALLET object.

Parameters:

model_dir (str, required): The directory to store the model.
mallet_path (str, default 'mallet'): The path to the MALLET binary.

Source code in lexos\topic_model\mallet\__init__.py
def __init__(self, model_dir: str, mallet_path: str = "mallet"):
    """Initialize the MALLET object.

    Args:
        model_dir (str): The directory to store the model.
        mallet_path (str): The path to the MALLET binary.
    """
    self.model_dir = model_dir
    self.mallet_path = mallet_path

import_data(docs, allowed=None, remove_stops=True, remove_punct=True, use_lemmas=False, **kwargs) ¤

Import data into MALLET.

Parameters:

docs (List[object], required): A list of spaCy documents.
allowed (List[str], default None): A list of POS tags that are allowed.
remove_stops (bool, default True): Whether to remove stop words.
remove_punct (bool, default True): Whether to remove punctuation.
use_lemmas (bool, default False): Whether to replace tokens with lemmas.

Notes

Creates a file containing one doc per line, with each doc consisting of space-separated terms repeated however many times they occurred in the source doc. This file is then overwritten by the MALLET import-file command, potentially using any MALLET command flags that are passed in (although most of the work is done by the first step in the process).

Source code in lexos\topic_model\mallet\__init__.py
def import_data(self,
    docs: List[object],
    allowed: List[str] = None,
    remove_stops: bool = True,
    remove_punct: bool = True,
    use_lemmas: bool = False,
    **kwargs):
    """Import data into MALLET.

    Args:
        docs (List[object]): A list of spaCy documents.
        allowed (List[str]): A list of POS tags that are allowed.
        remove_stops (bool): Whether to remove stop words.
        remove_punct (bool): Whether to remove punctuation.
        use_lemmas (bool): Whether to replace tokens with lemmas.

    Notes:
        Creates a file containing one doc per line with each doc
        consisting of space-separated terms repeated however many
        times they occurred in the source doc. This file is then
        over-written by the MALLET import-file command, potentially
        using any MALLET command flags that are passed in (although
        most of the work is done by the first step in the process).
    """
    msg = Printer()
    if not Path(f"{self.model_dir}/data_skip.txt").is_file():
        msg.text("Bagifying data...")
        # Set the allowable tokens
        if allowed:
            is_allowed_getter = lambda token: token.pos_ in allowed
            Token.set_extension("is_allowed", getter=is_allowed_getter, force=True)
        else:
            Token.set_extension("is_allowed", default=True, force=True)
        bags = []
        # Get the token text for each doc
        for doc in docs:
            if use_lemmas:
                tokens = [
                    token.lemma_ for token in doc
                    if token._.is_allowed
                    and token.is_stop != remove_stops
                    and token.is_punct != remove_punct
                ]
            else:
                tokens = [
                    token.text for token in doc
                    if token._.is_allowed
                    and token.is_stop != remove_stops
                    and token.is_punct != remove_punct
                ]
            # Get the token counts
            counts = dict(Counter(tokens))
            # Create a bag with copies of each token occurring multiple times
            bag = []
            for k, v in counts.items():
                repeated = f"{k} " * v
                bag.append(repeated.strip())
            bags.append(" ".join(bag))
        # Write the data file with a bag for each document
        self.data_file = f"{self.model_dir}/data.txt"
        with open(self.data_file, "w", encoding="utf-8") as f:
            f.write("\n".join(bags))
    else:
        self.data_file = f"{self.model_dir}/data.txt"
    self.mallet_file = f"{self.model_dir}/import.mallet"
    # Build the MALLET import command
    opts = {
        "keep-sequence": True,
        "preserve-case": True,
        "remove-stopwords": False,
        "extra-stopwords": False,
        "token-regex": '"\S+"',
        "stoplist-file": None,
        }
    opts.update(kwargs)
    cmd_opts = []
    for k, v in opts.items():
        if v is not None:
            if v == True:
                cmd_opts.append(f"--{k}")
            elif isinstance(v, str):
                cmd_opts.append(f"--{k} {v}")
    mallet_cmd = f"{self.mallet_path}/mallet import-file --input {self.data_file} --output {self.mallet_file} "
    mallet_cmd += " ".join(cmd_opts)
    msg.text(f"Running {mallet_cmd}")
    mallet_cmd = shlex.split(mallet_cmd)
    # Perform the import
    try:
        # shell=True required to handle backslashes in token-regex
        output = check_output(mallet_cmd, stderr=STDOUT, shell=True, universal_newlines=True)
        msg.good("Import complete.")
    except CalledProcessError as e:
        output = e.output#.decode()
        msg.fail(output)
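
Continuing the sketch above, a hedged example that restricts the bag of words to content-bearing parts of speech and forwards a hyphenated MALLET option through **kwargs by unpacking a dict (the POS list and stoplist path are illustrative):

model.import_data(
    docs,                              # a list of spaCy Doc objects
    allowed=["NOUN", "VERB", "ADJ"],   # keep only these POS tags
    use_lemmas=True,                   # write lemmas rather than surface forms
    **{"stoplist-file": "stops.txt"},  # forwarded to mallet import-file
)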

scale(model_state_file=None, output_file=None) ¤

Scale a model.

Parameters:

model_state_file (str, default None): The path to a MALLET state file; defaults to state.gz in the model directory.
output_file (str, default None): The path to an output file; defaults to topic_scaled.csv in the model directory.

Source code in lexos\topic_model\mallet\__init__.py
def scale(self, model_state_file: str = None, output_file: str = None):
    """Scale a model.

    Args:
        model_state_file (str): The path to a state_file.
        output_file (str): The path to an output file.
    """
    msg = Printer()
    msg.text("Processing...")
    if not model_state_file:
        model_state_file = f"{self.model_dir}/state.gz"
    if not output_file:
        output_file = f"{self.model_dir}/topic_scaled.csv"
    # try:
    # Convert the mallet output_state file to a pyLDAvis data object
    converted_data = scale_model.convert_mallet_data(model_state_file)
    # Get the topic coordinates in a dataframe
    topic_coordinates = scale_model.get_topic_coordinates(**converted_data)
    # Save the topic coordinates to a CSV file
    topic_coordinates.to_csv(output_file, index=False, header=False)
    msg.good("Done!")

train(mallet_file=None, num_topics=20, num_iterations=1000, optimize_interval=10, random_seed=None, **kwargs) ¤

Train a model.

Parameters:

mallet_file (str, default None): The path to a MALLET import file; defaults to the file created by import_data().
num_topics (int, default 20): The number of topics to train.
num_iterations (int, default 1000): The number of iterations to train.
optimize_interval (int, default 10): The number of iterations between optimization.
random_seed (int, default None): The random seed to use.

Source code in lexos\topic_model\mallet\__init__.py
def train(self,
            mallet_file: str = None,
            num_topics: int = 20,
            num_iterations: int = 1000,
            optimize_interval: int = 10,
            random_seed: int = None,
            **kwargs):
    """Train a model.

    Args:
        num_topics (int): The number of topics to train.
        num_iterations (int): The number of iterations to train.
        optimize_interval (int): The number of iterations between optimization.
        random_seed (int): The random seed to use.
    """
    msg = Printer()
    # Set the options
    try:
        if not mallet_file:
            mallet_file = self.mallet_file
    except AttributeError:
        msg.fail("Please supply an `input` argument with the path to your MALLET import file.")
    opts = {
        "input": mallet_file,
        "num-topics": str(num_topics),
        "num-iterations": str(num_iterations),
        "optimize-interval": str(optimize_interval),
        "random-seed": random_seed,
        "output-state": f"{self.model_dir}/state.gz",
        "output-topic-keys": f"{self.model_dir}/keys.txt",
        "output-doc-topics": f"{self.model_dir}/composition.txt",
        "word-topic-counts-file": f"{self.model_dir}/counts.txt",
        "output-topic-docs": f"{self.model_dir}/topic-docs.txt",
        "diagnostics-file": f"{self.model_dir}/diagnostics.xml"
    }
    opts.update(kwargs)
    cmd_opts = []
    for k, v in opts.items():
        if v is not None:
            if k == "random-seed":
                v = str(v)
            if v == True:
                cmd_opts.append(f"--{k}")
            elif isinstance(v, str):
                cmd_opts.append(f"--{k} {v}")
    cmd_opts = " ".join(cmd_opts)
    mallet_cmd = f"{self.mallet_path}/mallet train-topics {cmd_opts}"
    msg.text(f"Running {mallet_cmd}\n")
    p = Popen(mallet_cmd, stdout=PIPE, stderr=STDOUT, shell=True)
    ll = []
    prog = re.compile(u'\<([^\)]+)\>')
    while p.poll() is None:
        l = p.stdout.readline().decode()
        print(l, end='')
        # Keep track of LL/topic.
        try:
            this_ll = float(re.findall('([-+]\d+\.\d+)', l)[0])
            ll.append(this_ll)
        except IndexError:  # Not every line will match.
            pass
        # Keep track of modeling progress
        try:
            this_iter = float(prog.match(l).groups()[0])
            progress = int(100. * this_iter/num_iterations)
            if progress % 10 == 0:
                print('Modeling progress: {0}%.\r'.format(progress)),
        except AttributeError:  # Not every line will match.
            pass
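
Continuing the sketch, a hedged example that fixes the random seed for reproducibility (values are illustrative; further MALLET train-topics options can be forwarded through **kwargs as with import_data):

model.train(
    num_topics=25,
    num_iterations=500,
    optimize_interval=20,
    random_seed=42,
)
# keys.txt, composition.txt, state.gz, etc. are written to model_dir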

lexos.topic_model.mallet.scale_model.__num_dist_rows__(array, ndigits=2) ¤

Return the number of rows in a matrix that sum to 1 (within a rounding tolerance).

Source code in lexos\topic_model\mallet\scale_model.py
def __num_dist_rows__(array, ndigits: int = 2):
    """Check that all rows in a matrix sum to 1."""
    return array.shape[0] - int((pd.DataFrame(array).sum(axis=1) < 0.999).sum())
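
An illustrative check (values chosen by hand): with the default tolerance, two of the three rows below count as summing to 1.

import numpy as np
from lexos.topic_model.mallet import scale_model

arr = np.array([[0.5, 0.5], [0.9, 0.1], [0.4, 0.4]])
print(scale_model.__num_dist_rows__(arr))  # 2 (the last row sums to 0.8)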

lexos.topic_model.mallet.scale_model.ValidationError ¤

Bases: ValueError

Handle validation errors.

Source code in lexos\topic_model\mallet\scale_model.py
class ValidationError(ValueError):
    """Handle validation errors."""

    pass

lexos.topic_model.mallet.scale_model._input_check(topic_term_dists, doc_topic_dists, doc_lengths, vocab, term_frequency) ¤

Check input for scale_model.

Parameters:

topic_term_dists (pd.DataFrame, required): Matrix of topic-term probabilities.
doc_topic_dists (pd.DataFrame, required): Matrix of document-topic probabilities.
doc_lengths (list, required): List of document lengths.
vocab (list, required): List of vocabulary.
term_frequency (list, required): The count of each term over the entire corpus.

Returns:

list: List of errors (None if no errors are found).

Source code in lexos\topic_model\mallet\scale_model.py
def _input_check(
    topic_term_dists: pd.DataFrame,
    doc_topic_dists: pd.DataFrame,
    doc_lengths: list,
    vocab: list,
    term_frequency: list,
) -> list:
    """Check input for scale_model.

    Args:
        topic_term_dists (pd.DataFrame): Matrix of topic-term probabilities.
        doc_topic_dists (pd.DataFrame): Matrix of document-topic probabilities.
        doc_lengths (list): List of document lengths.
        vocab (list): List of vocabulary.
        term_frequency (list): The count of each term over the entire corpus.

    Returns:
        list: List of errors.
    """
    ttds = topic_term_dists.shape
    dtds = doc_topic_dists.shape
    errors = []

    def err(msg):
        """Append error message."""
        errors.append(msg)

    if dtds[1] != ttds[0]:
        err(
            "Number of rows of topic_term_dists does not match number of columns of doc_topic_dists; both should be equal to the number of topics in the model."
        )

    if len(doc_lengths) != dtds[0]:
        err(
            "Length of doc_lengths not equal to the number of rows in doc_topic_dists; both should be equal to the number of documents in the data."
        )

    W = len(vocab)
    if ttds[1] != W:
        err(
            "Number of terms in vocabulary does not match the number of columns of topic_term_dists (where each row of topic_term_dists is a probability distribution of terms for a given topic)."
        )
    if len(term_frequency) != W:
        err(
            "Length of term_frequency not equal to the number of terms in the vocabulary (len of vocab)."
        )

    if __num_dist_rows__(topic_term_dists) != ttds[0]:
        err("Not all rows (distributions) in topic_term_dists sum to 1.")

    if __num_dist_rows__(doc_topic_dists) != dtds[0]:
        err("Not all rows (distributions) in doc_topic_dists sum to 1.")

    if len(errors) > 0:
        return errors

lexos.topic_model.mallet.scale_model._input_validate(*args) ¤

Check input for scale_model.

Source code in lexos\topic_model\mallet\scale_model.py
def _input_validate(*args) -> None:
    """Check input for scale_model."""
    res = _input_check(*args)
    if res:
        raise ValidationError("\n" + "\n".join([" * " + s for s in res]))

lexos.topic_model.mallet.scale_model._jensen_shannon(_P, _Q) ¤

Calculate Jensen-Shannon Divergence.

Parameters:

_P (np.array, required): Probability distribution.
_Q (np.array, required): Probability distribution.

Returns:

float: Jensen-Shannon Divergence.

Source code in lexos\topic_model\mallet\scale_model.py
def _jensen_shannon(_P: np.array, _Q: np.array) -> float:
    """Calculate Jensen-Shannon Divergence.

    Args:
        _P (np.array): Probability distribution.
        _Q (np.array): Probability distribution.

    Returns:
        float: Jensen-Shannon Divergence.
    """
    _M = 0.5 * (_P + _Q)
    return 0.5 * (entropy(_P, _M) + entropy(_Q, _M))
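
The same computation written out with scipy for two hand-picked distributions, as a sanity check:

import numpy as np
from scipy.stats import entropy

P = np.array([0.9, 0.1])
Q = np.array([0.1, 0.9])
M = 0.5 * (P + Q)
print(0.5 * (entropy(P, M) + entropy(Q, M)))  # ~0.368 nats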

lexos.topic_model.mallet.scale_model._pcoa(pair_dists, n_components=2) ¤

Perform Principal Coordinate Analysis.

AKA Classical Multidimensional Scaling. Code referenced from skbio.stats.ordination.pcoa.

Parameters:

pair_dists (np.array, required): Pairwise distances.
n_components (int, default 2): Number of dimensions to reduce to.

Returns:

np.array: PCoA matrix.

Source code in lexos\topic_model\mallet\scale_model.py
def _pcoa(pair_dists: np.array, n_components: int = 2) -> np.array:
    """Perform Principal Coordinate Analysis.

    AKA Classical Multidimensional Scaling
    Code referenced from [skbio.stats.ordination.pcoa](https://github.com/biocore/scikit-bio/blob/0.5.0/skbio/stats/ordination/_principal_coordinate_analysis.py)

    Args:
        pair_dists (np.array): Pairwise distances.
        n_components (int): Number of dimensions to reduce to.

    Returns:
        np.array: PCoA matrix.
    """
    # pairwise distance matrix is assumed symmetric
    pair_dists = np.asarray(pair_dists, np.float64)

    # perform SVD on double centred distance matrix
    n = pair_dists.shape[0]
    H = np.eye(n) - np.ones((n, n)) / n
    B = -H.dot(pair_dists ** 2).dot(H) / 2
    eigvals, eigvecs = np.linalg.eig(B)

    # Take first n_components of eigenvalues and eigenvectors
    # sorted in decreasing order
    ix = eigvals.argsort()[::-1][:n_components]
    eigvals = eigvals[ix]
    eigvecs = eigvecs[:, ix]

    # replace any remaining negative eigenvalues and associated eigenvectors with zeroes
    # at least 1 eigenvalue must be zero
    eigvals[np.isclose(eigvals, 0)] = 0
    if np.any(eigvals < 0):
        ix_neg = eigvals < 0
        eigvals[ix_neg] = np.zeros(eigvals[ix_neg].shape)
        eigvecs[:, ix_neg] = np.zeros(eigvecs[:, ix_neg].shape)

    return np.sqrt(eigvals) * eigvecs
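
A small sketch embedding three points from an illustrative symmetric distance matrix:

import numpy as np
from lexos.topic_model.mallet import scale_model

D = np.array([
    [0.0, 1.0, 2.0],
    [1.0, 0.0, 1.0],
    [2.0, 1.0, 0.0],
])
print(scale_model._pcoa(D, n_components=2).shape)  # (3, 2)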

lexos.topic_model.mallet.scale_model.js_PCoA(distributions) ¤

Perform dimension reduction.

Works via Jensen-Shannon Divergence & Principal Coordinate Analysis (aka Classical Multidimensional Scaling)

Parameters:

distributions (array-like, shape (n_dists, k), required): Matrix of distribution probabilities.

Returns:

pcoa (np.array): Array of shape (n_dists, 2).

Source code in lexos\topic_model\mallet\scale_model.py
def js_PCoA(distributions: np.array) -> np.array:
    """Perform dimension reduction.

    Works via Jensen-Shannon Divergence & Principal Coordinate Analysis
    (aka Classical Multidimensional Scaling)

    Args:
        distributions: (array-like, shape (`n_dists`, `k`)): Matrix of distributions probabilities.

    Returns:
        pcoa (np.array): (array, shape (`n_dists`, 2))

    """
    dist_matrix = squareform(pdist(distributions, metric=_jensen_shannon))
    return _pcoa(dist_matrix)
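
A hedged sketch reducing four illustrative distributions over five terms to two dimensions:

import numpy as np
from lexos.topic_model.mallet import scale_model

dists = np.array([
    [0.4, 0.3, 0.1, 0.1, 0.1],
    [0.1, 0.1, 0.4, 0.3, 0.1],
    [0.2, 0.2, 0.2, 0.2, 0.2],
    [0.5, 0.2, 0.1, 0.1, 0.1],
])
print(scale_model.js_PCoA(dists).shape)  # (4, 2)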

lexos.topic_model.mallet.scale_model.js_MMDS(distributions, **kwargs) ¤

Perform dimension reduction.

Works via Jensen-Shannon Divergence & Metric Multidimensional Scaling

Parameters:

distributions (np.array, required): Matrix of distribution probabilities (array-like, shape (n_dists, k)).
**kwargs (dict): Keyword arguments to be passed to sklearn.manifold.MDS().

Returns:

mmds (np.array): Array of shape (n_dists, 2).

Source code in lexos\topic_model\mallet\scale_model.py
def js_MMDS(distributions: np.array, **kwargs) -> np.array:
    """Perform dimension reduction.

    Works via Jensen-Shannon Divergence & Metric Multidimensional Scaling

    Args:
        distributions (np.array): Matrix of distributions probabilities (array-like, shape (`n_dists`, `k`)).
        **kwargs (dict): Keyword argument to be passed to `sklearn.manifold.MDS()`

    Returns:
        mmds (np.array): (array, shape (`n_dists`, 2))

    """
    dist_matrix = squareform(pdist(distributions, metric=_jensen_shannon))
    model = MDS(n_components=2, random_state=0, dissimilarity="precomputed", **kwargs)
    return model.fit_transform(dist_matrix)

lexos.topic_model.mallet.scale_model.js_TSNE(distributions, **kwargs) ¤

Perform dimension reduction.

Works via Jensen-Shannon Divergence & t-distributed Stochastic Neighbor Embedding

Parameters:

distributions (np.array, required): Matrix of distribution probabilities (array-like, shape (n_dists, k)).
**kwargs (dict): Keyword arguments to be passed to sklearn.manifold.TSNE().

Returns:

tsne (np.array): Array of shape (n_dists, 2).

Source code in lexos\topic_model\mallet\scale_model.py
def js_TSNE(distributions, **kwargs) -> np.array:
    """Perform dimension reduction.

    Works via Jensen-Shannon Divergence & t-distributed Stochastic Neighbor Embedding

    Args:
        distributions (np.array): Matrix of distributions probabilities  (array-like, shape (`n_dists`, `k`)).
        **kwargs (dict): Keyword arguments to be passed to `sklearn.manifold.TSNE()`

    Returns:
        tsne (np.array): (array, shape (`n_dists`, 2))
    """
    dist_matrix = squareform(pdist(distributions, metric=_jensen_shannon))
    model = TSNE(n_components=2, random_state=0, metric="precomputed", **kwargs)
    return model.fit_transform(dist_matrix)

lexos.topic_model.mallet.scale_model._df_with_names(data, index_name, columns_name) ¤

Get a dataframe with names.

Parameters:

data (pd.DataFrame, required): Dataframe.
index_name (str, required): Name of index.
columns_name (str, required): Name of columns.

Returns:

pd.DataFrame: Dataframe with names.

Source code in lexos\topic_model\mallet\scale_model.py
def _df_with_names(data, index_name: str, columns_name: str) -> pd.DataFrame:
    """Get a dataframe with names.

    Args:
        data (pd.DataFrame): Dataframe.
        index_name (str): Name of index.
        columns_name (str): Name of columns.

    Returns:
        pd.DataFrame: Dataframe with names.
    """
    if isinstance(data, pd.DataFrame):
        # we want our index to be numbered
        df = pd.DataFrame(data.values)
    else:
        df = pd.DataFrame(data)
    df.index.name = index_name
    df.columns.name = columns_name
    return df

lexos.topic_model.mallet.scale_model._series_with_name(data, name) ¤

Get a series with name.

Parameters:

data (pd.Series, required): Series.
name (str, required): Name of series.

Returns:

pd.Series: Series with name.

Source code in lexos\topic_model\mallet\scale_model.py
def _series_with_name(data, name) -> pd.Series:
    """Get a series with name.

    Args:
        data (pd.Series): Series.
        name (str): Name of series.

    Returns:
        pd.Series: Series with name.
    """
    if isinstance(data, pd.Series):
        data.name = name
        # ensures a numeric index
        return data.reset_index()[name]
    else:
        return pd.Series(data, name=name)

lexos.topic_model.mallet.scale_model._topic_coordinates(mds, topic_term_dists, topic_proportion) ¤

Get coordinates for topics.

Parameters:

mds (Callable, required): Dimension-reduction function applied to topic_term_dists; must return an array of shape (n_topics, 2).
topic_term_dists (array, shape (n_topics, n_terms), required): Topic-term distributions.
topic_proportion (array, shape (n_topics,), required): Topic proportions.

Returns:

pd.DataFrame: Topic coordinates.

Source code in lexos\topic_model\mallet\scale_model.py
def _topic_coordinates(
    mds: np.array, topic_term_dists: np.array, topic_proportion: np.array
) -> pd.DataFrame:
    """Get coordinates for topics.

    Args:
        mds (Callable): Dimension-reduction function; must return an array of shape (`n_topics`, 2).
        topic_term_dists (array, shape (`n_topics`, `n_terms`)): Topic-term distributions.
        topic_proportion (array, shape (`n_topics`)): Topic proportions.

    Returns:
        pd.DataFrame: Topic coordinates.
    """
    K = topic_term_dists.shape[0]
    mds_res = mds(topic_term_dists)
    assert mds_res.shape == (K, 2)
    mds_df = pd.DataFrame(
        {
            "x": mds_res[:, 0],
            "y": mds_res[:, 1],
            "topics": range(1, K + 1),
            "cluster": 1,
            "Freq": topic_proportion * 100,
        }
    )
    # note: cluster (should?) be deprecated soon. See: https://github.com/cpsievert/LDAvis/issues/26
    return mds_df

lexos.topic_model.mallet.scale_model.get_topic_coordinates(topic_term_dists, doc_topic_dists, doc_lengths, vocab, term_frequency, mds=js_PCoA, sort_topics=True) ¤

Transform the topic model distributions and related corpus.

Creates the data structures needed for topic bubbles.

Parameters:

topic_term_dists (array-like, shape (n_topics, n_terms), required): Matrix of topic-term probabilities where n_terms is len(vocab).
doc_topic_dists (array-like, shape (n_docs, n_topics), required): Matrix of document-topic probabilities.
doc_lengths (array-like, shape n_docs, required): The length of each document, i.e. the number of words in each document. The order of the numbers should be consistent with the ordering of the docs in doc_topic_dists.
vocab (array-like, shape n_terms, required): List of all the words in the corpus used to train the model.
term_frequency (array-like, shape n_terms, required): The count of each particular term over the entire corpus. The ordering of these counts should correspond with vocab and topic_term_dists.
mds (Callable, default js_PCoA): A function that takes topic_term_dists as an input and outputs an n_topics by 2 distance matrix. The output approximates the distance between topics. See js_PCoA() for details on the default function. The strings 'pcoa', 'mmds', and 'tsne' (or upper-case variants) are also accepted; the latter two require the sklearn package.
sort_topics (bool, default True): Whether to sort topics by topic proportion (percentage of tokens covered). Set to False to keep the original topic order.

Returns:

scaled_coordinates (pd.DataFrame): A pandas dataframe containing scaled x and y coordinates.

Source code in lexos\topic_model\mallet\scale_model.py
def get_topic_coordinates(
    topic_term_dists: np.array,
    doc_topic_dists: np.array,
    doc_lengths: list,
    vocab: list,
    term_frequency: list,
    mds: Callable = js_PCoA,
    sort_topics: bool = True,
) -> pd.DataFrame:
    """Transform the topic model distributions and related corpus.

    Creates the data structures needed for topic bubbles.

    Args:
        topic_term_dists (array-like, shape (`n_topics`, `n_terms`)): Matrix of topic-term probabilities where
            `n_terms` is `len(vocab)`.
        doc_topic_dists (array-like, shape (`n_docs`, `n_topics`)): Matrix of document-topic probabilities.
        doc_lengths : (array-like, shape `n_docs`): The length of each document, i.e. the number of words
            in each document. The order of the numbers should be consistent with the ordering of the docs in
            `doc_topic_dists`.
        vocab (array-like, shape `n_terms`): List of all the words in the corpus used to train the model.
        term_frequency (array-like, shape `n_terms`): The count of each particular term over the entire corpus.
            The ordering of these counts should correspond with `vocab` and `topic_term_dists`.
        mds (Callable): A function that takes `topic_term_dists` as an input and outputs a `n_topics` by `2`
            distance matrix. The output approximates the distance between topics. See `js_PCoA()` for details
            on the default function. A string representation currently accepts `pcoa` (or upper case variant),
            `mmds` (or upper case variant) and `tsne` (or upper case variant), if `sklearn` package is installed
            for the latter two.
        sort_topics (bool): Whether to sort topics by topic proportion (percentage of tokens covered). Set to
            `False` to keep the original topic order.

    Returns:
        scaled_coordinates (pd.DataFrame): A pandas dataframe containing scaled x and y coordinates.
    """
    # parse mds
    # if isinstance(mds, basestring):
    if isinstance(mds, (str, bytes)):
        mds = mds.lower()
        if mds == "pcoa":
            mds = js_PCoA
        elif mds in ("mmds", "tsne"):
            if sklearn_present:
                mds_opts = {"mmds": js_MMDS, "tsne": js_TSNE}
                mds = mds_opts[mds]
            else:
                logging.warning("sklearn not present, switch to PCoA")
                mds = js_PCoA
        else:
            logging.warning("Unknown mds `%s`, switch to PCoA" % mds)
            mds = js_PCoA

    topic_term_dists = _df_with_names(topic_term_dists, "topic", "term")
    doc_topic_dists = _df_with_names(doc_topic_dists, "doc", "topic")
    term_frequency = _series_with_name(term_frequency, "term_frequency")
    doc_lengths = _series_with_name(doc_lengths, "doc_length")
    vocab = _series_with_name(vocab, "vocab")
    _input_validate(
        topic_term_dists, doc_topic_dists, doc_lengths, vocab, term_frequency
    )

    topic_freq = (doc_topic_dists.T * doc_lengths).T.sum()
    if sort_topics:
        topic_proportion = (topic_freq / topic_freq.sum()).sort_values(ascending=False)
    else:
        topic_proportion = topic_freq / topic_freq.sum()

    topic_order = topic_proportion.index
    topic_term_dists = topic_term_dists.iloc[topic_order]

    scaled_coordinates = _topic_coordinates(mds, topic_term_dists, topic_proportion)

    return scaled_coordinates
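
In practice the inputs usually come from convert_mallet_data(), which is what Mallet.scale() does internally (the state-file path is illustrative):

from lexos.topic_model.mallet import scale_model

data = scale_model.convert_mallet_data("my_model/state.gz")
coords = scale_model.get_topic_coordinates(**data)  # js_PCoA by default
coords.to_csv("my_model/topic_scaled.csv", index=False, header=False)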

lexos.topic_model.mallet.scale_model.extract_params(statefile) ¤

Extract the alpha and beta values from the statefile.

Parameters:

statefile (str, required): Path to statefile produced by MALLET.

Returns:

tuple: A tuple of (alpha (list), beta)

Source code in lexos\topic_model\mallet\scale_model.py
def extract_params(statefile: str) -> tuple:
    """Extract the alpha and beta values from the statefile.

    Args:
        statefile (str): Path to statefile produced by MALLET.

    Returns:
        tuple: A tuple of (alpha (list), beta)
    """
    with gzip.open(statefile, "r") as state:
        params = [x.decode("utf8").strip() for x in state.readlines()[1:3]]
    return (list(params[0].split(":")[1].split(" ")), float(params[1].split(":")[1]))
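
A short sketch (illustrative path). Note that alpha is the raw split of the header line, so its first element is an empty string; convert_mallet_data() below skips it with params[0][1:]:

from lexos.topic_model.mallet import scale_model

alpha, beta = scale_model.extract_params("my_model/state.gz")
print(alpha[1:4], beta)  # a few alpha values (as strings) and beta (a float)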

lexos.topic_model.mallet.scale_model.state_to_df(statefile) ¤

Transform state file into pandas dataframe.

The MALLET statefile is space-separated, and the first two rows contain the alpha and beta hyperparameters.

Parameters:

statefile (str, required): Path to statefile produced by MALLET.

Returns:

pd.DataFrame: The topic assignment for each token in each document of the model.

Source code in lexos\topic_model\mallet\scale_model.py
def state_to_df(statefile: str) -> pd.DataFrame:
    """Transform state file into pandas dataframe.

    The MALLET statefile is space-separated, and the first two rows contain the alpha and beta hyperparameters.

    Args:
        statefile (str): Path to statefile produced by MALLET.

    Returns:
        pd.DataFrame: The topic assignment for each token in each document of the model.
    """
    return pd.read_csv(statefile, compression="gzip", sep=" ", skiprows=[1, 2])
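
A sketch of the result for a standard MALLET state file (path illustrative; the column names come from the state file's own header line):

from lexos.topic_model.mallet import scale_model

df = scale_model.state_to_df("my_model/state.gz")
print(df.columns.tolist())  # ['#doc', 'source', 'pos', 'typeindex', 'type', 'topic']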

lexos.topic_model.mallet.scale_model.pivot_and_smooth(df, smooth_value, rows_variable, cols_variable, values_variable) ¤

Turn the pandas dataframe into a data matrix.

Parameters:

df (pd.DataFrame, required): The aggregated dataframe.
smooth_value (float, required): Value to add to the matrix to account for the priors.
rows_variable (str, required): The name of the dataframe column to use as the rows in the matrix.
cols_variable (str, required): The name of the dataframe column to use as the columns in the matrix.
values_variable (str, required): The name of the dataframe column to use as the values in the matrix.

Returns:

pd.DataFrame: A pandas matrix that has been normalized on the rows.

Source code in lexos\topic_model\mallet\scale_model.py
def pivot_and_smooth(
    df: pd.DataFrame,
    smooth_value: float,
    rows_variable: str,
    cols_variable: str,
    values_variable: str,
) -> pd.DataFrame:
    """Turn the pandas dataframe into a data matrix.

    Args:
        df (pd.DataFrame): The aggregated dataframe.
        smooth_value (float): Value to add to the matrix to account for the priors.
        rows_variable (str): The name of the dataframe column to use as the rows in the matrix.
        cols_variable (str): The name of the dataframe column to use as the columns in the matrix.
        values_variable (str): The name of the dataframe column to use as the values in the matrix.

    Returns:
        pd.DataFrame: A pandas matrix that has been normalized on the rows.
    """
    matrix = df.pivot(
        index=rows_variable, columns=cols_variable, values=values_variable
    ).fillna(value=0)
    matrix = matrix.values + smooth_value

    normed = sklearn.preprocessing.normalize(matrix, norm="l1", axis=1)

    return pd.DataFrame(normed)
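
A tiny illustration with hand-made counts, two topics over three terms; after smoothing and L1 normalization each row sums to 1:

import pandas as pd
from lexos.topic_model.mallet import scale_model

df = pd.DataFrame({
    "topic": [0, 0, 1, 1],
    "type": ["cat", "dog", "dog", "fish"],
    "token_count": [3, 1, 2, 2],
})
phi = scale_model.pivot_and_smooth(df, 0.01, "topic", "type", "token_count")
print(phi.sum(axis=1))  # each row sums to 1.0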

lexos.topic_model.mallet.scale_model.convert_mallet_data(state_file) ¤

Convert Mallet data to a structure compatible with pyLDAvis.

Parameters:

state_file (str, required): Mallet state file.

Returns:

data (dict): A dict containing pandas dataframes for the pyLDAvis prepare method.

Source code in lexos\topic_model\mallet\scale_model.py
def convert_mallet_data(state_file: str) -> dict:
    """Convert Mallet data to a structure compatible with pyLDAvis.

    Args:
        state_file (string): Mallet state file

    Returns:
        data (dict): A dict containing pandas dataframes for the pyLDAvis prepare method.
    """
    params = extract_params(state_file)
    alpha = [float(x) for x in params[0][1:]]
    beta = params[1]
    df = state_to_df(state_file)
    # Ensure that NaN is a string
    df["type"] = df.type.astype(str)
    # Get document lengths from statefile
    docs = df.groupby("#doc")["type"].count().reset_index(name="doc_length")
    # Get vocab and term frequencies from statefile
    vocab = df["type"].value_counts().reset_index()
    vocab.columns = ["type", "term_freq"]
    vocab = vocab.sort_values(by="type", ascending=True)
    phi_df = (
        df.groupby(["topic", "type"])["type"].count().reset_index(name="token_count")
    )
    phi_df = phi_df.sort_values(by="type", ascending=True)
    phi = pivot_and_smooth(phi_df, beta, "topic", "type", "token_count")
    theta_df = (
        df.groupby(["#doc", "topic"])["topic"].count().reset_index(name="topic_count")
    )
    theta = pivot_and_smooth(theta_df, alpha, "#doc", "topic", "topic_count")
    data = {
        "topic_term_dists": phi,
        "doc_topic_dists": theta,
        "doc_lengths": list(docs["doc_length"]),
        "vocab": list(vocab["type"]),
        "term_frequency": list(vocab["term_freq"]),
    }
    return data
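
A quick look at the returned structure (path illustrative):

from lexos.topic_model.mallet import scale_model

data = scale_model.convert_mallet_data("my_model/state.gz")
print(sorted(data))
# ['doc_lengths', 'doc_topic_dists', 'term_frequency', 'topic_term_dists', 'vocab']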

lexos.topic_model.dfr_browser.DfrBrowser ¤

DfrBrowser class.

Source code in lexos\topic_model\dfr_browser\__init__.py
class DfrBrowser:
    """DfrBrowser class."""

    def __init__(
        self,
        model_dir: str = ".",
        model_state_file: str = "state.gz",
        model_scaled_file: str = "topic_scaled.csv",
        template_dir: str = TEMPLATE_DIR,
    ) -> None:
        """Initialize DfrBrowser object."""
        self.template_dir = template_dir
        self.model_dir = model_dir
        self.model_state_file = f"{model_dir}/{model_state_file}"
        self.model_scaled_file = f"{model_dir}/{model_scaled_file}"
        self.browser_dir = f"{model_dir}/dfr_browser"
        self.data_dir = f"{self.browser_dir}/data"
        self.num_topics = None  # How to get this?

        # Make a browser directory and copy the template into it
        if not Path(self.browser_dir).exists():
            self._copy_template()

        # Create dfr-browser files using python script
        self._prepare_data()

        # Copy scaled file into data dir
        shutil.copy(self.model_scaled_file, self.data_dir)

        # Move meta.csv to data_dir, zip up, and rename, delete meta.csv copy
        self._move_metadata()

        # Update assets
        self._update_assets()

    def _copy_template(self):
        """Copy the template directory to the browser directory."""
        try:
            shutil.copytree(Path(self.template_dir), Path(self.browser_dir))
        except FileNotFoundError as e:
            raise LexosException(f"Could not find dfr-browser template: {e}")

    def _prepare_data(self):
        """Prepare the data for the dfr-browser visualization."""
        Path(f"{self.data_dir}").mkdir(parents=True, exist_ok=True)
        prepare_data_script = f"python {self.browser_dir}/bin/prepare-data"
        cmd = " ".join(
            [
                prepare_data_script,
                "convert-state",
                self.model_state_file,
                "--tw",
                f"{self.data_dir}/tw.json",
                "--dt",
                f"{self.data_dir}/dt.json.zip",
            ]
        )
        cmd = shlex.split(cmd)
        try:
            output = check_output(
                cmd, stderr=STDOUT, shell=True, universal_newlines=True
            )
            print(output)
        except CalledProcessError as e:
            raise LexosException(e.output)
        cmd = " ".join(
            [prepare_data_script, "info-stub", "-o", f"{self.data_dir}/info.json"]
        )
        cmd = shlex.split(cmd)
        try:
            output = check_output(
                cmd, stderr=STDOUT, shell=True, universal_newlines=True
            )
            print(output)
        except CalledProcessError as e:
            raise LexosException(e.output)

    def _move_metadata(self):
        """Move meta.csv to data_dir, zip up, rename, and delete meta.csv copy."""
        meta_zip = f"{self.data_dir}/meta.csv.zip"
        if Path(meta_zip).exists():
            Path(meta_zip).unlink()
        browser_meta_file = f"{self.model_dir}/meta.csv"
        shutil.copy(browser_meta_file, self.data_dir)
        try:
            shutil.make_archive(
                f"{self.data_dir}/meta.csv", "zip", self.data_dir, "meta.csv"
            )
        except OSError as err:
            raise LexosException(f"Error writing meta.csv.zip: {err}")

    def _update_assets(self):
        """Update browser assets."""
        # Tweak default index.html to link to JSON, not JSTOR
        with open(f"{self.browser_dir}/index.html", "r") as f:
            filedata = f.read().replace("on JSTOR", "JSON")
        with open(f"{self.browser_dir}/index.html", "w") as f:
            f.write(filedata)
        # Tweak js file to link to the domain
        with open(
            f"{self.browser_dir}/js/dfb.min.js.custom", "r", encoding="utf-8"
        ) as f:
            filedata = f.read()
        pat = r"t\.select\(\"#doc_remark a\.url\"\).attr\(\"href\", .+?\);"
        new_pat = r'var doc_url = document.URL.split("modules")[0] + "project_data"; t.select("#doc_remark a.url")'
        new_pat += r'.attr("href", doc_url + "/" + e.url);'
        filedata = re.sub(pat, new_pat, filedata)
        with open(f"{self.browser_dir}/js/dfb.min.js", "w", encoding="utf-8") as f:
            f.write(filedata)

    def run(self, port: int = 8080) -> None:
        """Run the dfr-browser.

        This might work on the Jupyter port, but it might not.
        """
        # run_server = f"python {self.browser_dir}/bin/server"
        import os
        import sys
        import threading
        import time
        import webbrowser as w
        from http.server import HTTPServer, SimpleHTTPRequestHandler

        # set up the HTTP server and start it in a separate daemon thread
        httpd = HTTPServer(("localhost", port), SimpleHTTPRequestHandler)
        thread = threading.Thread(target=httpd.serve_forever)
        thread.daemon = True

        # if startup time is too long we might want to be able to quit the program
        current_dir = os.getcwd()
        try:
            os.chdir(self.browser_dir)
            thread.start()
        except KeyboardInterrupt:
            httpd.shutdown()
            os.chdir(current_dir)
            sys.exit(0)

        # wait until the webserver finished starting up (maybe wait longer or shorter...)
        time.sleep(3)

        # start sending requests
        w.open(f"http://127.0.0.1:{port}/")

__init__(model_dir='.', model_state_file='state.gz', model_scaled_file='topic_scaled.csv', template_dir=TEMPLATE_DIR) ¤

Initialize DfrBrowser object.

Source code in lexos\topic_model\dfr_browser\__init__.py
def __init__(
    self,
    model_dir: str = ".",
    model_state_file: str = "state.gz",
    model_scaled_file: str = "topic_scaled.csv",
    template_dir: str = TEMPLATE_DIR,
) -> None:
    """Initialize DfrBrowser object."""
    self.template_dir = template_dir
    self.model_dir = model_dir
    self.model_state_file = f"{model_dir}/{model_state_file}"
    self.model_scaled_file = f"{model_dir}/{model_scaled_file}"
    self.browser_dir = f"{model_dir}/dfr_browser"
    self.data_dir = f"{self.browser_dir}/data"
    self.num_topics = None  # How to get this?

    # Make a browser directory and copy the template into it
    if not Path(self.browser_dir).exists():
        self._copy_template()

    # Create dfr-browser files using python script
    self._prepare_data()

    # Copy scaled file into data dir
    shutil.copy(self.model_scaled_file, self.data_dir)

    # Move meta.csv to data_dir, zip up, and rename, delete meta.csv copy
    self._move_metadata()

    # Update assets
    self._update_assets()

run(port=8080) ¤

Run the dfr-browser.

This might work on the Jupyter port, but it might not.

Source code in lexos\topic_model\dfr_browser\__init__.py
def run(self, port: int = 8080) -> None:
    """Run the dfr-browser.

    This might work on the Jupyter port, but it might not.
    """
    # run_server = f"python {self.browser_dir}/bin/server"
    import os
    import sys
    import threading
    import time
    import webbrowser as w
    from http.server import HTTPServer, SimpleHTTPRequestHandler

    # set up the HTTP server and start it in a separate daemon thread
    httpd = HTTPServer(("localhost", port), SimpleHTTPRequestHandler)
    thread = threading.Thread(target=httpd.serve_forever)
    thread.daemon = True

    # if startup time is too long we might want to be able to quit the program
    current_dir = os.getcwd()
    try:
        os.chdir(self.browser_dir)
        thread.start()
    except KeyboardInterrupt:
        httpd.shutdown()
        os.chdir(current_dir)
        sys.exit(0)

    # wait until the webserver finished starting up (maybe wait longer or shorter...)
    time.sleep(3)

    # start sending requests
    w.open(f"http://127.0.0.1:{port}/")
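
A closing sketch (assumes the model directory already contains state.gz, topic_scaled.csv, and a meta.csv of document metadata from earlier steps):

from lexos.topic_model.dfr_browser import DfrBrowser

browser = DfrBrowser(model_dir="my_model")
browser.run(port=8080)  # serves the browser and opens http://127.0.0.1:8080/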