Topic Model¤

The topic_model module is used to train and visualize topic models. It currently trains models with MALLET, which must be installed separately, and generates visualizations with dfr-browser.
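
A minimal end-to-end sketch of the workflow (illustrative paths; assumes MALLET is installed, that mallet_path points to the directory containing the mallet binary, and that the spaCy model en_core_web_sm is available):

import spacy
from pathlib import Path
from lexos.topic_model.mallet import Mallet

nlp = spacy.load("en_core_web_sm")  # assumed to be installed
docs = list(nlp.pipe(["First document text.", "Second document text."]))

Path("my_model").mkdir(exist_ok=True)  # the model directory must exist
model = Mallet(model_dir="my_model", mallet_path="mallet-2.0.8/bin")
model.import_data(docs)     # writes data.txt and import.mallet
model.train(num_topics=20)  # writes state.gz, keys.txt, composition.txt, ...
model.scale()               # writes topic_scaled.csv for use by dfr-browser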

lexos.topic_model.mallet.Mallet ¤

A wrapper for the MALLET command line tool.

Source code in lexos\topic_model\mallet\__init__.py
class Mallet:
    """A wrapper for the MALLET command line tool."""

    def __init__(self, model_dir: str, mallet_path: str = "mallet"):
        """Initialize the MALLET object.

        Args:
            model_dir (str): The directory to store the model.
            mallet_path (str): The path to the MALLET binary.
        """
        self.model_dir = model_dir
        self.mallet_path = mallet_path

    def import_data(self,
        docs: List[object],
        allowed: List[str] = None,
        remove_stops: bool = True,
        remove_punct: bool = True,
        use_lemmas: bool = False,
        **kwargs):
        """Import data into MALLET.

        Args:
            docs (List[object]): A list of spaCy documents.
            allowed (List[str]): A list of POS tags that are allowed.
            remove_stops (bool): Whether to remove stop words.
            remove_punct (bool): Whether to remove punctuation.
            use_lemmas (bool): Whether to replace tokens with lemmas.

        Notes:
            Creates a file containing one doc per line with each doc
            consisting of space-separated terms repeated however many
            times they occurred in the source doc. This file is then
            over-written by the MALLET import-file command, potentially
            using any MALLET command flags that are passed in (although
            most of the work is done by the first step in the process).
        """
        msg = Printer()
        if not Path(f"{self.model_dir}/data_skip.txt").is_file():
            msg.text("Bagifying data...")
            # Set the allowable tokens
            if allowed:
                is_allowed_getter = lambda token: token.pos_ in allowed
                Token.set_extension("is_allowed", getter=is_allowed_getter, force=True)
            else:
                Token.set_extension("is_allowed", default=True, force=True)
            bags = []
            # Get the token text for each doc
            for doc in docs:
                if use_lemmas:
                    tokens = [
                        token.lemma_ for token in doc
                        if token._.is_allowed
                        and token.is_stop != remove_stops
                        and token.is_punct != remove_punct
                    ]
                else:
                    tokens = [
                        token.text for token in doc
                        if token._.is_allowed
                        and token.is_stop != remove_stops
                        and token.is_punct != remove_punct
                    ]
                # Get the token counts
                counts = dict(Counter(tokens))
                # Create a bag with copies of each token occurring multiple times
                bag = []
                for k, v in counts.items():
                    repeated = f"{k} " * v
                    bag.append(repeated.strip())
                bags.append(" ".join(bag))
            # Write the data file with a bag for each document
            self.data_file = f"{self.model_dir}/data.txt"
            with open(self.data_file, "w", encoding="utf-8") as f:
                f.write("\n".join(bags))
        else:
            self.data_file = f"{self.model_dir}/data.txt"
        self.mallet_file = f"{self.model_dir}/import.mallet"
        # Build the MALLET import command
        opts = {
            "keep-sequence": True,
            "preserve-case": True,
            "remove-stopwords": False,
            "extra-stopwords": False,
            "token-regex": '"\S+"',
            "stoplist-file": None,
            }
        opts.update(kwargs)
        cmd_opts = []
        for k, v in opts.items():
            if v is not None:
                if v == True:
                    cmd_opts.append(f"--{k}")
                elif isinstance(v, str):
                    cmd_opts.append(f"--{k} {v}")
        mallet_cmd = f"{self.mallet_path}/mallet import-file --input {self.data_file} --output {self.mallet_file} "
        mallet_cmd += " ".join(cmd_opts)
        msg.text(f"Running {mallet_cmd}")
        mallet_cmd = shlex.split(mallet_cmd)
        # Perform the import
        try:
            # shell=True required to handle backslashes in token-regex
            output = check_output(mallet_cmd, stderr=STDOUT, shell=True, universal_newlines=True)
            msg.good("Import complete.")
        except CalledProcessError as e:
            output = e.output#.decode()
            msg.fail(output)

    def train(self,
                mallet_file: str = None,
                num_topics: int = 20,
                num_iterations: int = 1000,
                optimize_interval: int = 10,
                random_seed: int = None,
                **kwargs):
        """Train a model.

        Args:
            num_topics (int): The number of topics to train.
            num_iterations (int): The number of iterations to train.
            optimize_interval (int): The number of iterations between optimization.
            random_seed (int): The random seed to use.
        """
        msg = Printer()
        # Set the options
        try:
            if not mallet_file:
                mallet_file = self.mallet_file
        except AttributeError:
            msg.fail("Please supply an `input` argument with the path to your MALLET import file.")
        opts = {
            "input": mallet_file,
            "num-topics": str(num_topics),
            "num-iterations": str(num_iterations),
            "optimize-interval": str(optimize_interval),
            "random-seed": random_seed,
            "output-state": f"{self.model_dir}/state.gz",
            "output-topic-keys": f"{self.model_dir}/keys.txt",
            "output-doc-topics": f"{self.model_dir}/composition.txt",
            "word-topic-counts-file": f"{self.model_dir}/counts.txt",
            "output-topic-docs": f"{self.model_dir}/topic-docs.txt",
            "diagnostics-file": f"{self.model_dir}/diagnostics.xml"
        }
        opts.update(kwargs)
        cmd_opts = []
        for k, v in opts.items():
            if v is not None:
                if k == "random-seed":
                    v = str(v)
                if v == True:
                    cmd_opts.append(f"--{k}")
                elif isinstance(v, str):
                    cmd_opts.append(f"--{k} {v}")
        cmd_opts = " ".join(cmd_opts)
        mallet_cmd = f"{self.mallet_path}/mallet train-topics {cmd_opts}"
        msg.text(f"Running {mallet_cmd}\n")
        p = Popen(mallet_cmd, stdout=PIPE, stderr=STDOUT, shell=True)
        ll = []
        prog = re.compile(u'\<([^\)]+)\>')
        while p.poll() is None:
            l = p.stdout.readline().decode()
            print(l, end='')
            # Keep track of LL/topic.
            try:
                this_ll = float(re.findall('([-+]\d+\.\d+)', l)[0])
                ll.append(this_ll)
            except IndexError:  # Not every line will match.
                pass
            # Keep track of modeling progress
            try:
                this_iter = float(prog.match(l).groups()[0])
                progress = int(100. * this_iter/num_iterations)
                if progress % 10 == 0:
                    print('Modeling progress: {0}%.\r'.format(progress)),
            except AttributeError:  # Not every line will match.
                pass

    def scale(self, model_state_file: str = None, output_file: str = None):
        """Scale a model.

        Args:
            model_state_file (str): The path to a state_file.
            output_file (str): The path to an output file.
        """
        msg = Printer()
        msg.text("Processing...")
        if not model_state_file:
            model_state_file = f"{self.model_dir}/state.gz"
        if not output_file:
            output_file = f"{self.model_dir}/topic_scaled.csv"
        # try:
        # Convert the mallet output_state file to a pyLDAvis data object
        converted_data = scale_model.convert_mallet_data(model_state_file)
        # Get the topic coordinates in a dataframe
        topic_coordinates = scale_model.get_topic_coordinates(**converted_data)
        # Save the topic coordinates to a CSV file
        topic_coordinates.to_csv(output_file, index=False, header=False)
        msg.good("Done!")

__init__(model_dir, mallet_path='mallet') ¤

Initialize the MALLET object.

Parameters:

model_dir (str, required): The directory to store the model.
mallet_path (str, default 'mallet'): The path to the MALLET binary.

Source code in lexos\topic_model\mallet\__init__.py
def __init__(self, model_dir: str, mallet_path: str = "mallet"):
    """Initialize the MALLET object.

    Args:
        model_dir (str): The directory to store the model.
        mallet_path (str): The path to the MALLET binary.
    """
    self.model_dir = model_dir
    self.mallet_path = mallet_path

import_data(docs, allowed=None, remove_stops=True, remove_punct=True, use_lemmas=False, **kwargs) ¤

Import data into MALLET.

Parameters:

docs (List[object], required): A list of spaCy documents.
allowed (List[str], default None): A list of POS tags that are allowed.
remove_stops (bool, default True): Whether to remove stop words.
remove_punct (bool, default True): Whether to remove punctuation.
use_lemmas (bool, default False): Whether to replace tokens with lemmas.

Notes

Creates a file containing one doc per line, with each doc consisting of space-separated terms repeated however many times they occurred in the source doc. This file is then overwritten by the MALLET import-file command, potentially using any MALLET command flags that are passed in (although most of the work is done by the first step in the process).

Source code in lexos\topic_model\mallet\__init__.py
def import_data(self,
    docs: List[object],
    allowed: List[str] = None,
    remove_stops: bool = True,
    remove_punct: bool = True,
    use_lemmas: bool = False,
    **kwargs):
    """Import data into MALLET.

    Args:
        docs (List[object]): A list of spaCy documents.
        allowed (List[str]): A list of POS tags that are allowed.
        remove_stops (bool): Whether to remove stop words.
        remove_punct (bool): Whether to remove punctuation.
        use_lemmas (bool): Whether to replace tokens with lemmas.

    Notes:
        Creates a file containing one doc per line with each doc
        consisting of space-separated terms repeated however many
        times they occurred in the source doc. This file is then
        over-written by the MALLET import-file command, potentially
        using any MALLET command flags that are passed in (although
        most of the work is done by the first step in the process).
    """
    msg = Printer()
    if not Path(f"{self.model_dir}/data_skip.txt").is_file():
        msg.text("Bagifying data...")
        # Set the allowable tokens
        if allowed:
            is_allowed_getter = lambda token: token.pos_ in allowed
            Token.set_extension("is_allowed", getter=is_allowed_getter, force=True)
        else:
            Token.set_extension("is_allowed", default=True, force=True)
        bags = []
        # Get the token text for each doc
        for doc in docs:
            if use_lemmas:
                tokens = [
                    token.lemma_ for token in doc
                    if token._.is_allowed
                    and token.is_stop != remove_stops
                    and token.is_punct != remove_punct
                ]
            else:
                tokens = [
                    token.text for token in doc
                    if token._.is_allowed
                    and token.is_stop != remove_stops
                    and token.is_punct != remove_punct
                ]
            # Get the token counts
            counts = dict(Counter(tokens))
            # Create a bag with copies of each token occurring multiple times
            bag = []
            for k, v in counts.items():
                repeated = f"{k} " * v
                bag.append(repeated.strip())
            bags.append(" ".join(bag))
        # Write the data file with a bag for each document
        self.data_file = f"{self.model_dir}/data.txt"
        with open(self.data_file, "w", encoding="utf-8") as f:
            f.write("\n".join(bags))
    else:
        self.data_file = f"{self.model_dir}/data.txt"
    self.mallet_file = f"{self.model_dir}/import.mallet"
    # Build the MALLET import command
    opts = {
        "keep-sequence": True,
        "preserve-case": True,
        "remove-stopwords": False,
        "extra-stopwords": False,
        "token-regex": '"\S+"',
        "stoplist-file": None,
        }
    opts.update(kwargs)
    cmd_opts = []
    for k, v in opts.items():
        if v is not None:
            if v == True:
                cmd_opts.append(f"--{k}")
            elif isinstance(v, str):
                cmd_opts.append(f"--{k} {v}")
    mallet_cmd = f"{self.mallet_path}/mallet import-file --input {self.data_file} --output {self.mallet_file} "
    mallet_cmd += " ".join(cmd_opts)
    msg.text(f"Running {mallet_cmd}")
    mallet_cmd = shlex.split(mallet_cmd)
    # Perform the import
    try:
        # shell=True required to handle backslashes in token-regex
        output = check_output(mallet_cmd, stderr=STDOUT, shell=True, universal_newlines=True)
        msg.good("Import complete.")
    except CalledProcessError as e:
        output = e.output#.decode()
        msg.fail(output)
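
Continuing the sketch above, a hedged example that restricts the bag of words to content-bearing parts of speech and forwards a hyphenated MALLET option through **kwargs by unpacking a dict (the POS list and stoplist path are illustrative):

model.import_data(
    docs,                              # a list of spaCy Doc objects
    allowed=["NOUN", "VERB", "ADJ"],   # keep only these POS tags
    use_lemmas=True,                   # write lemmas rather than surface forms
    **{"stoplist-file": "stops.txt"},  # forwarded to mallet import-file
)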

scale(model_state_file=None, output_file=None) ¤

Scale a model.

Parameters:

model_state_file (str, default None): The path to a MALLET state file; defaults to state.gz in the model directory.
output_file (str, default None): The path to an output file; defaults to topic_scaled.csv in the model directory.

Source code in lexos\topic_model\mallet\__init__.py
def scale(self, model_state_file: str = None, output_file: str = None):
    """Scale a model.

    Args:
        model_state_file (str): The path to a state_file.
        output_file (str): The path to an output file.
    """
    msg = Printer()
    msg.text("Processing...")
    if not model_state_file:
        model_state_file = f"{self.model_dir}/state.gz"
    if not output_file:
        output_file = f"{self.model_dir}/topic_scaled.csv"
    # try:
    # Convert the mallet output_state file to a pyLDAvis data object
    converted_data = scale_model.convert_mallet_data(model_state_file)
    # Get the topic coordinates in a dataframe
    topic_coordinates = scale_model.get_topic_coordinates(**converted_data)
    # Save the topic coordinates to a CSV file
    topic_coordinates.to_csv(output_file, index=False, header=False)
    msg.good("Done!")

train(mallet_file=None, num_topics=20, num_iterations=1000, optimize_interval=10, random_seed=None, **kwargs) ¤

Train a model.

Parameters:

mallet_file (str, default None): The path to a MALLET import file; defaults to the file created by import_data().
num_topics (int, default 20): The number of topics to train.
num_iterations (int, default 1000): The number of iterations to train.
optimize_interval (int, default 10): The number of iterations between optimization.
random_seed (int, default None): The random seed to use.

Source code in lexos\topic_model\mallet\__init__.py
def train(self,
            mallet_file: str = None,
            num_topics: int = 20,
            num_iterations: int = 1000,
            optimize_interval: int = 10,
            random_seed: int = None,
            **kwargs):
    """Train a model.

    Args:
        num_topics (int): The number of topics to train.
        num_iterations (int): The number of iterations to train.
        optimize_interval (int): The number of iterations between optimization.
        random_seed (int): The random seed to use.
    """
    msg = Printer()
    # Set the options
    try:
        if not mallet_file:
            mallet_file = self.mallet_file
    except AttributeError:
        msg.fail("Please supply an `input` argument with the path to your MALLET import file.")
    opts = {
        "input": mallet_file,
        "num-topics": str(num_topics),
        "num-iterations": str(num_iterations),
        "optimize-interval": str(optimize_interval),
        "random-seed": random_seed,
        "output-state": f"{self.model_dir}/state.gz",
        "output-topic-keys": f"{self.model_dir}/keys.txt",
        "output-doc-topics": f"{self.model_dir}/composition.txt",
        "word-topic-counts-file": f"{self.model_dir}/counts.txt",
        "output-topic-docs": f"{self.model_dir}/topic-docs.txt",
        "diagnostics-file": f"{self.model_dir}/diagnostics.xml"
    }
    opts.update(kwargs)
    cmd_opts = []
    for k, v in opts.items():
        if v is not None:
            if k == "random-seed":
                v = str(v)
            if v == True:
                cmd_opts.append(f"--{k}")
            elif isinstance(v, str):
                cmd_opts.append(f"--{k} {v}")
    cmd_opts = " ".join(cmd_opts)
    mallet_cmd = f"{self.mallet_path}/mallet train-topics {cmd_opts}"
    msg.text(f"Running {mallet_cmd}\n")
    p = Popen(mallet_cmd, stdout=PIPE, stderr=STDOUT, shell=True)
    ll = []
    prog = re.compile(u'\<([^\)]+)\>')
    while p.poll() is None:
        l = p.stdout.readline().decode()
        print(l, end='')
        # Keep track of LL/topic.
        try:
            this_ll = float(re.findall('([-+]\d+\.\d+)', l)[0])
            ll.append(this_ll)
        except IndexError:  # Not every line will match.
            pass
        # Keep track of modeling progress
        try:
            this_iter = float(prog.match(l).groups()[0])
            progress = int(100. * this_iter/num_iterations)
            if progress % 10 == 0:
                print('Modeling progress: {0}%.\r'.format(progress)),
        except AttributeError:  # Not every line will match.
            pass
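
Continuing the sketch, a hedged example that fixes the random seed for reproducibility (values are illustrative; further MALLET train-topics options can be forwarded through **kwargs as with import_data):

model.train(
    num_topics=25,
    num_iterations=500,
    optimize_interval=20,
    random_seed=42,
)
# keys.txt, composition.txt, state.gz, etc. are written to model_dir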

lexos.topic_model.mallet.scale_model.__num_dist_rows__(array, ndigits=2) ¤

Return the number of rows in a matrix that sum to 1 (within a rounding tolerance).

Source code in lexos\topic_model\mallet\scale_model.py
def __num_dist_rows__(array, ndigits: int = 2):
    """Check that all rows in a matrix sum to 1."""
    return array.shape[0] - int((pd.DataFrame(array).sum(axis=1) < 0.999).sum())
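
An illustrative check (values chosen by hand): with the default tolerance, two of the three rows below count as summing to 1.

import numpy as np
from lexos.topic_model.mallet import scale_model

arr = np.array([[0.5, 0.5], [0.9, 0.1], [0.4, 0.4]])
print(scale_model.__num_dist_rows__(arr))  # 2 (the last row sums to 0.8)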

lexos.topic_model.mallet.scale_model.ValidationError ¤

Bases: ValueError

Handle validation errors.

Source code in lexos\topic_model\mallet\scale_model.py
class ValidationError(ValueError):
    """Handle validation errors."""

    pass

lexos.topic_model.mallet.scale_model._input_check(topic_term_dists, doc_topic_dists, doc_lengths, vocab, term_frequency) ¤

Check input for scale_model.

Parameters:

topic_term_dists (pd.DataFrame, required): Matrix of topic-term probabilities.
doc_topic_dists (pd.DataFrame, required): Matrix of document-topic probabilities.
doc_lengths (list, required): List of document lengths.
vocab (list, required): List of vocabulary.
term_frequency (list, required): The count of each term over the entire corpus.

Returns:

list: List of errors (None if no errors are found).

Source code in lexos\topic_model\mallet\scale_model.py
def _input_check(
    topic_term_dists: pd.DataFrame,
    doc_topic_dists: pd.DataFrame,
    doc_lengths: list,
    vocab: list,
    term_frequency: list,
) -> list:
    """Check input for scale_model.

    Args:
        topic_term_dists (pd.DataFrame): Matrix of topic-term probabilities.
        doc_topic_dists (pd.DataFrame): Matrix of document-topic probabilities.
        doc_lengths (list): List of document lengths.
        vocab (list): List of vocabulary.
        term_frequency (list): The count of each term over the entire corpus.

    Returns:
        list: List of errors.
    """
    ttds = topic_term_dists.shape
    dtds = doc_topic_dists.shape
    errors = []

    def err(msg):
        """Append error message."""
        errors.append(msg)

    if dtds[1] != ttds[0]:
        err(
            "Number of rows of topic_term_dists does not match number of columns of doc_topic_dists; both should be equal to the number of topics in the model."
        )

    if len(doc_lengths) != dtds[0]:
        err(
            "Length of doc_lengths not equal to the number of rows in doc_topic_dists; both should be equal to the number of documents in the data."
        )

    W = len(vocab)
    if ttds[1] != W:
        err(
            "Number of terms in vocabulary does not match the number of columns of topic_term_dists (where each row of topic_term_dists is a probability distribution of terms for a given topic)."
        )
    if len(term_frequency) != W:
        err(
            "Length of term_frequency not equal to the number of terms in the vocabulary (len of vocab)."
        )

    if __num_dist_rows__(topic_term_dists) != ttds[0]:
        err("Not all rows (distributions) in topic_term_dists sum to 1.")

    if __num_dist_rows__(doc_topic_dists) != dtds[0]:
        err("Not all rows (distributions) in doc_topic_dists sum to 1.")

    if len(errors) > 0:
        return errors

lexos.topic_model.mallet.scale_model._input_validate(*args) ¤

Check input for scale_model.

Source code in lexos\topic_model\mallet\scale_model.py
def _input_validate(*args) -> None:
    """Check input for scale_model."""
    res = _input_check(*args)
    if res:
        raise ValidationError("\n" + "\n".join([" * " + s for s in res]))

lexos.topic_model.mallet.scale_model._jensen_shannon(_P, _Q) ¤

Calculate Jensen-Shannon Divergence.

Parameters:

_P (np.array, required): Probability distribution.
_Q (np.array, required): Probability distribution.

Returns:

float: Jensen-Shannon Divergence.

Source code in lexos\topic_model\mallet\scale_model.py
def _jensen_shannon(_P: np.array, _Q: np.array) -> float:
    """Calculate Jensen-Shannon Divergence.

    Args:
        _P (np.array): Probability distribution.
        _Q (np.array): Probability distribution.

    Returns:
        float: Jensen-Shannon Divergence.
    """
    _M = 0.5 * (_P + _Q)
    return 0.5 * (entropy(_P, _M) + entropy(_Q, _M))
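
The same computation written out with scipy for two hand-picked distributions, as a sanity check:

import numpy as np
from scipy.stats import entropy

P = np.array([0.9, 0.1])
Q = np.array([0.1, 0.9])
M = 0.5 * (P + Q)
print(0.5 * (entropy(P, M) + entropy(Q, M)))  # ~0.368 nats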

lexos.topic_model.mallet.scale_model._pcoa(pair_dists, n_components=2) ¤

Perform Principal Coordinate Analysis.

AKA Classical Multidimensional Scaling. Code referenced from skbio.stats.ordination.pcoa.

Parameters:

pair_dists (np.array, required): Pairwise distances.
n_components (int, default 2): Number of dimensions to reduce to.

Returns:

np.array: PCoA matrix.

Source code in lexos\topic_model\mallet\scale_model.py
def _pcoa(pair_dists: np.array, n_components: int = 2) -> np.array:
    """Perform Principal Coordinate Analysis.

    AKA Classical Multidimensional Scaling
    Code referenced from [skbio.stats.ordination.pcoa](https://github.com/biocore/scikit-bio/blob/0.5.0/skbio/stats/ordination/_principal_coordinate_analysis.py)

    Args:
        pair_dists (np.array): Pairwise distances.
        n_components (int): Number of dimensions to reduce to.

    Returns:
        np.array: PCoA matrix.
    """
    # pairwise distance matrix is assumed symmetric
    pair_dists = np.asarray(pair_dists, np.float64)

    # perform SVD on double centred distance matrix
    n = pair_dists.shape[0]
    H = np.eye(n) - np.ones((n, n)) / n
    B = -H.dot(pair_dists ** 2).dot(H) / 2
    eigvals, eigvecs = np.linalg.eig(B)

    # Take first n_components of eigenvalues and eigenvectors
    # sorted in decreasing order
    ix = eigvals.argsort()[::-1][:n_components]
    eigvals = eigvals[ix]
    eigvecs = eigvecs[:, ix]

    # replace any remaining negative eigenvalues and associated eigenvectors with zeroes
    # at least 1 eigenvalue must be zero
    eigvals[np.isclose(eigvals, 0)] = 0
    if np.any(eigvals < 0):
        ix_neg = eigvals < 0
        eigvals[ix_neg] = np.zeros(eigvals[ix_neg].shape)
        eigvecs[:, ix_neg] = np.zeros(eigvecs[:, ix_neg].shape)

    return np.sqrt(eigvals) * eigvecs
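
A small sketch embedding three points from an illustrative symmetric distance matrix:

import numpy as np
from lexos.topic_model.mallet import scale_model

D = np.array([
    [0.0, 1.0, 2.0],
    [1.0, 0.0, 1.0],
    [2.0, 1.0, 0.0],
])
print(scale_model._pcoa(D, n_components=2).shape)  # (3, 2)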

lexos.topic_model.mallet.scale_model.js_PCoA(distributions) ¤

Perform dimension reduction.

Works via Jensen-Shannon Divergence & Principal Coordinate Analysis (aka Classical Multidimensional Scaling)

Parameters:

distributions (array-like, shape (n_dists, k), required): Matrix of distribution probabilities.

Returns:

pcoa (np.array): Array of shape (n_dists, 2).

Source code in lexos\topic_model\mallet\scale_model.py
def js_PCoA(distributions: np.array) -> np.array:
    """Perform dimension reduction.

    Works via Jensen-Shannon Divergence & Principal Coordinate Analysis
    (aka Classical Multidimensional Scaling)

    Args:
        distributions: (array-like, shape (`n_dists`, `k`)): Matrix of distributions probabilities.

    Returns:
        pcoa (np.array): (array, shape (`n_dists`, 2))

    """
    dist_matrix = squareform(pdist(distributions, metric=_jensen_shannon))
    return _pcoa(dist_matrix)
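
A hedged sketch reducing four illustrative distributions over five terms to two dimensions:

import numpy as np
from lexos.topic_model.mallet import scale_model

dists = np.array([
    [0.4, 0.3, 0.1, 0.1, 0.1],
    [0.1, 0.1, 0.4, 0.3, 0.1],
    [0.2, 0.2, 0.2, 0.2, 0.2],
    [0.5, 0.2, 0.1, 0.1, 0.1],
])
print(scale_model.js_PCoA(dists).shape)  # (4, 2)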

lexos.topic_model.mallet.scale_model.js_MMDS(distributions, **kwargs) ¤

Perform dimension reduction.

Works via Jensen-Shannon Divergence & Metric Multidimensional Scaling

Parameters:

distributions (np.array, required): Matrix of distribution probabilities (array-like, shape (n_dists, k)).
**kwargs (dict): Keyword arguments to be passed to sklearn.manifold.MDS().

Returns:

mmds (np.array): Array of shape (n_dists, 2).

Source code in lexos\topic_model\mallet\scale_model.py
def js_MMDS(distributions: np.array, **kwargs) -> np.array:
    """Perform dimension reduction.

    Works via Jensen-Shannon Divergence & Metric Multidimensional Scaling

    Args:
        distributions (np.array): Matrix of distributions probabilities (array-like, shape (`n_dists`, `k`)).
        **kwargs (dict): Keyword argument to be passed to `sklearn.manifold.MDS()`

    Returns:
        mmds (np.array): (array, shape (`n_dists`, 2))

    """
    dist_matrix = squareform(pdist(distributions, metric=_jensen_shannon))
    model = MDS(n_components=2, random_state=0, dissimilarity="precomputed", **kwargs)
    return model.fit_transform(dist_matrix)

lexos.topic_model.mallet.scale_model.js_TSNE(distributions, **kwargs) ¤

Perform dimension reduction.

Works via Jensen-Shannon Divergence & t-distributed Stochastic Neighbor Embedding

Parameters:

distributions (np.array, required): Matrix of distribution probabilities (array-like, shape (n_dists, k)).
**kwargs (dict): Keyword arguments to be passed to sklearn.manifold.TSNE().

Returns:

tsne (np.array): Array of shape (n_dists, 2).

Source code in lexos\topic_model\mallet\scale_model.py
def js_TSNE(distributions, **kwargs) -> np.array:
    """Perform dimension reduction.

    Works via Jensen-Shannon Divergence & t-distributed Stochastic Neighbor Embedding

    Args:
        distributions (np.array): Matrix of distributions probabilities  (array-like, shape (`n_dists`, `k`)).
        **kwargs (dict): Keyword arguments to be passed to `sklearn.manifold.TSNE()`

    Returns:
        tsne (np.array): (array, shape (`n_dists`, 2))
    """
    dist_matrix = squareform(pdist(distributions, metric=_jensen_shannon))
    model = TSNE(n_components=2, random_state=0, metric="precomputed", **kwargs)
    return model.fit_transform(dist_matrix)

lexos.topic_model.mallet.scale_model._df_with_names(data, index_name, columns_name) ¤

Get a dataframe with names.

Parameters:

data (pd.DataFrame, required): Dataframe.
index_name (str, required): Name of index.
columns_name (str, required): Name of columns.

Returns:

pd.DataFrame: Dataframe with names.

Source code in lexos\topic_model\mallet\scale_model.py
def _df_with_names(data, index_name: str, columns_name: str) -> pd.DataFrame:
    """Get a dataframe with names.

    Args:
        data (pd.DataFrame): Dataframe.
        index_name (str): Name of index.
        columns_name (str): Name of columns.

    Returns:
        pd.DataFrame: Dataframe with names.
    """
    if isinstance(data, pd.DataFrame):
        # we want our index to be numbered
        df = pd.DataFrame(data.values)
    else:
        df = pd.DataFrame(data)
    df.index.name = index_name
    df.columns.name = columns_name
    return df

lexos.topic_model.mallet.scale_model._series_with_name(data, name) ¤

Get a series with name.

Parameters:

data (pd.Series, required): Series.
name (str, required): Name of series.

Returns:

pd.Series: Series with name.

Source code in lexos\topic_model\mallet\scale_model.py
def _series_with_name(data, name) -> pd.Series:
    """Get a series with name.

    Args:
        data (pd.Series): Series.
        name (str): Name of series.

    Returns:
        pd.Series: Series with name.
    """
    if isinstance(data, pd.Series):
        data.name = name
        # ensures a numeric index
        return data.reset_index()[name]
    else:
        return pd.Series(data, name=name)

lexos.topic_model.mallet.scale_model._topic_coordinates(mds, topic_term_dists, topic_proportion) ¤

Get coordinates for topics.

Parameters:

mds (Callable, required): Dimension-reduction function applied to topic_term_dists; must return an array of shape (n_topics, 2).
topic_term_dists (array, shape (n_topics, n_terms), required): Topic-term distributions.
topic_proportion (array, shape (n_topics,), required): Topic proportions.

Returns:

pd.DataFrame: Topic coordinates.

Source code in lexos\topic_model\mallet\scale_model.py
def _topic_coordinates(
    mds: np.array, topic_term_dists: np.array, topic_proportion: np.array
) -> pd.DataFrame:
    """Get coordinates for topics.

    Args:
        mds (Callable): Dimension-reduction function; must return an array of shape (`n_topics`, 2).
        topic_term_dists (array, shape (`n_topics`, `n_terms`)): Topic-term distributions.
        topic_proportion (array, shape (`n_topics`)): Topic proportions.

    Returns:
        pd.DataFrame: Topic coordinates.
    """
    K = topic_term_dists.shape[0]
    mds_res = mds(topic_term_dists)
    assert mds_res.shape == (K, 2)
    mds_df = pd.DataFrame(
        {
            "x": mds_res[:, 0],
            "y": mds_res[:, 1],
            "topics": range(1, K + 1),
            "cluster": 1,
            "Freq": topic_proportion * 100,
        }
    )
    # note: cluster (should?) be deprecated soon. See: https://github.com/cpsievert/LDAvis/issues/26
    return mds_df

lexos.topic_model.mallet.scale_model.get_topic_coordinates(topic_term_dists, doc_topic_dists, doc_lengths, vocab, term_frequency, mds=js_PCoA, sort_topics=True) ¤

Transform the topic model distributions and related corpus.

Creates the data structures needed for topic bubbles.

Parameters:

topic_term_dists (array-like, shape (n_topics, n_terms), required): Matrix of topic-term probabilities where n_terms is len(vocab).
doc_topic_dists (array-like, shape (n_docs, n_topics), required): Matrix of document-topic probabilities.
doc_lengths (array-like, shape n_docs, required): The length of each document, i.e. the number of words in each document. The order of the numbers should be consistent with the ordering of the docs in doc_topic_dists.
vocab (array-like, shape n_terms, required): List of all the words in the corpus used to train the model.
term_frequency (array-like, shape n_terms, required): The count of each particular term over the entire corpus. The ordering of these counts should correspond with vocab and topic_term_dists.
mds (Callable, default js_PCoA): A function that takes topic_term_dists as an input and outputs an n_topics by 2 distance matrix. The output approximates the distance between topics. See js_PCoA() for details on the default function. The strings 'pcoa', 'mmds', and 'tsne' (or upper-case variants) are also accepted; the latter two require the sklearn package.
sort_topics (bool, default True): Whether to sort topics by topic proportion (percentage of tokens covered). Set to False to keep the original topic order.

Returns:

scaled_coordinates (pd.DataFrame): A pandas dataframe containing scaled x and y coordinates.

Source code in lexos\topic_model\mallet\scale_model.py
def get_topic_coordinates(
    topic_term_dists: np.array,
    doc_topic_dists: np.array,
    doc_lengths: list,
    vocab: list,
    term_frequency: list,
    mds: Callable = js_PCoA,
    sort_topics: bool = True,
) -> pd.DataFrame:
    """Transform the topic model distributions and related corpus.

    Creates the data structures needed for topic bubbles.

    Args:
        topic_term_dists (array-like, shape (`n_topics`, `n_terms`)): Matrix of topic-term probabilities where
            `n_terms` is `len(vocab)`.
        doc_topic_dists (array-like, shape (`n_docs`, `n_topics`)): Matrix of document-topic probabilities.
        doc_lengths : (array-like, shape `n_docs`): The length of each document, i.e. the number of words
            in each document. The order of the numbers should be consistent with the ordering of the docs in
            `doc_topic_dists`.
        vocab (array-like, shape `n_terms`): List of all the words in the corpus used to train the model.
        term_frequency (array-like, shape `n_terms`): The count of each particular term over the entire corpus.
            The ordering of these counts should correspond with `vocab` and `topic_term_dists`.
        mds (Callable): A function that takes `topic_term_dists` as an input and outputs a `n_topics` by `2`
            distance matrix. The output approximates the distance between topics. See `js_PCoA()` for details
            on the default function. A string representation currently accepts `pcoa` (or upper case variant),
            `mmds` (or upper case variant) and `tsne` (or upper case variant), if `sklearn` package is installed
            for the latter two.
        sort_topics (bool): Whether to sort topics by topic proportion (percentage of tokens covered). Set to
            `False` to keep the original topic order.

    Returns:
        scaled_coordinates (pd.DataFrame): A pandas dataframe containing scaled x and y coordinates.
    """
    # parse mds
    # if isinstance(mds, basestring):
    if isinstance(mds, (str, bytes)):
        mds = mds.lower()
        if mds == "pcoa":
            mds = js_PCoA
        elif mds in ("mmds", "tsne"):
            if sklearn_present:
                mds_opts = {"mmds": js_MMDS, "tsne": js_TSNE}
                mds = mds_opts[mds]
            else:
                logging.warning("sklearn not present, switch to PCoA")
                mds = js_PCoA
        else:
            logging.warning("Unknown mds `%s`, switch to PCoA" % mds)
            mds = js_PCoA

    topic_term_dists = _df_with_names(topic_term_dists, "topic", "term")
    doc_topic_dists = _df_with_names(doc_topic_dists, "doc", "topic")
    term_frequency = _series_with_name(term_frequency, "term_frequency")
    doc_lengths = _series_with_name(doc_lengths, "doc_length")
    vocab = _series_with_name(vocab, "vocab")
    _input_validate(
        topic_term_dists, doc_topic_dists, doc_lengths, vocab, term_frequency
    )

    topic_freq = (doc_topic_dists.T * doc_lengths).T.sum()
    if sort_topics:
        topic_proportion = (topic_freq / topic_freq.sum()).sort_values(ascending=False)
    else:
        topic_proportion = topic_freq / topic_freq.sum()

    topic_order = topic_proportion.index
    topic_term_dists = topic_term_dists.iloc[topic_order]

    scaled_coordinates = _topic_coordinates(mds, topic_term_dists, topic_proportion)

    return scaled_coordinates
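
In practice the inputs usually come from convert_mallet_data(), which is what Mallet.scale() does internally (the state-file path is illustrative):

from lexos.topic_model.mallet import scale_model

data = scale_model.convert_mallet_data("my_model/state.gz")
coords = scale_model.get_topic_coordinates(**data)  # js_PCoA by default
coords.to_csv("my_model/topic_scaled.csv", index=False, header=False)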

lexos.topic_model.mallet.scale_model.extract_params(statefile) ¤

Extract the alpha and beta values from the statefile.

Parameters:

statefile (str, required): Path to statefile produced by MALLET.

Returns:

tuple: A tuple of (alpha (list), beta)

Source code in lexos\topic_model\mallet\scale_model.py
def extract_params(statefile: str) -> tuple:
    """Extract the alpha and beta values from the statefile.

    Args:
        statefile (str): Path to statefile produced by MALLET.

    Returns:
        tuple: A tuple of (alpha (list), beta)
    """
    with gzip.open(statefile, "r") as state:
        params = [x.decode("utf8").strip() for x in state.readlines()[1:3]]
    return (list(params[0].split(":")[1].split(" ")), float(params[1].split(":")[1]))
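
A short sketch (illustrative path). Note that alpha is the raw split of the header line, so its first element is an empty string; convert_mallet_data() below skips it with params[0][1:]:

from lexos.topic_model.mallet import scale_model

alpha, beta = scale_model.extract_params("my_model/state.gz")
print(alpha[1:4], beta)  # a few alpha values (as strings) and beta (a float)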

lexos.topic_model.mallet.scale_model.state_to_df(statefile) ¤

Transform state file into pandas dataframe.

The MALLET statefile is space-separated, and the first two rows contain the alpha and beta hyperparameters.

Parameters:

statefile (str, required): Path to statefile produced by MALLET.

Returns:

pd.DataFrame: The topic assignment for each token in each document of the model.

Source code in lexos\topic_model\mallet\scale_model.py
def state_to_df(statefile: str) -> pd.DataFrame:
    """Transform state file into pandas dataframe.

    The MALLET statefile is space-separated, and the first two rows contain the alpha and beta hyperparameters.

    Args:
        statefile (str): Path to statefile produced by MALLET.

    Returns:
        pd.DataFrame: The topic assignment for each token in each document of the model.
    """
    return pd.read_csv(statefile, compression="gzip", sep=" ", skiprows=[1, 2])
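
A sketch of the result for a standard MALLET state file (path illustrative; the column names come from the state file's own header line):

from lexos.topic_model.mallet import scale_model

df = scale_model.state_to_df("my_model/state.gz")
print(df.columns.tolist())  # ['#doc', 'source', 'pos', 'typeindex', 'type', 'topic']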

lexos.topic_model.mallet.scale_model.pivot_and_smooth(df, smooth_value, rows_variable, cols_variable, values_variable) ¤

Turn the pandas dataframe into a data matrix.

Parameters:

df (pd.DataFrame, required): The aggregated dataframe.
smooth_value (float, required): Value to add to the matrix to account for the priors.
rows_variable (str, required): The name of the dataframe column to use as the rows in the matrix.
cols_variable (str, required): The name of the dataframe column to use as the columns in the matrix.
values_variable (str, required): The name of the dataframe column to use as the values in the matrix.

Returns:

pd.DataFrame: A pandas matrix that has been normalized on the rows.

Source code in lexos\topic_model\mallet\scale_model.py
def pivot_and_smooth(
    df: pd.DataFrame,
    smooth_value: float,
    rows_variable: str,
    cols_variable: str,
    values_variable: str,
) -> pd.DataFrame:
    """Turn the pandas dataframe into a data matrix.

    Args:
        df (pd.DataFrame): The aggregated dataframe.
        smooth_value (float): Value to add to the matrix to account for the priors.
        rows_variable (str): The name of the dataframe column to use as the rows in the matrix.
        cols_variable (str): The name of the dataframe column to use as the columns in the matrix.
        values_variable (str): The name of the dataframe column to use as the values in the matrix.

    Returns:
        pd.DataFrame: A pandas matrix that has been normalized on the rows.
    """
    matrix = df.pivot(
        index=rows_variable, columns=cols_variable, values=values_variable
    ).fillna(value=0)
    matrix = matrix.values + smooth_value

    normed = sklearn.preprocessing.normalize(matrix, norm="l1", axis=1)

    return pd.DataFrame(normed)
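
A tiny illustration with hand-made counts, two topics over three terms; after smoothing and L1 normalization each row sums to 1:

import pandas as pd
from lexos.topic_model.mallet import scale_model

df = pd.DataFrame({
    "topic": [0, 0, 1, 1],
    "type": ["cat", "dog", "dog", "fish"],
    "token_count": [3, 1, 2, 2],
})
phi = scale_model.pivot_and_smooth(df, 0.01, "topic", "type", "token_count")
print(phi.sum(axis=1))  # each row sums to 1.0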

lexos.topic_model.mallet.scale_model.convert_mallet_data(state_file) ¤

Convert Mallet data to a structure compatible with pyLDAvis.

Parameters:

state_file (str, required): Mallet state file.

Returns:

data (dict): A dict containing pandas dataframes for the pyLDAvis prepare method.

Source code in lexos\topic_model\mallet\scale_model.py
def convert_mallet_data(state_file: str) -> dict:
    """Convert Mallet data to a structure compatible with pyLDAvis.

    Args:
        state_file (string): Mallet state file

    Returns:
        data (dict): A dict containing pandas dataframes for the pyLDAvis prepare method.
    """
    params = extract_params(state_file)
    alpha = [float(x) for x in params[0][1:]]
    beta = params[1]
    df = state_to_df(state_file)
    # Ensure that NaN is a string
    df["type"] = df.type.astype(str)
    # Get document lengths from statefile
    docs = df.groupby("#doc")["type"].count().reset_index(name="doc_length")
    # Get vocab and term frequencies from statefile
    vocab = df["type"].value_counts().reset_index()
    vocab.columns = ["type", "term_freq"]
    vocab = vocab.sort_values(by="type", ascending=True)
    phi_df = (
        df.groupby(["topic", "type"])["type"].count().reset_index(name="token_count")
    )
    phi_df = phi_df.sort_values(by="type", ascending=True)
    phi = pivot_and_smooth(phi_df, beta, "topic", "type", "token_count")
    theta_df = (
        df.groupby(["#doc", "topic"])["topic"].count().reset_index(name="topic_count")
    )
    theta = pivot_and_smooth(theta_df, alpha, "#doc", "topic", "topic_count")
    data = {
        "topic_term_dists": phi,
        "doc_topic_dists": theta,
        "doc_lengths": list(docs["doc_length"]),
        "vocab": list(vocab["type"]),
        "term_frequency": list(vocab["term_freq"]),
    }
    return data
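
A quick look at the returned structure (path illustrative):

from lexos.topic_model.mallet import scale_model

data = scale_model.convert_mallet_data("my_model/state.gz")
print(sorted(data))
# ['doc_lengths', 'doc_topic_dists', 'term_frequency', 'topic_term_dists', 'vocab']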

lexos.topic_model.dfr_browser.DfrBrowser ¤

DfrBrowser class.

Source code in lexos\topic_model\dfr_browser\__init__.py
class DfrBrowser:
    """DfrBrowser class."""

    def __init__(
        self,
        model_dir: str = ".",
        model_state_file: str = "state.gz",
        model_scaled_file: str = "topic_scaled.csv",
        template_dir: str = TEMPLATE_DIR,
    ) -> None:
        """Initialize DfrBrowser object."""
        self.template_dir = template_dir
        self.model_dir = model_dir
        self.model_state_file = f"{model_dir}/{model_state_file}"
        self.model_scaled_file = f"{model_dir}/{model_scaled_file}"
        self.browser_dir = f"{model_dir}/dfr_browser"
        self.data_dir = f"{self.browser_dir}/data"
        self.num_topics = None  # How to get this?

        # Make a browser directory and copy the template into it
        if not Path(self.browser_dir).exists():
            self._copy_template()

        # Create dfr-browser files using python script
        self._prepare_data()

        # Copy scaled file into data dir
        shutil.copy(self.model_scaled_file, self.data_dir)

        # Move meta.csv to data_dir, zip up, and rename, delete meta.csv copy
        self._move_metadata()

        # Update assets
        self._update_assets()

    def _copy_template(self):
        """Copy the template directory to the browser directory."""
        try:
            shutil.copytree(Path(self.template_dir), Path(self.browser_dir))
        except FileNotFoundError as e:
            raise LexosException(f"Could not find dfr-browser template: {e}")

    def _prepare_data(self):
        """Prepare the data for the dfr-browser visualization."""
        Path(f"{self.data_dir}").mkdir(parents=True, exist_ok=True)
        prepare_data_script = f"python {self.browser_dir}/bin/prepare-data"
        cmd = " ".join(
            [
                prepare_data_script,
                "convert-state",
                self.model_state_file,
                "--tw",
                f"{self.data_dir}/tw.json",
                "--dt",
                f"{self.data_dir}/dt.json.zip",
            ]
        )
        cmd = shlex.split(cmd)
        try:
            output = check_output(
                cmd, stderr=STDOUT, shell=True, universal_newlines=True
            )
            print(output)
        except CalledProcessError as e:
            raise LexosException(e.output)
        cmd = " ".join(
            [prepare_data_script, "info-stub", "-o", f"{self.data_dir}/info.json"]
        )
        cmd = shlex.split(cmd)
        try:
            output = check_output(
                cmd, stderr=STDOUT, shell=True, universal_newlines=True
            )
            print(output)
        except CalledProcessError as e:
            raise LexosException(e.output)

    def _move_metadata(self):
        """Move meta.csv to data_dir, zip up, rename, and delete meta.csv copy."""
        meta_zip = f"{self.data_dir}/meta.csv.zip"
        if Path(meta_zip).exists():
            Path(meta_zip).unlink()
        browser_meta_file = f"{self.model_dir}/meta.csv"
        shutil.copy(browser_meta_file, self.data_dir)
        try:
            shutil.make_archive(
                f"{self.data_dir}/meta.csv", "zip", self.data_dir, "meta.csv"
            )
        except OSError as err:
            raise LexosException(f"Error writing meta.csv.zip: {err}")

    def _update_assets(self):
        """Update browser assets."""
        # Tweak default index.html to link to JSON, not JSTOR
        with open(f"{self.browser_dir}/index.html", "r") as f:
            filedata = f.read().replace("on JSTOR", "JSON")
        with open(f"{self.browser_dir}/index.html", "w") as f:
            f.write(filedata)
        # Tweak js file to link to the domain
        with open(
            f"{self.browser_dir}/js/dfb.min.js.custom", "r", encoding="utf-8"
        ) as f:
            filedata = f.read()
        pat = r"t\.select\(\"#doc_remark a\.url\"\).attr\(\"href\", .+?\);"
        new_pat = r'var doc_url = document.URL.split("modules")[0] + "project_data"; t.select("#doc_remark a.url")'
        new_pat += r'.attr("href", doc_url + "/" + e.url);'
        filedata = re.sub(pat, new_pat, filedata)
        with open(f"{self.browser_dir}/js/dfb.min.js", "w", encoding="utf-8") as f:
            f.write(filedata)

    def run(self, port: int = 8080) -> None:
        """Run the dfr-browser.

        This might work on the Jupyter port, but it might not.
        """
        # run_server = f"python {self.browser_dir}/bin/server"
        import os
        import sys
        import threading
        import time
        import webbrowser as w
        from http.server import HTTPServer, SimpleHTTPRequestHandler

        # set up the HTTP server and start it in a separate daemon thread
        httpd = HTTPServer(("localhost", port), SimpleHTTPRequestHandler)
        thread = threading.Thread(target=httpd.serve_forever)
        thread.daemon = True

        # if startup time is too long we might want to be able to quit the program
        current_dir = os.getcwd()
        try:
            os.chdir(self.browser_dir)
            thread.start()
        except KeyboardInterrupt:
            httpd.shutdown()
            os.chdir(current_dir)
            sys.exit(0)

        # wait until the webserver finished starting up (maybe wait longer or shorter...)
        time.sleep(3)

        # start sending requests
        w.open(f"http://127.0.0.1:{port}/")

__init__(model_dir='.', model_state_file='state.gz', model_scaled_file='topic_scaled.csv', template_dir=TEMPLATE_DIR) ¤

Initialize DfrBrowser object.

Source code in lexos\topic_model\dfr_browser\__init__.py
def __init__(
    self,
    model_dir: str = ".",
    model_state_file: str = "state.gz",
    model_scaled_file: str = "topic_scaled.csv",
    template_dir: str = TEMPLATE_DIR,
) -> None:
    """Initialize DfrBrowser object."""
    self.template_dir = template_dir
    self.model_dir = model_dir
    self.model_state_file = f"{model_dir}/{model_state_file}"
    self.model_scaled_file = f"{model_dir}/{model_scaled_file}"
    self.browser_dir = f"{model_dir}/dfr_browser"
    self.data_dir = f"{self.browser_dir}/data"
    self.num_topics = None  # How to get this?

    # Make a browser directory and copy the template into it
    if not Path(self.browser_dir).exists():
        self._copy_template()

    # Create dfr-browser files using python script
    self._prepare_data()

    # Copy scaled file into data dir
    shutil.copy(self.model_scaled_file, self.data_dir)

    # Move meta.csv to data_dir, zip up, and rename, delete meta.csv copy
    self._move_metadata()

    # Update assets
    self._update_assets()

run(port=8080) ¤

Run the dfr-browser.

This might work on the Jupyter port, but it might not.

Source code in lexos\topic_model\dfr_browser\__init__.py
def run(self, port: int = 8080) -> None:
    """Run the dfr-browser.

    This might work on the Jupyter port, but it might not.
    """
    # run_server = f"python {self.browser_dir}/bin/server"
    import os
    import sys
    import threading
    import time
    import webbrowser as w
    from http.server import HTTPServer, SimpleHTTPRequestHandler

    # set up the HTTP server and start it in a separate daemon thread
    httpd = HTTPServer(("localhost", port), SimpleHTTPRequestHandler)
    thread = threading.Thread(target=httpd.serve_forever)
    thread.daemon = True

    # if startup time is too long we might want to be able to quit the program
    current_dir = os.getcwd()
    try:
        os.chdir(self.browser_dir)
        thread.start()
    except KeyboardInterrupt:
        httpd.shutdown()
        os.chdir(current_dir)
        sys.exit(0)

    # wait until the webserver finished starting up (maybe wait longer or shorter...)
    time.sleep(3)

    # start sending requests
    w.open(f"http://127.0.0.1:{port}/")
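
A closing sketch (assumes the model directory already contains state.gz, topic_scaled.csv, and a meta.csv of document metadata from earlier steps):

from lexos.topic_model.dfr_browser import DfrBrowser

browser = DfrBrowser(model_dir="my_model")
browser.run(port=8080)  # serves the browser and opens http://127.0.0.1:8080/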