Processors¤

`process_data(data: Any, docs: Optional[int | str | list[int] | list[str]] = None, limit: Optional[int] = Field(None, gt=0, description='Limit on number of terms to return')) -> dict[str, int]` ¤

Process any supported data type into a consistent format of term counts.

Parameters:

Name	Type	Description	Default
`data`	`Any`	The input data to process	required
`docs`	`Optional[int \| str \| list[int] \| list[str]]`	Optional document selection for multi-document data	`None`
`limit`	`Optional[int]`	Optional limit on number of terms to return	`Field(None, gt=0, description='Limit on number of terms to return')`

Returns:

Type	Description
`dict[str, int]`	Dictionary with terms as keys and counts as values

Raises:

Type	Description
`LexosException`	If data type is unsupported

Source code in lexos/visualization/processors.py

@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def process_data(
    data: Any,
    docs: Optional[int | str | list[int] | list[str]] = None,
    limit: Optional[int] = Field(
        None, gt=0, description="Limit on number of terms to return"
    ),
) -> dict[str, int]:
    """Process any supported data type into a consistent format of term counts.

    Dispatches on the runtime type of ``data``: plain strings are split on
    whitespace, spaCy ``Doc``/``Span`` objects are counted by token text,
    dictionaries are taken as already-counted terms, and lists, ``DTM``
    objects and DataFrames are delegated to the dedicated processors.

    Args:
        data: The input data to process
        docs: Optional document selection for multi-document data
        limit: Optional limit on number of terms to return; when given, only
            the ``limit`` most common terms are kept

    Returns:
        dict[str, int]: Dictionary with terms as keys and counts as values.
            Counts are returned as supplied and are not coerced to ``int``.

    Raises:
        LexosException: If data type is unsupported
    """
    if isinstance(data, str):
        # Plain whitespace split.  TODO: Use better tokenizer
        counts = Counter(data.split())
    elif isinstance(data, (Doc, Span)):
        counts = Counter(token.text for token in data)
    elif isinstance(data, dict):
        # Already in term -> count form
        counts = Counter(data)
    elif isinstance(data, list):
        counts = _process_list_data(data, docs)
    elif isinstance(data, DTM):
        counts = process_dtm(data, docs)
    elif isinstance(data, pd.DataFrame):
        counts = process_dataframe(data, docs)
    else:
        raise LexosException(
            f"Unsupported data type: {type(data)}. "
            "Supported types: str, dict, list, DTM, DataFrame, spaCy Doc/Span objects."
        )

    # NOTE: counts are deliberately NOT cast to int.  Casting would make the
    # function unusable for data with float counts, such as topic model
    # distributions, and none of the current use cases need it.

    if limit is not None:
        # most_common() returns the pairs in descending count order
        return dict(counts.most_common(limit))

    return dict(counts)

`filter_docs(df: pd.DataFrame, docs: Optional[list[int] | list[str]] = None) -> pd.DataFrame` ¤

Filter the documents in a DTM.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	A Document Term Matrix.	required
`docs`	`Optional[list[int] \| list[str]]`	A list of document indices or labels to filter the DTM.	`None`

Returns:

Type	Description
`DataFrame`	A filtered DTM.

Source code in lexos/visualization/processors.py

def filter_docs(
    df: pd.DataFrame, docs: Optional[list[int] | list[str]] = None
) -> pd.DataFrame:
    """Filter the documents in a DTM.

    Args:
        df: A Document Term Matrix.
        docs: A list of document indices or labels to filter the DTM.

    Returns:
        A filtered DTM.
    """
    if docs:
        if isinstance(docs[0], str):
            return df[docs]
        elif isinstance(docs[0], int):
            return df.iloc[:, docs]
    return df

`process_dataframe(df: pd.DataFrame, docs: Optional[int | str | list[int] | list[str]] = None) -> Counter` ¤

Generate a term frequency dictionary from a DTM.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	A Document Term Matrix object.	required
`docs`	`Optional[int \| str \| list[int] \| list[str]]`	A list of document indices or labels to filter the DTM.	`None`

Returns:

Name	Type	Description
`Counter`	`Counter`	A Counter object with the terms as keys and the counts as values.

Source code in lexos/visualization/processors.py

@validate_call(
    config=ConfigDict(
        arbitrary_types_allowed=True, json_schema_extra=DocJSONSchema.schema()
    )
)
def process_dataframe(
    df: pd.DataFrame, docs: Optional[int | str | list[int] | list[str]] = None
) -> Counter:
    """Generate a term frequency Counter from a DTM DataFrame.

    Each row is summed across the selected document columns; rows whose
    total is not greater than zero are dropped.

    Args:
        df (pd.DataFrame): A Document Term Matrix object.
        docs (Optional[int | str | list[int] | list[str]]): A document index or label, or a list of them, used to filter the DTM.

    Returns:
        Counter: A Counter object with the terms as keys and the counts as values.
    """
    # Restrict to the requested document columns
    selected = filter_docs(df, ensure_list(docs))
    # Sum into a standalone Series: no copy of the full matrix and no
    # temporary "counts" column are needed
    totals = selected.sum(axis=1)
    # Keep only terms that actually occur in the selection
    return Counter(totals[totals > 0].to_dict())

`process_dtm(dtm: DTM, docs: Optional[int | str | list[int] | list[str]] = None) -> dict[str, int]` ¤

Generate a term frequency dictionary from a DTM.

Parameters:

Name	Type	Description	Default
`dtm`	`DTM`	A Document Term Matrix object.	required
`docs`	`Optional[int \| str \| list[int] \| list[str]]`	A list of document indices or labels to filter the DTM.	`None`

Returns:

Type	Description
`dict[str, int]`	A dictionary with the terms as keys and the counts as values.

Source code in lexos/visualization/processors.py

@validate_call(
    config=ConfigDict(
        arbitrary_types_allowed=True, json_schema_extra=DocJSONSchema.schema()
    )
)
def process_dtm(
    dtm: DTM, docs: Optional[int | str | list[int] | list[str]] = None
) -> dict[str, int]:
    """Build term counts from a DTM object.

    The DTM is converted with ``dtm.to_df()``, narrowed to the requested
    documents, and then summed by ``process_dataframe``.

    Args:
        dtm (DTM): A Document Term Matrix object.
        docs (Optional[int | str | list[int] | list[str]]): A list of document indices or labels to filter the DTM.

    Returns:
        dict[str, int]: A dictionary with the terms as keys and the counts as values.
    """
    # Convert and filter in one step; the counting itself is delegated
    filtered = filter_docs(dtm.to_df(), ensure_list(docs))
    return process_dataframe(filtered)

`process_list(data: list[list[Doc | Span] | list[str] | list[Token]], docs: Optional[int | list[int]]) -> Counter` ¤

Process a list of docs, spans, strings, or tokens.

Parameters:

Name	Type	Description	Default
`data`	`list[list[Doc \| Span] \| list[str] \| list[Token]]`	The data.	required
`docs`	`Optional[int \| list[int]]`	A document index or a list of document indices to select from `data`.	required

Returns:

Name	Type	Description
`Counter`	`Counter`	A Counter object with the terms as keys and the counts as values.

Source code in lexos/visualization/processors.py

@validate_call(
    config=ConfigDict(
        arbitrary_types_allowed=True, json_schema_extra=DocJSONSchema.schema()
    )
)
def process_list(
    data: list[list[Doc | Span] | list[str] | list[Token]],
    docs: Optional[int | list[int]],
) -> Counter:
    """Process a list of lists of docs, spans, strings, or tokens.

    The inner lists are optionally filtered by position, flattened, and each
    item is reduced to text: strings are used as-is, tokens contribute their
    ``text``, and docs/spans contribute the text of each of their tokens.

    Args:
        data (list[list[Doc | Span] | list[str] | list[Token]]): The data.
        docs (Optional[int | list[int]]): An index or list of indices of the inner lists to keep. ``None`` or an empty list keeps everything.

    Returns:
        Counter: A Counter object with the terms as keys and the counts as values.
    """
    # `0` is a valid selection (the first inner list) even though it is falsy,
    # so a plain truthiness test would silently ignore it.
    if isinstance(docs, int) or docs:
        wanted = set(ensure_list(docs))
        data = [item for i, item in enumerate(data) if i in wanted]

    # Always flatten before extracting text, so the result is keyed by strings
    # whether or not a selection was made.
    terms = []
    for item in chain.from_iterable(data):
        if isinstance(item, str):
            terms.append(item)
        elif isinstance(item, Token):
            terms.append(item.text)
        elif isinstance(item, (Doc, Span)):
            terms.extend(token.text for token in item)
        else:
            # Not expected for input matching the annotated signature; count
            # the item unchanged, as the original fallback did.
            terms.append(item)
    return Counter(terms)

`process_docs(data: list[Doc] | list[Span], docs: Optional[int | list[int]]) -> Counter` ¤

Process multiple docs or spans.

Parameters:

Name	Type	Description	Default
`data`	`list[Doc] \| list[Span]`	The data.	required
`docs`	`Optional[int \| list[int]]`	A document index or a list of document indices to select from `data`.	required

Returns:

Name	Type	Description
`Counter`	`Counter`	A Counter object with the terms as keys and the counts as values.

Source code in lexos/visualization/processors.py

@validate_call(
    config=ConfigDict(
        arbitrary_types_allowed=True, json_schema_extra=DocJSONSchema.schema()
    )
)
def process_docs(
    data: list[Doc] | list[Span], docs: Optional[int | list[int]]
) -> Counter:
    """Process multiple docs or spans.

    Args:
        data (list[Doc] | list[Span]): The data.
        docs (Optional[int | list[int]]): An index or list of indices of the items in ``data`` to keep. ``None`` or an empty list keeps everything.

    Returns:
        Counter: A Counter object with the terms as keys and the counts as values.
    """
    # `0` is a valid selection (the first document) even though it is falsy,
    # so a plain truthiness test would silently ignore it.
    if isinstance(docs, int) or docs:
        wanted = set(ensure_list(docs))
        data = [item for i, item in enumerate(data) if i in wanted]

    # Count token texts across all remaining documents
    return Counter(token.text for doc in data for token in doc)

`process_item(data: Doc | Span | list[str] | list[Token]) -> Counter` ¤

Process single docs, spans, and strings, or flat lists of strings or tokens.

Parameters:

Name	Type	Description	Default
`data`	`Doc \| Span \| list[str] \| list[Token]`	The data.	required

Returns:

Type	Description
`Counter`	A Counter object with the terms as keys and the counts as values.

Source code in lexos/visualization/processors.py

@validate_call(
    config=ConfigDict(
        arbitrary_types_allowed=True, json_schema_extra=DocJSONSchema.schema()
    )
)
def process_item(
    data: Doc | Span | list[str] | list[Token],
) -> Counter:
    """Process a single doc or span, or a flat list of strings or tokens.

    Args:
        data (Doc | Span | list[str] | list[Token]): The data.

    Returns:
        Counter: A Counter object with the terms as keys and the counts as values. An empty list yields an empty Counter.
    """
    if isinstance(data, (Doc, Span)):
        return Counter(token.text for token in data)

    # Flat list: decide per item rather than by peeking at data[0], so an
    # empty list no longer raises IndexError and no branch can leave the
    # terms undefined.
    return Counter(
        item.text if isinstance(item, Token) else item for item in data
    )

`multicloud_processor(data: DTM | pd.DataFrame | list[Doc] | list[Span] | list[list[str]] | list[list[Token]] | list[dict[str, int]], docs: Optional[int | str | list[int] | list[str]] = None) -> list[dict[str, int]]` ¤

Process data into list of term-count dicts for multicloud visualization.

Parameters:

Name	Type	Description	Default
`data`	`DTM \| pd.DataFrame \| list[Doc] \| list[Span] \| list[list[str]] \| list[list[Token]] \| list[dict[str, int]]`	The data.	required
`docs`	`Optional[int \| str \| list[int] \| list[str]]`	A list of document indices or labels to be selected from the DTM.	`None`

Returns:

Type	Description
`list[dict[str, int]]`	A list of dictionaries with the terms as keys and the counts as values.

Source code in lexos/visualization/processors.py

@validate_call(
    config=ConfigDict(
        arbitrary_types_allowed=True, json_schema_extra=DocJSONSchema.schema()
    )
)
def multicloud_processor(
    data: DTM
    | pd.DataFrame
    | list[Doc]
    | list[Span]
    | list[list[str]]
    | list[list[Token]]
    | list[dict[str, int]],
    docs: Optional[int | str | list[int] | list[str]] = None,
) -> list[dict[str, int]]:
    """Process data into list of term-count dicts for multicloud visualization.

    Args:
        data (DTM | pd.DataFrame | list[Doc] | list[Span] | list[list[str]] | list[list[Token]] | list[dict[str, int]]): The data.
        docs (Optional[int | str | list[int] | list[str]]): A document index or label, or a list of them, to be selected. Labels are only supported for DTM and DataFrame input.

    Returns:
        list[dict[str, int]]: A list of dictionaries with the terms as keys and the counts as values, one per document.

    Raises:
        LexosException: If label-based selection is requested for list input, or if the (filtered) list is empty or an inner list is empty.
    """
    # A DTM is handled through its DataFrame form
    if isinstance(data, DTM):
        data = data.to_df()

    if isinstance(data, pd.DataFrame):
        df = filter_docs(data, ensure_list(docs))
        # Transposing gives one record per document column
        records = df.T.to_dict(orient="records")
        # Eliminate tokens with zero counts in each doc
        return [{k: v for k, v in record.items() if v != 0} for record in records]

    # List input.  `0` is a valid selection (the first document) even though
    # it is falsy, so a plain truthiness test would silently ignore it.
    if isinstance(docs, int) or docs:
        selected = ensure_list(docs)
        if isinstance(selected[0], str):
            raise LexosException(
                "Filtering by document labels is not yet supported for your data type. You may use list index numbers to select documents for processing."
            )
        wanted = set(selected)
        data = [item for i, item in enumerate(data) if i in wanted]

    try:
        first = data[0]

        # Docs and Spans
        if isinstance(first, (Doc, Span)):
            return [dict(Counter(token.text for token in doc)) for doc in data]

        # Lists of dicts are already in the target format
        elif isinstance(first, dict):
            return data

        # Lists of strings
        elif isinstance(first[0], str):
            return [dict(Counter(doc)) for doc in data]

        # Lists of Tokens
        elif isinstance(first[0], Token):
            return [dict(Counter(token.text for token in doc)) for doc in data]
    except IndexError as err:
        # Raised by data[0] or first[0] when the list or its first item is empty
        raise LexosException(
            "Data is empty or not in the expected format. "
            "Ensure you are passing a non-empty list of documents, spans, or strings."
        ) from err

`get_rows(lst, n) -> Iterator[int]` ¤

Yield successive n-sized rows from a list of documents.

Parameters:

Name	Type	Description	Default
`lst`	`list`	A list of documents.	required
`n`	`int`	The number of columns in the row.	required

Yields:

Type	Description
`list`	The next row: a list of at most `n` documents.

Source code in lexos/visualization/processors.py

def get_rows(lst: list, n: int) -> Iterator[list]:
    """Yield successive n-sized rows from a list of documents.

    The final row is shorter than ``n`` when ``len(lst)`` is not a multiple
    of ``n``. An empty list yields nothing.

    Args:
        lst (list): A list of documents.
        n (int): The number of columns in the row.

    Yields:
        list: The next row, a slice of ``lst`` with at most ``n`` documents.

    Raises:
        ValueError: If ``n`` is 0 (raised by ``range`` when iteration starts).
    """
    for start in range(0, len(lst), n):
        yield lst[start : start + n]

`_process_list_data(data: list, docs: Optional[int | str | list[int] | list[str]] = None) -> Counter` ¤

Process list-type data inputs.

Parameters:

Name	Type	Description	Default
`data`	`list`	List data to process	required
`docs`	`Optional[int \| str \| list[int] \| list[str]]`	Optional document selection	`None`

Returns:

Name	Type	Description
`Counter`	`Counter`	Counter object with term counts

Source code in lexos/visualization/processors.py

def _process_list_data(
    data: list, docs: Optional[int | str | list[int] | list[str]] = None
) -> Counter:
    """Route list input to the processor matching its element type.

    All elements must be instances of the first element's type. Lists of
    lists go to ``process_list``, lists of docs/spans go to ``process_docs``,
    lists of tokens are counted by token text, and anything else is handed
    to ``process_item``.

    Args:
        data: List data to process
        docs: Optional document selection; only forwarded for lists of lists
            and lists of docs/spans

    Returns:
        Counter: Counter object with term counts (empty for an empty list)

    Raises:
        LexosException: If the list mixes element types
    """
    # Empty input has nothing to count
    if not data:
        return Counter()

    head = data[0]
    expected = type(head)

    # Reject heterogeneous lists up front
    for element in data:
        if not isinstance(element, expected):
            raise LexosException(
                f"Mixed types found in list: {expected} and {[type(x) for x in data]}"
            )

    # Nested lists
    if isinstance(head, list):
        return process_list(data, docs)

    # spaCy docs or spans
    if isinstance(head, (Doc, Span)):
        return process_docs(data, docs)

    # Flat list of tokens
    if isinstance(head, Token):
        return Counter([token.text for token in data])

    # Anything else is treated as a flat list of strings
    return process_item(data)

rendering:
  show_root_heading: true
  heading_level: 3

Processors¤

process_data(data: Any, docs: Optional[int | str | list[int] | list[str]] = None, limit: Optional[int] = Field(None, gt=0, description='Limit on number of terms to return')) -> dict[str, int] ¤

filter_docs(df: pd.DataFrame, docs: Optional[list[int] | list[str]] = None) -> pd.DataFrame ¤

process_dataframe(df: pd.DataFrame, docs: Optional[int | str | list[int] | list[str]] = None) -> Counter ¤

process_dtm(dtm: DTM, docs: Optional[int | str | list[int] | list[str]] = None) -> dict[str, int] ¤

process_list(data: list[list[Doc | Span] | list[str] | list[Token]], docs: Optional[int | list[int]]) -> Counter ¤

process_docs(data: list[Doc] | list[Span], docs: Optional[int | list[int]]) -> Counter ¤

process_item(data: Doc | Span | list[str] | list[Token]) -> Counter ¤

multicloud_processor(data: DTM | pd.DataFrame | list[Doc] | list[Span] | list[list[str]] | list[list[Token]] | list[dict[str, int]], docs: Optional[int | str | list[int] | list[str]] = None) -> list[dict[str, int]] ¤

get_rows(lst, n) -> Iterator[int] ¤

_process_list_data(data: list, docs: Optional[int | str | list[int] | list[str]] = None) -> Counter ¤

`process_data(data: Any, docs: Optional[int | str | list[int] | list[str]] = None, limit: Optional[int] = Field(None, gt=0, description='Limit on number of terms to return')) -> dict[str, int]` ¤

`filter_docs(df: pd.DataFrame, docs: Optional[list[int] | list[str]] = None) -> pd.DataFrame` ¤

`process_dataframe(df: pd.DataFrame, docs: Optional[int | str | list[int] | list[str]] = None) -> Counter` ¤

`process_dtm(dtm: DTM, docs: Optional[int | str | list[int] | list[str]] = None) -> dict[str, int]` ¤

`process_list(data: list[list[Doc | Span] | list[str] | list[Token]], docs: Optional[int | list[int]]) -> Counter` ¤

`process_docs(data: list[Doc] | list[Span], docs: Optional[int | list[int]]) -> Counter` ¤

`process_item(data: Doc | Span | list[str] | list[Token]) -> Counter` ¤

`multicloud_processor(data: DTM | pd.DataFrame | list[Doc] | list[Span] | list[list[str]] | list[list[Token]] | list[dict[str, int]], docs: Optional[int | str | list[int] | list[str]] = None) -> list[dict[str, int]]` ¤

`get_rows(lst, n) -> Iterator[int]` ¤

`_process_list_data(data: list, docs: Optional[int | str | list[int] | list[str]] = None) -> Counter` ¤