KWIC

Kwic pydantic-model

Bases: BaseModel

Class for finding keywords in context (KWIC) in text or spaCy documents.

Config:

  • arbitrary_types_allowed: True

Fields:

  • nlp (Optional[str])
  • alg (Optional[ns])
Source code in lexos/kwic/__init__.py
class Kwic(BaseModel):
    """Class for finding keywords in context (KWIC) in text or spaCy documents."""

    nlp: Optional[str] = Field(
        default="xx_sent_ud_sm", description="The spaCy model to use for tokenization."
    )
    alg: Optional[ns] = Field(
        default=ns.LOCALE, description="The sorting algorithm to use."
    )

    model_config = ConfigDict(arbitrary_types_allowed=True)

    def __init__(self, **data):
        """Initialize the Kwic class with a spaCy model."""
        super().__init__(**data)
        self.nlp = spacy.load(self.nlp)

        # Make sure the sorting algorithm is valid
        self._validate_sorting_algorithm()

    def __call__(
        self,
        docs: Optional[Doc | str | list[Doc | str]] = Field(
            default_factory=list,
            description="The spaCy Doc(s) or string(s) to search within.",
        ),
        labels: Optional[str | list[str]] = Field(
            None,
            description="A list of labels for the documents. Defaults to None.",
        ),
        patterns: list = Field(
            default_factory=list,
            description="A list of patterns to match. Can be regex strings or spaCy token patterns.",
        ),
        window: Optional[int] = Field(
            50,
            description="The number of tokens or characters to include before and after the match.",
        ),
        matcher: Optional[str] = Field(
            "characters",
            description="The type of matcher to use. Can be 'rule' for spaCy Matcher, 'phrase' for PhraseMatcher, 'tokens' for token patterns, or 'characters' for string matching.",
        ),
        case_sensitive: Optional[bool] = Field(
            False,
            description="If True, the matching will be case-sensitive. Defaults to False.",
        ),
        use_regex: Optional[bool] = Field(
            False,
            description="If True, use regex for matching with the 'tokens' setting. Defaults to False.",
        ),
        as_df: Optional[bool] = Field(
            True,
            description="If True, return results as a pandas DataFrame. Defaults to True.",
        ),
        sort_by: Optional[str] = Field(
            "keyword",  # Make sure this matches the column name exactly
            description="The column to sort the results by if as_df is True. Defaults to 'keyword'.",
        ),
        ascending: Optional[bool] = Field(
            True,
            description="If True, sort in ascending order. Defaults to True.",
        ),
    ) -> list[tuple[str, str, str, str]] | pd.DataFrame:
        """Call the Kwic instance to find keywords in context.

        Returns:
            list: A list of tuples, each containing the document label, the context before,
                the matched keyword, and the context after, or a DataFrame with the same content.
        """
        # Validate input types
        if matcher in ["rule", "phrase", "tokens"] and any(
            isinstance(doc, str) for doc in docs
        ):
            raise LexosException(
                "Docs must be spaCy Doc objects when using 'rule', 'phrase', or 'tokens' matcher. To search raw text strings, use the 'characters' matcher type, setting `use_regex` if you wish to use regex patterns."
            )

        # Ensure that docs and labels are lists of equal length
        docs = ensure_list(docs)
        if labels:
            labels = ensure_list(labels)
            if len(docs) != len(labels):
                raise LexosException(
                    "The number of documents and labels must match. If you do not want to label the documents, set `labels` to None."
                )
        else:
            labels = [f"Doc {i + 1}" for i in range(len(docs))]

        # Assign search parameters and call match method
        match matcher:
            case "rule":
                matcher = Matcher(self.nlp.vocab)
                matcher.add("KWIC_PATTERNS", patterns)
                hits = self._match_tokens(docs, labels, window, matcher)
            case "phrase":
                if case_sensitive:
                    matcher = PhraseMatcher(self.nlp.vocab)
                else:
                    matcher = PhraseMatcher(self.nlp.vocab, attr="LOWER")
                patterns = [self.nlp.make_doc(phrase) for phrase in patterns]
                matcher.add("KWIC_PATTERNS", patterns)
                hits = self._match_tokens(docs, labels, window, matcher)
            case "tokens":
                matcher = Matcher(self.nlp.vocab)
                patterns = ensure_list(patterns)
                patterns = self._convert_patterns_to_spacy(
                    patterns, case_sensitive, use_regex
                )
                matcher.add("KWIC_PATTERNS", patterns)
                hits = self._match_tokens(docs, labels, window, matcher)
            case _:
                docs = [doc.text if isinstance(doc, Doc) else doc for doc in docs]
                patterns = ensure_list(patterns)
                hits = list(
                    self._match_strings(
                        docs, labels, patterns, window, case_sensitive=case_sensitive
                    )
                )

        # Convert hits to DataFrame for sorting
        df = pd.DataFrame(
            hits, columns=["doc", "context_before", "keyword", "context_after"]
        )

        # Only sort if we have data and the sort_by column exists
        if not df.empty and sort_by in df.columns:
            df = df.sort_values(
                by=sort_by, ascending=ascending, key=natsort_keygen(alg=self.alg)
            )

        # If as_df is False, convert the DataFrame to a list of tuples
        if not as_df:
            result = list(df.to_records(index=False))
            return [tuple(item) for item in result]

        return df

    def _convert_patterns_to_spacy(
        self, patterns: list, case_sensitive: bool, use_regex: bool
    ) -> list:
        """Convert a list of string patterns to spaCy token patterns.

        Args:
            patterns (list): A list of string patterns to convert.
            case_sensitive (bool): If True, the patterns will be case-sensitive.
            use_regex (bool): If True, the patterns will be treated as regex patterns.

        Returns:
            list: A list of spaCy token patterns.
        """
        if use_regex:
            if case_sensitive:
                return [[{"TEXT": {"REGEX": pattern}}] for pattern in patterns]
            else:
                # Scope an inline flag rather than lowercasing the pattern,
                # which would corrupt escape sequences such as \W or \B.
                return [[{"TEXT": {"REGEX": f"(?i:{pattern})"}}] for pattern in patterns]
        else:
            if case_sensitive:
                return [[{"TEXT": pattern}] for pattern in patterns]
            else:
                # LOWER holds the lowercased token text, so lowercase the literal.
                return [[{"LOWER": pattern.lower()}] for pattern in patterns]

    def _match_strings(
        self,
        docs: list[str],
        labels: list[str],
        patterns: list,
        window: int,
        case_sensitive: bool,
    ):
        """Match keywords in a string and return their context.

        Args:
            docs (list[str]): The text to search within.
            labels (list[str]): A list of labels for the documents.
            patterns (list): A list of regex patterns to match.
            window (int): The number of characters to include before and after the match.
            case_sensitive (bool): If True, the matching will be case-sensitive.

        Yields:
            tuple (tuple): A tuple containing the document label, the context before, the matched keyword, and the context after.
        """
        flags = 0 if case_sensitive else re.IGNORECASE
        for i, doc in enumerate(docs):
            for pattern in patterns:
                for match in re.finditer(pattern, doc, flags=flags):
                    start = match.start()
                    end = match.end()
                    context_start = max(0, start - window)
                    context_end = min(len(doc), end + window)
                    context_before = doc[context_start:start]
                    context_after = doc[end:context_end]
                    yield (labels[i], context_before, match.group(), context_after)

    def _match_tokens(
        self, docs: list[Doc], labels: list[str], window: int, matcher: Matcher
    ) -> list[tuple[str, str, str, str]]:
        """Match keywords in a spaCy Doc and return their context.

        Args:
            docs (list[Doc]): The spaCy Doc(s) to search within.
            labels (list[str]): A list of labels for the documents.
            window (int): The number of tokens to include before and after the match.
            matcher (Matcher): The spaCy Matcher object with patterns added.

        Returns:
            list[tuple[str, str, str, str]]: A list of tuples, each containing the document label, the context before, the matched keyword, and the context after.
        """
        hits = []  # List to store the hits
        for i, doc in enumerate(docs):
            matches = matcher(doc)
            for _, start, end in matches:
                span = doc[start:end]  # The matched span (keyword)
                context_start = max(0, start - window)  # Start of context window
                context_end = min(len(doc), end + window)  # End of context window
                context_before = doc[context_start : span.start]
                context_after = doc[span.end : context_end]
                hits.append(
                    (labels[i], context_before.text, span.text, context_after.text)
                )
        return hits

    def _validate_sorting_algorithm(self) -> bool:
        """Ensure that the specified sorting algorithm is a valid natsort locale.

        Returns:
            bool: Whether the sorting algorithm is valid.
        """
        if self.alg not in [e for e in ns]:
            locales = ", ".join([f"ns.{e.name}" for e in ns])
            err = (
                f"Invalid sorting algorithm: {self.alg}.",
                f"Valid algorithms for `alg` are: {locales}.",
                "See https://natsort.readthedocs.io/en/stable/api.html#natsort.ns.",
            )
            raise LexosException(" ".join(err))
        return True
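
A minimal usage sketch (assuming the lexos package and the default xx_sent_ud_sm spaCy model are installed; the sample text and pattern are illustrative):

from lexos.kwic import Kwic

kwic = Kwic()  # loads the default xx_sent_ud_sm pipeline and validates `alg`
df = kwic(
    docs=["The cat sat on the mat."],
    patterns=["cat"],
    matcher="characters",
    window=8,
)
# df has the columns: doc, context_before, keyword, context_after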

nlp: Optional[str] = 'xx_sent_ud_sm' pydantic-field

The spaCy model to use for tokenization.

alg: Optional[ns] = ns.LOCALE pydantic-field

The sorting algorithm to use.

model_config = ConfigDict(arbitrary_types_allowed=True) class-attribute instance-attribute

__init__(**data)

Initialize the Kwic class with a spaCy model.

Source code in lexos/kwic/__init__.py
def __init__(self, **data):
    """Initialize the Kwic class with a spaCy model."""
    super().__init__(**data)
    self.nlp = spacy.load(self.nlp)

    # Make sure the sorting algorithm is valid
    self._validate_sorting_algorithm()
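
A construction sketch; the model name is passed as a string and replaced by the loaded Language object (en_core_web_sm here is an illustrative alternative that must be downloaded separately):

from natsort import ns
from lexos.kwic import Kwic

kwic = Kwic()                                        # default multilingual model
kwic_en = Kwic(nlp="en_core_web_sm", alg=ns.LOCALE)  # custom model and sorting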

__call__(docs: Optional[Doc | str | list[Doc | str]] = Field(default_factory=list, description='The spaCy Doc(s) or string(s) to search within.'), labels: Optional[str | list[str]] = Field(None, description='A list of labels for the documents. Defaults to None.'), patterns: list = Field(default_factory=list, description='A list of patterns to match. Can be regex strings or spaCy token patterns.'), window: Optional[int] = Field(50, description='The number of tokens or characters to include before and after the match.'), matcher: Optional[str] = Field('characters', description="The type of matcher to use. Can be 'rule' for spaCy Matcher, 'phrase' for PhraseMatcher, 'tokens' for token patterns, or 'characters' for string matching."), case_sensitive: Optional[bool] = Field(False, description='If True, the matching will be case-sensitive. Defaults to False.'), use_regex: Optional[bool] = Field(False, description="If True, use regex for matching with the 'tokens' setting. Defaults to False."), as_df: Optional[bool] = Field(True, description='If True, return results as a pandas DataFrame. Defaults to True.'), sort_by: Optional[str] = Field('keyword', description="The column to sort the results by if as_df is True. Defaults to 'keyword'."), ascending: Optional[bool] = Field(True, description='If True, sort in ascending order. Defaults to True.')) -> list[tuple[str, str, str, str]] | pd.DataFrame

Call the Kwic instance to find keywords in context.

Returns:

  • list (list[tuple[str, str, str, str]] | DataFrame): A list of tuples, each containing the document label, the context before, the matched keyword, and the context after, or a DataFrame with the same content.

Source code in lexos/kwic/__init__.py
def __call__(
    self,
    docs: Optional[Doc | str | list[Doc | str]] = Field(
        default_factory=list,
        description="The spaCy Doc(s) or string(s) to search within.",
    ),
    labels: Optional[str | list[str]] = Field(
        None,
        description="A list of labels for the documents. Defaults to None.",
    ),
    patterns: list = Field(
        default_factory=list,
        description="A list of patterns to match. Can be regex strings or spaCy token patterns.",
    ),
    window: Optional[int] = Field(
        50,
        description="The number of tokens or characters to include before and after the match.",
    ),
    matcher: Optional[str] = Field(
        "characters",
        description="The type of matcher to use. Can be 'rule' for spaCy Matcher, 'phrase' for PhraseMatcher, 'tokens' for token patterns, or 'characters' for string matching.",
    ),
    case_sensitive: Optional[bool] = Field(
        False,
        description="If True, the matching will be case-sensitive. Defaults to False.",
    ),
    use_regex: Optional[bool] = Field(
        False,
        description="If True, use regex for matching with the 'tokens' setting. Defaults to False.",
    ),
    as_df: Optional[bool] = Field(
        True,
        description="If True, return results as a pandas DataFrame. Defaults to True.",
    ),
    sort_by: Optional[str] = Field(
        "keyword",  # Make sure this matches the column name exactly
        description="The column to sort the results by if as_df is True. Defaults to 'keyword'.",
    ),
    ascending: Optional[bool] = Field(
        True,
        description="If True, sort in ascending order. Defaults to True.",
    ),
) -> list[tuple[str, str, str, str]] | pd.DataFrame:
    """Call the Kwic instance to find keywords in context.

    Returns:
        list: A list of tuples, each containing the document label, the context before,
            the matched keyword, and the context after, or a DataFrame with the same content.
    """
    # Validate input types
    if matcher in ["rule", "phrase", "tokens"] and any(
        isinstance(doc, str) for doc in docs
    ):
        raise LexosException(
            "Docs must be spaCy Doc objects when using 'rule', 'phrase', or 'tokens' matcher. To search raw text strings, use the 'characters' matcher type, setting `use_regex` if you wish to use regex patterns."
        )

    # Ensure that docs and labels are lists of equal length
    docs = ensure_list(docs)
    if labels:
        labels = ensure_list(labels)
        if len(docs) != len(labels):
            raise LexosException(
                "The number of documents and labels must match. If you do not want to label the documents, set `labels` to None."
            )
    else:
        labels = [f"Doc {i + 1}" for i in range(len(docs))]

    # Assign search parameters and call match method
    match matcher:
        case "rule":
            matcher = Matcher(self.nlp.vocab)
            matcher.add("KWIC_PATTERNS", patterns)
            hits = self._match_tokens(docs, labels, window, matcher)
        case "phrase":
            if case_sensitive:
                matcher = PhraseMatcher(self.nlp.vocab)
            else:
                matcher = PhraseMatcher(self.nlp.vocab, attr="LOWER")
            patterns = [self.nlp.make_doc(phrase) for phrase in patterns]
            matcher.add("KWIC_PATTERNS", patterns)
            hits = self._match_tokens(docs, labels, window, matcher)
        case "tokens":
            matcher = Matcher(self.nlp.vocab)
            patterns = ensure_list(patterns)
            patterns = self._convert_patterns_to_spacy(
                patterns, case_sensitive, use_regex
            )
            matcher.add("KWIC_PATTERNS", patterns)
            hits = self._match_tokens(docs, labels, window, matcher)
        case _:
            docs = [doc.text if isinstance(doc, Doc) else doc for doc in docs]
            patterns = ensure_list(patterns)
            hits = list(
                self._match_strings(
                    docs, labels, patterns, window, case_sensitive=case_sensitive
                )
            )

    # Convert hits to DataFrame for sorting
    df = pd.DataFrame(
        hits, columns=["doc", "context_before", "keyword", "context_after"]
    )

    # Only sort if we have data and the sort_by column exists
    if not df.empty and sort_by in df.columns:
        df = df.sort_values(
            by=sort_by, ascending=ascending, key=natsort_keygen(alg=self.alg)
        )

    # If as_df is False, convert the DataFrame to a list of tuples
    if not as_df:
        result = list(df.to_records(index=False))
        return [tuple(item) for item in result]

    return df
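
A sketch of the two main modes (inputs are illustrative; token-based matchers require spaCy Doc objects, which the loaded pipeline on kwic.nlp can produce):

kwic = Kwic()

# 'characters' (the default): raw strings, with patterns treated as regexes.
df = kwic(docs=["A catalog of cats."], patterns=[r"cat\w*"], window=10)

# 'tokens': spaCy Docs, with string patterns converted to token patterns.
doc = kwic.nlp("The cat sat on the mat.")
rows = kwic(docs=[doc], patterns=["cat"], matcher="tokens", as_df=False)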

_convert_patterns_to_spacy(patterns: list, case_sensitive: bool, use_regex: bool) -> list

Convert a list of string patterns to spaCy token patterns.

Parameters:

  • patterns (list): A list of string patterns to convert. Required.
  • case_sensitive (bool): If True, the patterns will be case-sensitive. Required.
  • use_regex (bool): If True, the patterns will be treated as regex patterns. Required.

Returns:

  • list (list): A list of spaCy token patterns.

Source code in lexos/kwic/__init__.py
def _convert_patterns_to_spacy(
    self, patterns: list, case_sensitive: bool, use_regex: bool
) -> list:
    """Convert a list of string patterns to spaCy token patterns.

    Args:
        patterns (list): A list of string patterns to convert.
        case_sensitive (bool): If True, the patterns will be case-sensitive.
        use_regex (bool): If True, the patterns will be treated as regex patterns.

    Returns:
        list: A list of spaCy token patterns.
    """
    if use_regex:
        if case_sensitive:
            return [[{"TEXT": {"REGEX": pattern}}] for pattern in patterns]
        else:
            # Scope an inline flag rather than lowercasing the pattern,
            # which would corrupt escape sequences such as \W or \B.
            return [[{"TEXT": {"REGEX": f"(?i:{pattern})"}}] for pattern in patterns]
    else:
        if case_sensitive:
            return [[{"TEXT": pattern}] for pattern in patterns]
        else:
            # LOWER holds the lowercased token text, so lowercase the literal.
            return [[{"LOWER": pattern.lower()}] for pattern in patterns]

_match_strings(docs: list[str], labels: list[str], patterns: list, window: int, case_sensitive: bool)

Match keywords in a string and return their context.

Parameters:

  • docs (list[str]): The text to search within. Required.
  • labels (list[str]): A list of labels for the documents. Required.
  • patterns (list): A list of regex patterns to match. Required.
  • window (int): The number of characters to include before and after the match. Required.
  • case_sensitive (bool): If True, the matching will be case-sensitive. Required.

Yields:

  • tuple (tuple): A tuple containing the document label, the context before, the matched keyword, and the context after.

Source code in lexos/kwic/__init__.py
def _match_strings(
    self,
    docs: list[str],
    labels: list[str],
    patterns: list,
    window: int,
    case_sensitive: bool,
):
    """Match keywords in a string and return their context.

    Args:
        docs (list[str]): The text to search within.
        labels (list[str]): A list of labels for the documents.
        patterns (list): A list of regex patterns to match.
        window (int): The number of characters to include before and after the match.
        case_sensitive (bool): If True, the matching will be case-sensitive.

    Yields:
        tuple (tuple): A tuple containing the document label, the context before, the matched keyword, and the context after.
    """
    flags = 0 if case_sensitive else re.IGNORECASE
    for i, doc in enumerate(docs):
        for pattern in patterns:
            for match in re.finditer(pattern, doc, flags=flags):
                start = match.start()
                end = match.end()
                context_start = max(0, start - window)
                context_end = min(len(doc), end + window)
                context_before = doc[context_start:start]
                context_after = doc[end:context_end]
                yield (labels[i], context_before, match.group(), context_after)
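
The context window is plain string slicing around each regex match; a standalone sketch of the same idea (illustrative text and pattern):

import re

text = "The cat sat on the mat because the cat was tired."
for m in re.finditer("cat", text, flags=re.IGNORECASE):
    before = text[max(0, m.start() - 10) : m.start()]
    after = text[m.end() : min(len(text), m.end() + 10)]
    print((before, m.group(), after))
# ('The ', 'cat', ' sat on th')
# ('cause the ', 'cat', ' was tired')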

_match_tokens(docs: list[Doc], labels: list[str], window: int, matcher: Matcher) -> list[tuple[str, str, str, str]]

Match keywords in a spaCy Doc and return their context.

Parameters:

  • docs (list[Doc]): The spaCy Doc(s) to search within. Required.
  • labels (list[str]): A list of labels for the documents. Required.
  • window (int): The number of tokens to include before and after the match. Required.
  • matcher (Matcher): The spaCy Matcher object with patterns added. Required.

Returns:

  • list[tuple[str, str, str, str]]: A list of tuples, each containing the document label, the context before, the matched keyword, and the context after.

Source code in lexos/kwic/__init__.py
def _match_tokens(
    self, docs: list[Doc], labels: list[str], window: int, matcher: Matcher
) -> list[tuple[str, str, str, str]]:
    """Match keywords in a spaCy Doc and return their context.

    Args:
        docs (list[Doc]): The spaCy Doc(s) to search within.
        labels (list[str]): A list of labels for the documents.
        window (int): The number of tokens to include before and after the match.
        matcher (Matcher): The spaCy Matcher object with patterns added.

    Returns:
        list[tuple[str, str, str, str]]: A list of tuples, each containing the document label, the context before, the matched keyword, and the context after.
    """
    hits = []  # List to store the hits
    for i, doc in enumerate(docs):
        matches = matcher(doc)
        for _, start, end in matches:
            span = doc[start:end]  # The matched span (keyword)
            context_start = max(0, start - window)  # Start of context window
            context_end = min(len(doc), end + window)  # End of context window
            context_before = doc[context_start : span.start]
            context_after = doc[span.end : context_end]
            hits.append(
                (labels[i], context_before.text, span.text, context_after.text)
            )
    return hits
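
A standalone sketch of the token-window logic using spaCy's Matcher directly (assumes the xx_sent_ud_sm model is installed; window counts tokens, not characters):

import spacy
from spacy.matcher import Matcher

nlp = spacy.load("xx_sent_ud_sm")
doc = nlp("The cat sat on the mat.")

matcher = Matcher(nlp.vocab)
matcher.add("KWIC_PATTERNS", [[{"LOWER": "cat"}]])

for _, start, end in matcher(doc):
    span = doc[start:end]
    before = doc[max(0, start - 3) : start]
    after = doc[end : min(len(doc), end + 3)]
    print((before.text, span.text, after.text))
# ('The', 'cat', 'sat on the')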

_validate_sorting_algorithm() -> bool

Ensure that the specified sorting algorithm is a valid natsort locale.

Returns:

  • bool (bool): Whether the sorting algorithm is valid.

Source code in lexos/kwic/__init__.py
def _validate_sorting_algorithm(self) -> bool:
    """Ensure that the specified sorting algorithm is a valid natsort locale.

    Returns:
        bool: Whether the sorting algorithm is valid.
    """
    if self.alg not in [e for e in ns]:
        locales = ", ".join([f"ns.{e.name}" for e in ns])
        err = (
            f"Invalid sorting algorithm: {self.alg}.",
            f"Valid algorithms for `alg` are: {locales}.",
            "See https://natsort.readthedocs.io/en/stable/api.html#natsort.ns.",
        )
        raise LexosException(" ".join(err))
    return True
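
The check runs during construction, so an invalid alg fails fast (a sketch, assuming the default spaCy model is installed):

from natsort import ns
from lexos.kwic import Kwic

Kwic(alg=ns.IGNORECASE)  # any ns member is accepted
# Any other value raises LexosException listing the valid ns members.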