Skip to content

Resources¤

Mappings for removing or transforming character patterns.

HTMLTextExtractor ¤

Bases: HTMLParser

Simple subclass of :class:`html.parser.HTMLParser`.

Collects data elements (non-tag, -comment, -pi, etc. elements) fed to the parser, then makes them available as stripped, concatenated text via HTMLTextExtractor.get_text().

Note

Users probably shouldn't deal with this class directly; instead, use :func:`remove.remove_html_tags()`.

Methods:

Name Description
__init__

Initialize the parser.

get_text

Return the collected text.

handle_data

Handle data elements.

Source code in lexos/scrubber/resources.py
class HTMLTextExtractor(html.parser.HTMLParser):
    """Simple subclass of :class:`html.parser.HTMLParser`.

    Collects data elements (non-tag, non-comment, non-pi, etc. elements)
    fed to the parser, then makes them available as stripped, concatenated
    text via `HTMLTextExtractor.get_text()`.

    Note:
        Users probably shouldn't deal with this class directly;
        instead, use :func:`remove.remove_html_tags()`.
    """

    def __init__(self):
        """Initialize the parser with an empty data buffer."""
        super().__init__()
        # Text fragments collected by handle_data(), in document order.
        self.data = []

    def handle_data(self, data: Any) -> None:
        """Handle data elements.

        Args:
            data (Any): The data element(s) to handle.
        """
        self.data.append(data)

    def get_text(self, sep: str = "") -> str:
        """Return the collected text.

        Args:
            sep (str): The separator to join the collected text with.
                Must be a string; ``None`` is not supported because the
                value is passed directly to ``str.join``.

        Returns:
            str: The collected text, stripped of leading and trailing
                whitespace.
        """
        return sep.join(self.data).strip()

__init__() ¤

Initialize the parser.

Source code in lexos/scrubber/resources.py
def __init__(self):
    """Set up the base HTML parser and start with an empty fragment list."""
    super().__init__()
    # Fragments of document text accumulated by handle_data().
    self.data = list()

get_text(sep: Optional[str] = '') -> str ¤

Return the collected text.

Parameters:

Name Type Description Default
sep Optional[str]

The separator to join the collected text with.

''

Returns:

Name Type Description
str str

The collected text.

Source code in lexos/scrubber/resources.py
def get_text(self, sep: Optional[str] = "") -> str:
    """Concatenate every collected fragment into one string.

    Args:
        sep (Optional[str]): String inserted between adjacent fragments.

    Returns:
        str: The joined fragments with surrounding whitespace removed.
    """
    joined = sep.join(self.data)
    return joined.strip()

handle_data(data: Any) -> None ¤

Handle data elements.

Parameters:

Name Type Description Default
data Any

The data element(s) to handle.

required
Source code in lexos/scrubber/resources.py
def handle_data(self, data: Any) -> None:
    """Record a text/data element for later retrieval.

    Args:
        data (Any): The data element(s) to record.
    """
    # Extending with a one-element list preserves insertion order.
    self.data += [data]

RE_LINEBREAK: Pattern = re.compile('(\\r\\n|[\\n\\v])+') module-attribute ¤

RE_NONBREAKING_SPACE: Pattern = re.compile('[^\\S\\n\\v]+') module-attribute ¤

RE_ZWSP: Pattern = re.compile('[\\u200B\\u2060\\uFEFF]+') module-attribute ¤

RE_TAB: Pattern = re.compile('[\\t\\v]+') module-attribute ¤

RE_BRACKETS_CURLY = re.compile('\\{[^{}]*?\\}') module-attribute ¤

RE_BRACKETS_ROUND = re.compile('\\([^()]*?\\)') module-attribute ¤

RE_BRACKETS_SQUARE = re.compile('\\[[^\\[\\]]*?\\]') module-attribute ¤

RE_BULLET_POINTS = re.compile('((^|\\n)\\s*?)([\\u2022\\u2023\\u2043\\u204C\\u204D\\u2219\\u25aa\\u25CF\\u25E6\\u29BE\\u29BF\\u30fb])') module-attribute ¤

RE_URL: Pattern = re.compile('(?:^|(?<![\\w/.]))(?:(?:https?://|ftp://|www\\d{0,3}\\.))(?:\\S+(?::\\S*)?@)?(?:(?!(?:10|127)(?:\\.\\d{1,3}){3})(?!(?:169\\.254|192\\.168)(?:\\.\\d{1,3}){2})(?!172\\.(?:1[6-9]|2\\d|3[0-1])(?:\\.\\d{1,3}){2})(?:[1-9]\\d?|1\\d\\d|2[01]\\d|22[0-3])(?:\\.(?:1?\\d{1,2}|2[0-4]\\d|25[0-5])){2}(?:\\.(?:[1-9]\\d?|1\\d\\d|2[0-4]\\d|25[0-4]))|(?:(?:[a-z\\u00a1-\\uffff0-9]-?)*[a-z\\u00a1-\\uffff0-9]+)(?:\\.(?:[a-z\\u00a1-\\uffff0-9]-?)*[a-z\\u00a1-\\uffff0-9]+)*(?:\\.(?:[a-z\\u00a1-\\uffff]{2,})))(?::\\d{2,5})?(?:/\\S*)?(?:$|(?![\\w?!+&/]))', flags=(re.IGNORECASE)) module-attribute ¤