Source code for libweb.regex

"""Regex Services

This module implements services that communicate over HTTP and use regular
expressions to parse the results.
"""

import itertools
try:
    import html
except ImportError:
    import HTMLParser
    html_unescape = HTMLParser.HTMLParser().unescape  # pylint: disable=invalid-name
else:
    html_unescape = html.unescape  # pylint: disable=invalid-name
import re

from .http import HttpService


class RegexService(HttpService):
    """An HTTP-based service that is scraped using regular expressions.

    Keyword arguments:
        parse (list): Regular expressions used to parse data from the service
    """

    def get_html(self):
        """Make the HTTP request(s) and unescape the returned HTML.

        Yields:
            The ``text`` attribute of each object produced by
            ``self.make_requests()``, passed through ``html_unescape``.
        """
        for response in self.make_requests():
            yield html_unescape(response.text)

    @property
    def regexes(self):
        """Compile the regular expressions provided in the configuration.

        Returns:
            list: One compiled pattern per entry of the ``parse``
            configuration key, in configuration order. Empty when the key
            is absent.

        Raises:
            re.error: If a configured pattern is not a valid regular
                expression.
        """
        return [re.compile(regex) for regex in self.conf.get("parse", [])]

    def get_results(self):
        """Apply the configured regular expressions to the service's response.

        Every pattern is scanned over each response body in parallel: the
        n-th result combines the named groups of the n-th match of every
        pattern. Patterns that run out of matches before the others simply
        stop contributing.

        Yields:
            dict: Named groups merged across patterns. When two patterns
            define the same group name, the pattern listed later in the
            configuration wins.
        """
        # Resolve the Python 2/3 name once instead of once per response body.
        try:
            zip_longest = itertools.zip_longest
        except AttributeError:  # Python 2.7
            zip_longest = itertools.izip_longest  # pylint: disable=no-member

        # Compile up front: the patterns are reused for every body, and an
        # invalid pattern now fails before any HTTP request is made.
        patterns = self.regexes

        for body in self.get_html():
            match_iters = [pattern.finditer(body) for pattern in patterns]
            for matches in zip_longest(*match_iters):
                result = {}
                for match in matches:
                    # zip_longest pads exhausted iterators with None.
                    if match is not None:
                        result.update(match.groupdict())
                yield result