# mureq.py

"""
mureq is a replacement for python-requests, intended to be vendored
in-tree by Linux systems software and other lightweight applications.

mureq is copyright 2021 by its contributors and is released under the
0BSD ("zero-clause BSD") license.
"""
import contextlib
import io
import os.path
import socket
import ssl
import sys
import urllib.parse
from http.client import HTTPConnection, HTTPSConnection, HTTPMessage, HTTPException

__version__ = '0.2.0'

# Names exported by `from mureq import *`: the public callables, the Response
# type, and every exception type the public API can raise (including
# HTTPErrorStatus, which Response.raise_for_status() raises).
__all__ = ['HTTPException', 'TooManyRedirects', 'HTTPErrorStatus', 'Response',
           'yield_response', 'request', 'get', 'post', 'head', 'put', 'patch', 'delete']

# Timeout, in seconds, used when the caller does not pass `timeout`.
DEFAULT_TIMEOUT = 15.0

# Default User-Agent header value, e.g. "Python 3.8.10"
DEFAULT_UA = "Python " + sys.version.split()[0]


def request(method, url, *, read_limit=None, **kwargs):
    """Perform an HTTP request and buffer the response body in memory.

    :param str method: HTTP method to request (e.g. 'GET', 'POST')
    :param str url: URL to request
    :param read_limit: maximum number of body bytes to read, or None to read everything
    :type read_limit: int or None
    :param kwargs: optional arguments, forwarded unchanged to yield_response
    :return: the fully-read response
    :rtype: Response
    :raises: HTTPException
    """
    with yield_response(method, url, **kwargs) as raw:
        try:
            payload = raw.read(read_limit)
        except IOError as exc:
            # Errors that are already HTTPExceptions pass through untouched;
            # any other I/O failure is wrapped so callers see a single type.
            if isinstance(exc, HTTPException):
                raise
            raise HTTPException(str(exc)) from exc
        merged_headers = _prepare_incoming_headers(raw.headers)
        return Response(raw.url, raw.status, merged_headers, payload)


def get(url, **kwargs):
    """Issue an HTTP GET request for `url`; see request() for the keyword arguments."""
    return request('GET', url, **kwargs)


def post(url, body=None, **kwargs):
    """Issue an HTTP POST request for `url`, sending `body` as the payload."""
    return request('POST', url, body=body, **kwargs)


def head(url, **kwargs):
    """Issue an HTTP HEAD request for `url`; see request() for the keyword arguments."""
    return request('HEAD', url, **kwargs)


def put(url, body=None, **kwargs):
    """Issue an HTTP PUT request for `url`, sending `body` as the payload."""
    return request('PUT', url, body=body, **kwargs)


def patch(url, body=None, **kwargs):
    """Issue an HTTP PATCH request for `url`, sending `body` as the payload."""
    return request('PATCH', url, body=body, **kwargs)


def delete(url, **kwargs):
    """Issue an HTTP DELETE request for `url`; see request() for the keyword arguments."""
    return request('DELETE', url, **kwargs)


@contextlib.contextmanager
def yield_response(method, url, *, unix_socket=None, timeout=DEFAULT_TIMEOUT, headers=None,
                   params=None, body=None, form=None, json=None, verify=True, source_address=None,
                   max_redirects=None, ssl_context=None):
    """yield_response is a low-level API that exposes the actual
    http.client.HTTPResponse via a contextmanager.

    Note that unlike mureq.Response, http.client.HTTPResponse does not
    automatically canonicalize multiple appearances of the same header by
    joining them together with a comma delimiter. To retrieve canonicalized
    headers from the response, use response.getheader():
    https://docs.python.org/3/library/http.client.html#http.client.HTTPResponse.getheader

    When redirects are followed and the server answers 303 See Other, the
    follow-up request is a retrieval: the method becomes GET (HEAD stays
    HEAD), the payload body is dropped, and the Content-Length, Content-Type
    and Transfer-Encoding request headers are not re-sent.

    The underlying connection is closed when the context manager exits.

    :param str method: HTTP method to request (e.g. 'GET', 'POST')
    :param str url: URL to request
    :param unix_socket: path to Unix domain socket to query, or None for a normal TCP request
    :type unix_socket: str or None
    :param timeout: timeout in seconds, or None for no timeout (default: 15 seconds)
    :type timeout: float or None
    :param headers: HTTP headers as a mapping or list of key-value pairs
    :param params: parameters to be URL-encoded and added to the query string, as a mapping or list of key-value pairs
    :param body: payload body of the request
    :type body: bytes or None
    :param form: parameters to be form-encoded and sent as the payload body, as a mapping or list of key-value pairs
    :param json: object to be serialized as JSON and sent as the payload body
    :param bool verify: whether to verify TLS certificates (default: True)
    :param source_address: source address to bind to for TCP
    :type source_address: str or tuple(str, int) or None
    :param max_redirects: maximum number of redirects to follow, or None (the default) for no redirection
    :type max_redirects: int or None
    :param ssl_context: TLS config to control certificate validation, or None for default behavior
    :type ssl_context: ssl.SSLContext or None
    :return: http.client.HTTPResponse, yielded as context manager
    :rtype: http.client.HTTPResponse
    :raises: HTTPException
    :raises: TooManyRedirects if more than max_redirects redirects are encountered
    """
    method = method.upper()
    headers = _prepare_outgoing_headers(headers)
    enc_params = _prepare_params(params)
    body = _prepare_body(body, form, json, headers)

    # request headers that describe a payload; meaningless once the body is dropped
    payload_headers = ('content-length', 'content-type', 'transfer-encoding')

    visited_urls = []

    while max_redirects is None or len(visited_urls) <= max_redirects:
        url, conn, path = _prepare_request(method, url, enc_params=enc_params, timeout=timeout, unix_socket=unix_socket, verify=verify, source_address=source_address, ssl_context=ssl_context)
        enc_params = ''  # don't reappend enc_params if we get redirected
        visited_urls.append(url)
        try:
            try:
                conn.request(method, path, headers=headers, body=body)
                response = conn.getresponse()
            except HTTPException:
                raise
            except IOError as e:
                # wrap any IOError that is not already an HTTPException
                # in HTTPException, exposing a uniform API for remote errors
                raise HTTPException(str(e)) from e
            redirect_url = _check_redirect(url, response.status, response.headers)
            if max_redirects is None or redirect_url is None:
                response.url = url  # https://bugs.python.org/issue42062
                yield response
                return

            url = redirect_url
            if response.status == 303:
                # 303 See Other: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/303
                # The follow-up is a plain retrieval, so it must not replay
                # the original payload. A HEAD request stays a HEAD request.
                if method != 'HEAD':
                    method = 'GET'
                body = None
                # Build a fresh header set rather than deleting in place, so a
                # caller-supplied HTTPMessage is not stripped behind their back.
                kept_headers = HTTPMessage()
                for name, value in headers.items():
                    if name.lower() not in payload_headers:
                        kept_headers[name] = value
                headers = kept_headers
        finally:
            # always release the connection; for the yielded response this
            # runs once the caller's `with` block has finished
            conn.close()

    raise TooManyRedirects(visited_urls)


class Response:
    """A fully-read HTTP response.

    :ivar str url: the retrieved URL, indicating whether a redirection occurred
    :ivar int status_code: the HTTP status code
    :ivar http.client.HTTPMessage headers: the HTTP headers
    :ivar bytes body: the payload body of the response
    """

    __slots__ = ('url', 'status_code', 'headers', 'body')

    def __init__(self, url, status_code, headers, body):
        self.url = url
        self.status_code = status_code
        self.headers = headers
        self.body = body

    def __repr__(self):
        return "Response(status_code={:d})".format(self.status_code)

    @property
    def ok(self):
        """True unless the status code is an error code (400 through 599)."""
        is_error = 400 <= self.status_code < 600
        return not is_error

    @property
    def content(self):
        """The response body; an alias of `body` kept for compatibility
        with requests.Response."""
        return self.body

    def raise_for_status(self):
        """Raise HTTPErrorStatus if the status code is an error code;
        otherwise do nothing."""
        if self.ok:
            return
        raise HTTPErrorStatus(self.status_code)

    def json(self):
        """Deserialize the response body as JSON and return the result."""
        from json import loads
        return loads(self.body)

    def _debugstr(self):
        # Status line first, then one "name: value" line per header.
        lines = ["HTTP " + str(self.status_code)]
        lines.extend(f"{name}: {value}" for name, value in self.headers.items())
        # A blank line separates the headers from the body.
        lines.append("")
        try:
            lines.append(self.body.decode('utf-8'))
        except UnicodeDecodeError:
            # Non-text payloads are summarized instead of dumped.
            lines.append(f"<{len(self.body)} bytes binary data>")
        return "\n".join(lines) + "\n"


class TooManyRedirects(HTTPException):
    """Raised when redirect following is enabled and the server keeps
    redirecting past the permitted number of redirects without ever
    producing a final response."""


class HTTPErrorStatus(HTTPException):
    """Raised by Response.raise_for_status() when the response carries an
    HTTP error code (a 40x or a 50x).

    A well-formed response with an error code is not an exception by itself;
    this is only raised when raise_for_status() is called explicitly.

    :ivar int status_code: the HTTP status code that triggered the error
    """

    def __init__(self, status_code):
        self.status_code = status_code

    def __str__(self):
        return "HTTP response returned error code {:d}".format(self.status_code)


# end public API, begin internal implementation details

# Content-Type values that _prepare_body applies (unless the caller already
# set a Content-Type header) for `json` and `form` payloads respectively.
_JSON_CONTENTTYPE = 'application/json'
_FORM_CONTENTTYPE = 'application/x-www-form-urlencoded'


class UnixHTTPConnection(HTTPConnection):
    """An HTTPConnection that talks over a Unix domain stream socket
    rather than a TCP address.

    :param str path: filesystem path of the Unix domain socket
    :param timeout: socket timeout in seconds, or None for no timeout
    """

    def __init__(self, path, timeout=DEFAULT_TIMEOUT):
        # The base class requires a host; 'localhost' is only a placeholder.
        super().__init__('localhost', timeout=timeout)
        self._unix_path = path

    def connect(self):
        """Open the Unix socket and install it as this connection's socket."""
        unix_sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        try:
            unix_sock.settimeout(self.timeout)
            unix_sock.connect(self._unix_path)
        except Exception:
            # don't leak the file descriptor when connecting fails
            unix_sock.close()
            raise
        else:
            self.sock = unix_sock


def _check_redirect(url, status, response_headers):
    """Return the URL to redirect to, or None for no redirection."""
    if status not in (301, 302, 303, 307, 308):
        return None
    location = response_headers.get('Location')
    if not location:
        return None
    parsed_location = urllib.parse.urlparse(location)
    if parsed_location.scheme:
        # absolute URL
        return location

    old_url = urllib.parse.urlparse(url)
    if location.startswith('/'):
        # absolute path on old hostname
        return urllib.parse.urlunparse((old_url.scheme, old_url.netloc,
                                        parsed_location.path, parsed_location.params,
                                        parsed_location.query, parsed_location.fragment))

    # relative path on old hostname
    old_dir, _old_file = os.path.split(old_url.path)
    new_path = os.path.join(old_dir, location)
    return urllib.parse.urlunparse((old_url.scheme, old_url.netloc,
                                    new_path, parsed_location.params,
                                    parsed_location.query, parsed_location.fragment))


def _prepare_outgoing_headers(headers):
    """Return the request headers as an HTTPMessage with a User-Agent set.

    `headers` may be None, an HTTPMessage (used as-is, not copied), a mapping,
    or an iterable of key-value pairs.
    """
    if not isinstance(headers, HTTPMessage):
        message = HTTPMessage()
        if headers is not None:
            # mappings expose items(); anything else is treated as pairs
            pairs = headers.items() if hasattr(headers, 'items') else iter(headers)
            for name, value in pairs:
                message[name] = value
        headers = message
    # only fill in the default User-Agent when the caller did not supply one
    if 'User-Agent' not in headers:
        headers['User-Agent'] = DEFAULT_UA
    return headers


# XXX join multi-headers together so that get(), __getitem__(),
# etc. behave intuitively, then stuff them back in an HTTPMessage.
def _prepare_incoming_headers(headers):
    headers_dict = {}
    for k, v in headers.items():
        headers_dict.setdefault(k, []).append(v)
    result = HTTPMessage()
    # note that iterating over headers_dict preserves the original
    # insertion order in all versions since Python 3.6:
    for k, vlist in headers_dict.items():
        result[k] = ','.join(vlist)
    return result


def _setdefault_header(headers, name, value):
    if name not in headers:
        headers[name] = value


def _prepare_body(body, form, json, headers):
    if body is not None:
        if not isinstance(body, bytes):
            raise TypeError('body must be bytes or None', type(body))
        return body

    if json is not None:
        _setdefault_header(headers, 'Content-Type', _JSON_CONTENTTYPE)
        import json as jsonlib
        return jsonlib.dumps(json).encode('utf-8')

    if form is not None:
        _setdefault_header(headers, 'Content-Type', _FORM_CONTENTTYPE)
        return urllib.parse.urlencode(form, doseq=True)

    return None


def _prepare_params(params):
    if params is None:
        return ''
    return urllib.parse.urlencode(params, doseq=True)


def _prepare_request(method, url, *, enc_params='', timeout=DEFAULT_TIMEOUT, source_address=None, unix_socket=None, verify=True, ssl_context=None):
    """Parse the URL and build the matching (not yet connected) connection.

    :param str method: HTTP method (currently unused here)
    :param str url: URL to request; scheme must be http, https or http+unix
    :param str enc_params: already URL-encoded parameters to append to the query string
    :param timeout: timeout in seconds passed to the connection, or None
    :param source_address: source address to bind to; a bare str is treated as (address, 0)
    :param unix_socket: path of a Unix domain socket to use instead of TCP, or None;
        for a `+unix` scheme without this argument the path is the
        percent-decoded network location of the URL
    :param bool verify: when False and no ssl_context is given, TLS certificate
        and hostname verification are disabled
    :param ssl_context: explicit ssl.SSLContext for HTTPS; takes precedence over `verify`
    :return: tuple of (URL with the final query folded in, connection object,
        request-target to pass to conn.request)
    :raises ValueError: for an unrecognized scheme or for https+unix
    """
    parsed_url = urllib.parse.urlparse(url)

    is_unix = (unix_socket is not None)
    scheme = parsed_url.scheme.lower()
    if scheme.endswith('+unix'):
        scheme = scheme[:-5]
        is_unix = True
        if scheme == 'https':
            raise ValueError("https+unix is not implemented")

    if scheme not in ('http', 'https'):
        raise ValueError("unrecognized scheme", scheme)

    is_https = (scheme == 'https')
    host = parsed_url.hostname
    port = 443 if is_https else 80
    if parsed_url.port:
        port = parsed_url.port

    if is_unix and unix_socket is None:
        unix_socket = urllib.parse.unquote(parsed_url.netloc)

    path = parsed_url.path
    # combine the URL's own query with the extra encoded parameters
    query = '&'.join(part for part in (parsed_url.query, enc_params) if part)
    if query:
        # The request-target must start with '/': for a URL such as
        # 'http://host?x=1' the path is empty, and sending '?x=1' on the
        # request line would be malformed. (A completely empty path needs no
        # such help; http.client substitutes '/' for it.)
        path = f"{path or '/'}?{query}"

    if isinstance(source_address, str):
        source_address = (source_address, 0)

    if is_unix:
        conn = UnixHTTPConnection(unix_socket, timeout=timeout)
    elif is_https:
        if ssl_context is None:
            ssl_context = ssl.create_default_context()
            if not verify:
                # check_hostname must be turned off before CERT_NONE is allowed
                ssl_context.check_hostname = False
                ssl_context.verify_mode = ssl.CERT_NONE
        conn = HTTPSConnection(host, port, source_address=source_address, timeout=timeout,
                               context=ssl_context)
    else:
        conn = HTTPConnection(host, port, source_address=source_address, timeout=timeout)

    # `path` already carries the query string, so the query slot stays empty
    munged_url = urllib.parse.urlunparse((parsed_url.scheme, parsed_url.netloc,
                                          path, parsed_url.params,
                                          '', parsed_url.fragment))
    return munged_url, conn, path