Implement parsing of HTTP requests

Kovid Goyal 2015-05-17 18:38:33 +05:30
parent 1617721b99
commit 656d0a1c10
2 changed files with 246 additions and 3 deletions


@@ -6,19 +6,132 @@ from __future__ import (unicode_literals, division, absolute_import,
 __license__ = 'GPL v3'
 __copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
 
-import httplib, socket
+import httplib, socket, re
+from urllib import unquote
+from urlparse import parse_qs
+from functools import partial
+
+from calibre import as_unicode
 from calibre.srv.errors import MaxSizeExceeded, NonHTTPConnRequest
 
 HTTP1 = 'HTTP/1.0'
 HTTP11 = 'HTTP/1.1'
+protocol_map = {(1, 0):HTTP1, (1, 1):HTTP11}
+
+quoted_slash = re.compile(br'%2[fF]')
+
+def parse_request_uri(uri):  # {{{
+    """Parse a Request-URI into (scheme, authority, path).
+
+    Note that Request-URI's must be one of::
+
+        Request-URI = "*" | absoluteURI | abs_path | authority
+
+    Therefore, a Request-URI which starts with a double forward-slash
+    cannot be a "net_path"::
+
+        net_path = "//" authority [ abs_path ]
+
+    Instead, it must be interpreted as an "abs_path" with an empty first
+    path segment::
+
+        abs_path = "/" path_segments
+        path_segments = segment *( "/" segment )
+        segment = *pchar *( ";" param )
+        param = *pchar
+    """
+    if uri == b'*':
+        return None, None, uri
+
+    i = uri.find(b'://')
+    if i > 0 and b'?' not in uri[:i]:
+        # An absoluteURI.
+        # If there's a scheme (and it must be http or https), then:
+        # http_URL = "http:" "//" host [ ":" port ] [ abs_path [ "?" query ]]
+        scheme, remainder = uri[:i].lower(), uri[i + 3:]
+        authority, path = remainder.split(b'/', 1)
+        path = b'/' + path
+        return scheme, authority, path
+
+    if uri.startswith(b'/'):
+        # An abs_path.
+        return None, None, uri
+    else:
+        # An authority.
+        return None, uri, None
+# }}}
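
[Editor's note] The three-way return value is easiest to see with concrete inputs. A quick sketch (not part of the commit) exercising the function above, assuming it is importable, under Python 2:

parse_request_uri(b'*')                       # (None, None, b'*') -- e.g. "OPTIONS * HTTP/1.1"
parse_request_uri(b'/a/b?x=1')                # (None, None, b'/a/b?x=1') -- abs_path
parse_request_uri(b'http://example.com/a/b')  # (b'http', b'example.com', b'/a/b') -- absoluteURI
parse_request_uri(b'example.com:80')          # (None, b'example.com:80', None) -- authority form

One caveat visible in the code: an absoluteURI with no path at all (b'http://example.com') would raise ValueError at the remainder.split(b'/', 1) call rather than returning a tuple.
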
+comma_separated_headers = {
+    b'Accept', b'Accept-Charset', b'Accept-Encoding',
+    b'Accept-Language', b'Accept-Ranges', b'Allow', b'Cache-Control',
+    b'Connection', b'Content-Encoding', b'Content-Language', b'Expect',
+    b'If-Match', b'If-None-Match', b'Pragma', b'Proxy-Authenticate', b'TE',
+    b'Trailer', b'Transfer-Encoding', b'Upgrade', b'Vary', b'Via', b'Warning',
+    b'WWW-Authenticate',
+}
+
+def read_headers(readline, hdict=None):  # {{{
+    """
+    Read headers from the given stream into the given header dict.
+
+    If hdict is None, a new header dict is created. Returns the populated
+    header dict.
+
+    Headers which are repeated are folded together using a comma if their
+    specification so dictates.
+
+    This function raises ValueError when the read bytes violate the HTTP spec.
+    You should probably return "400 Bad Request" if this happens.
+    """
+    if hdict is None:
+        hdict = {}
+
+    while True:
+        line = readline()
+        if not line:
+            # No more data--illegal end of headers
+            raise ValueError("Illegal end of headers.")
+
+        if line == b'\r\n':
+            # Normal end of headers
+            break
+        if not line.endswith(b'\r\n'):
+            raise ValueError("HTTP requires CRLF terminators")
+
+        if line[0] in (b' ', b'\t'):
+            # It's a continuation line.
+            v = line.strip()
+        else:
+            try:
+                k, v = line.split(b':', 1)
+            except ValueError:
+                raise ValueError("Illegal header line.")
+            k = k.strip().title()
+            v = v.strip()
+            hname = k.decode('ascii')
+
+        if k in comma_separated_headers:
+            existing = hdict.get(hname)
+            if existing:
+                v = b", ".join((existing, v))
+        try:
+            v = v.decode('ascii')
+        except UnicodeDecodeError:
+            if hname in 'Transfer-Encoding Connection Keep-Alive Expect':
+                raise
+        hdict[hname] = v
+
+    return hdict
+# }}}
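
[Editor's note] A minimal sketch (not from the commit) of how read_headers folds repeated comma-separated headers, driving it from an in-memory stream under Python 2 byte-string semantics:

from io import BytesIO

raw = BytesIO(b'Host: example.com\r\n'
              b'Accept: text/html\r\n'
              b'Accept: application/xml\r\n'
              b'\r\n')
hdrs = read_headers(raw.readline)
# hdrs == {'Host': 'example.com', 'Accept': 'text/html, application/xml'}

In the server itself the readline argument is wrapped with partial(socket_file.readline, maxsize=...) (see read_request_headers below), so an over-long header line surfaces as MaxSizeExceeded instead of buffering without bound.
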
 def http_communicate(conn):
+    ' Represents interaction with a http client over a single, persistent connection '
     request_seen = False
     try:
         while True:
             # (re)set req to None so that if something goes wrong in
-            # the RequestHandlerClass constructor, the error doesn't
+            # the HTTPPair constructor, the error doesn't
             # get written to the previous request.
             req = None
             req = conn.server_loop.http_handler(conn)
@@ -60,7 +173,8 @@ class HTTPPair(object):
     def __init__(self, conn):
         self.conn = conn
         self.server_loop = conn.server_loop
-        self.scheme = b'http' if self.server_loop.ssl_context is None else b'https'
+        self.max_header_line_size = self.server_loop.max_header_line_size
+        self.scheme = 'http' if self.server_loop.ssl_context is None else 'https'
 
         self.inheaders = {}
         self.outheaders = []
@@ -103,6 +217,127 @@ class HTTPPair(object):
         self.ready = True
 
+    def read_request_line(self):
+        request_line = self.conn.socket_file.readline(maxsize=self.max_header_line_size)
+
+        # Set started_request to True so http_communicate() knows to send 408
+        # from here on out.
+        self.started_request = True
+        if not request_line:
+            return False
+
+        if request_line == b'\r\n':
+            # RFC 2616 sec 4.1: "...if the server is reading the protocol
+            # stream at the beginning of a message and receives a CRLF
+            # first, it should ignore the CRLF."
+            # But only ignore one leading line! else we enable a DoS.
+            request_line = self.conn.socket_file.readline(maxsize=self.max_header_line_size)
+            if not request_line:
+                return False
+
+        if not request_line.endswith(b'\r\n'):
+            self.simple_response(
+                httplib.BAD_REQUEST, 'Bad Request', "HTTP requires CRLF terminators")
+            return False
+
+        try:
+            method, uri, req_protocol = request_line.strip().split(b' ', 2)
+            rp = int(req_protocol[5]), int(req_protocol[7])
+            self.method = method.decode('ascii')
+        except (ValueError, IndexError):
+            self.simple_response(httplib.BAD_REQUEST, "Bad Request", "Malformed Request-Line")
+            return False
+
+        try:
+            self.request_protocol = protocol_map[rp]
+        except KeyError:
+            self.simple_response(httplib.HTTP_VERSION_NOT_SUPPORTED, "HTTP Version Not Supported")
+            return False
+
+        scheme, authority, path = parse_request_uri(uri)
+        if b'#' in path:
+            self.simple_response(httplib.BAD_REQUEST, "Bad Request", "Illegal #fragment in Request-URI.")
+            return False
+
+        if scheme:
+            try:
+                self.scheme = scheme.decode('ascii')
+            except ValueError:
+                self.simple_response(httplib.BAD_REQUEST, "Bad Request", 'Un-decodeable scheme')
+                return False
+
+        qs = b''
+        if b'?' in path:
+            path, qs = path.split(b'?', 1)
+            try:
+                self.qs = {k.decode('utf-8'):tuple(x.decode('utf-8') for x in v) for k, v in parse_qs(qs, keep_blank_values=True).iteritems()}
+            except Exception:
+                self.simple_response(httplib.BAD_REQUEST, "Bad Request", 'Unparseable query string')
+                return False
+
+        try:
+            path = '%2F'.join(unquote(x).decode('utf-8') for x in quoted_slash.split(path))
+        except ValueError as e:
+            self.simple_response(httplib.BAD_REQUEST, "Bad Request", as_unicode(e))
+            return False
+        self.path = tuple(x.replace('%2F', '/') for x in path.split('/'))
+
+        self.response_protocol = protocol_map[min((1, 1), rp)]
+
+        return True
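
[Editor's note] Two details above deserve unpacking: the protocol version is read positionally out of bytes like b'HTTP/1.1', and percent-encoded slashes are deliberately kept distinct from real path separators. A sketch (not part of the commit) using the names from this diff, under Python 2:

req_protocol = b'HTTP/1.1'
rp = int(req_protocol[5]), int(req_protocol[7])  # (1, 1); IndexError/ValueError -> 400
protocol_map[min((1, 1), rp)]                    # never negotiate above HTTP/1.1

path = b'/books/One%2FTwo/cover%20page.jpg'
quoted_slash.split(path)                         # [b'/books/One', b'Two/cover%20page.jpg']
path = '%2F'.join(unquote(x).decode('utf-8') for x in quoted_slash.split(path))
# u'/books/One%2FTwo/cover page.jpg' -- %20 decoded, %2F survives
tuple(x.replace('%2F', '/') for x in path.split('/'))
# (u'', u'books', u'One/Two', u'cover page.jpg')

The net effect is that a literal slash inside a single path component (sent as %2F) lands inside one element of self.path instead of splitting it in two.
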
+    def read_request_headers(self):
+        # then all the http headers
+        try:
+            read_headers(partial(self.conn.socket_file.readline, maxsize=self.max_header_line_size), self.inheaders)
+            content_length = int(self.inheaders.get('Content-Length', 0))
+        except ValueError as e:
+            self.simple_response(httplib.BAD_REQUEST, "Bad Request", as_unicode(e))
+            return False
+
+        if content_length > self.server_loop.max_request_body_size:
+            self.simple_response(
+                httplib.REQUEST_ENTITY_TOO_LARGE, "Request Entity Too Large",
+                "The entity sent with the request exceeds the maximum "
+                "allowed bytes (%d)." % self.server_loop.max_request_body_size)
+            return False
+
+        # Persistent connection support
+        if self.response_protocol is HTTP11:
+            # Both server and client are HTTP/1.1
+            if self.inheaders.get("Connection", "") == "close":
+                self.close_connection = True
+        else:
+            # Either the server or client (or both) are HTTP/1.0
+            if self.inheaders.get("Connection", "") != "Keep-Alive":
+                self.close_connection = True
+
+        # Transfer-Encoding support
+        te = ()
+        if self.response_protocol is HTTP11:
+            rte = self.inheaders.get("Transfer-Encoding")
+            if rte:
+                te = [x.strip().lower() for x in rte.split(",") if x.strip()]
+
+        self.chunked_read = False
+        if te:
+            for enc in te:
+                if enc == "chunked":
+                    self.chunked_read = True
+                else:
+                    # Note that, even if we see "chunked", we must reject
+                    # if there is an extension we don't recognize.
+                    self.simple_response(httplib.NOT_IMPLEMENTED, "Not Implemented", "Unknown transfer encoding: %s" % enc)
+                    self.close_connection = True
+                    return False
+
+        if self.inheaders.get("Expect", '').lower() == "100-continue":
+            # Don't use simple_response here, because it emits headers
+            # we don't want.
+            msg = HTTP11 + " 100 Continue\r\n\r\n"
+            self.flushed_write(msg.encode('ascii'))
+        return True
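
[Editor's note] The keep-alive rules above reduce to a small decision table; a standalone restatement (not part of the commit), keeping the comparisons exactly as case-sensitive as the code above:

def connection_should_close(response_protocol, connection_header):
    # HTTP/1.1 is persistent by default, closed only on "Connection: close";
    # HTTP/1.0 closes by default unless the client sends "Connection: Keep-Alive".
    if response_protocol == 'HTTP/1.1':
        return connection_header == 'close'
    return connection_header != 'Keep-Alive'
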
     def simple_response(self, status_code, status_text, msg=""):
         abort = status_code in (httplib.REQUEST_ENTITY_TOO_LARGE, httplib.REQUEST_URI_TOO_LONG)
         if abort:


@@ -546,6 +546,12 @@ class ServerLoop(object):
         # socket activation
         allow_socket_preallocation=True,
+
+        # Max. size of single header
+        max_header_line_size=8192,  # 8 KB
+
+        # Max. size of a request
+        max_request_body_size=500 * 1024 * 1024,
 
         # no_delay turns on TCP_NODELAY which decreases latency at the cost of
         # worse overall performance when sending multiple small packets. It
         # prevents the TCP stack from aggregating multiple small TCP packets.
@@ -568,6 +574,8 @@ class ServerLoop(object):
         self.no_delay = no_delay
         self.request_queue_size = request_queue_size
         self.timeout = timeout
+        self.max_header_line_size = max_header_line_size
+        self.max_request_body_size = max_request_body_size
         self.shutdown_timeout = shutdown_timeout
         ba = bind_address
         if not isinstance(ba, basestring):
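
[Editor's note] Taken together, the two new knobs bound per-request memory use: header lines are capped per line while being read, and over-large bodies are rejected up front via Content-Length. A hypothetical construction (assuming ServerLoop's remaining arguments can be left at their defaults; the keyword names are the ones added above):

loop = ServerLoop(
    max_header_line_size=16 * 1024,           # raise the 8 KB per-line header cap
    max_request_body_size=100 * 1024 * 1024,  # reject bodies over 100 MB with a 413
)

HTTPPair picks these limits up through conn.server_loop, as seen in __init__ and read_request_headers above.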