Implement parsing of HTTP requests

Kovid Goyal 2015-05-17 18:38:33 +05:30
parent 1617721b99
commit 656d0a1c10
2 changed files with 246 additions and 3 deletions


@@ -6,19 +6,132 @@ from __future__ import (unicode_literals, division, absolute_import,
__license__ = 'GPL v3'
__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
import httplib, socket, re
from urllib import unquote
from urlparse import parse_qs
from functools import partial
from calibre import as_unicode
from calibre.srv.errors import MaxSizeExceeded, NonHTTPConnRequest
HTTP1 = 'HTTP/1.0'
HTTP11 = 'HTTP/1.1'
protocol_map = {(1, 0):HTTP1, (1, 1):HTTP11}
quoted_slash = re.compile(br'%2[fF]')
def parse_request_uri(uri): # {{{
"""Parse a Request-URI into (scheme, authority, path).
Note that Request-URIs must be one of::
Request-URI = "*" | absoluteURI | abs_path | authority
Therefore, a Request-URI which starts with a double forward-slash
cannot be a "net_path"::
net_path = "//" authority [ abs_path ]
Instead, it must be interpreted as an "abs_path" with an empty first
path segment::
abs_path = "/" path_segments
path_segments = segment *( "/" segment )
segment = *pchar *( ";" param )
param = *pchar
"""
if uri == b'*':
return None, None, uri
i = uri.find(b'://')
if i > 0 and b'?' not in uri[:i]:
# An absoluteURI.
# If there's a scheme (and it must be http or https), then:
# http_URL = "http:" "//" host [ ":" port ] [ abs_path [ "?" query
# ]]
scheme, remainder = uri[:i].lower(), uri[i + 3:]
authority, path = remainder.split(b'/', 1)
path = b'/' + path
return scheme, authority, path
if uri.startswith(b'/'):
# An abs_path.
return None, None, uri
else:
# An authority.
return None, uri, None
# }}}
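For reference (an illustration, not part of the commit), these are the values parse_request_uri() returns for each Request-URI form described in the docstring, given the bytes-based Python 2 API above:

parse_request_uri(b'*')                         # -> (None, None, b'*')                   "*" form
parse_request_uri(b'http://example.com/x?q=1')  # -> (b'http', b'example.com', b'/x?q=1')  absoluteURI
parse_request_uri(b'/x/y;z')                    # -> (None, None, b'/x/y;z')               abs_path
parse_request_uri(b'example.com:8080')          # -> (None, b'example.com:8080', None)     authority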
comma_separated_headers = {
b'Accept', b'Accept-Charset', b'Accept-Encoding',
b'Accept-Language', b'Accept-Ranges', b'Allow', b'Cache-Control',
b'Connection', b'Content-Encoding', b'Content-Language', b'Expect',
b'If-Match', b'If-None-Match', b'Pragma', b'Proxy-Authenticate', b'TE',
b'Trailer', b'Transfer-Encoding', b'Upgrade', b'Vary', b'Via', b'Warning',
b'WWW-Authenticate'
}
def read_headers(readline, hdict=None): # {{{
"""
Read headers from the given stream into the given header dict.
If hdict is None, a new header dict is created. Returns the populated
header dict.
Headers which are repeated are folded together using a comma if their
specification so dictates.
This function raises ValueError when the read bytes violate the HTTP spec.
You should probably return "400 Bad Request" if this happens.
"""
if hdict is None:
hdict = {}
while True:
line = readline()
if not line:
# No more data--illegal end of headers
raise ValueError("Illegal end of headers.")
if line == b'\r\n':
# Normal end of headers
break
if not line.endswith(b'\r\n'):
raise ValueError("HTTP requires CRLF terminators")
if line[0] in (b' ', b'\t'):
# It's a continuation line.
v = line.strip()
else:
try:
k, v = line.split(b':', 1)
except ValueError:
raise ValueError("Illegal header line.")
k = k.strip().title()
v = v.strip()
hname = k.decode('ascii')
if k in comma_separated_headers:
existing = hdict.get(hname)
if existing:
v = b", ".join((existing, v))
try:
v = v.decode('ascii')
except UnicodeDecodeError:
if hname in 'Transfer-Encoding Connection Keep-Alive Expect':
raise
hdict[hname] = v
return hdict
# }}}
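As an illustration (not part of the diff), read_headers() only needs a zero-argument readline callable, so it can be exercised with an in-memory buffer; note how the repeated comma-separated Accept header is folded:

from io import BytesIO

buf = BytesIO(b'Accept: text/html\r\n'
              b'Accept: application/xml\r\n'
              b'Host: example.com\r\n'
              b'\r\n')
headers = read_headers(buf.readline)
# headers == {'Accept': 'text/html, application/xml', 'Host': 'example.com'}
# A header line without a CRLF terminator, or data that runs out before the
# blank line, raises ValueError (i.e. a 400 Bad Request for the caller).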
def http_communicate(conn):
' Represents interaction with an HTTP client over a single, persistent connection '
request_seen = False
try:
while True:
# (re)set req to None so that if something goes wrong in
# the HTTPPair constructor, the error doesn't
# get written to the previous request.
req = None
req = conn.server_loop.http_handler(conn)
@@ -60,7 +173,8 @@ class HTTPPair(object):
def __init__(self, conn):
self.conn = conn
self.server_loop = conn.server_loop
self.max_header_line_size = self.server_loop.max_header_line_size
self.scheme = 'http' if self.server_loop.ssl_context is None else 'https'
self.inheaders = {}
self.outheaders = []
@@ -103,6 +217,127 @@ class HTTPPair(object):
self.ready = True
def read_request_line(self):
request_line = self.conn.socket_file.readline(maxsize=self.max_header_line_size)
# Set started_request to True so http_communicate() knows to send 408
# from here on out.
self.started_request = True
if not request_line:
return False
if request_line == b'\r\n':
# RFC 2616 sec 4.1: "...if the server is reading the protocol
# stream at the beginning of a message and receives a CRLF
# first, it should ignore the CRLF."
# But only ignore one leading line! else we enable a DoS.
request_line = self.conn.socket_file.readline(maxsize=self.max_header_line_size)
if not request_line:
return False
if not request_line.endswith(b'\r\n'):
self.simple_response(
httplib.BAD_REQUEST, 'Bad Request', "HTTP requires CRLF terminators")
return False
try:
method, uri, req_protocol = request_line.strip().split(b' ', 2)
rp = int(req_protocol[5]), int(req_protocol[7])
self.method = method.decode('ascii')
except (ValueError, IndexError):
self.simple_response(httplib.BAD_REQUEST, "Bad Request", "Malformed Request-Line")
return False
try:
self.request_protocol = protocol_map[rp]
except KeyError:
self.simple_response(httplib.HTTP_VERSION_NOT_SUPPORTED, "HTTP Version Not Supported")
return False
scheme, authority, path = parse_request_uri(uri)
if b'#' in path:
self.simple_response(httplib.BAD_REQUEST, "Bad Request", "Illegal #fragment in Request-URI.")
return False
if scheme:
try:
self.scheme = scheme.decode('ascii')
except ValueError:
self.simple_response(httplib.BAD_REQUEST, "Bad Request", 'Un-decodeable scheme')
return False
qs = b''
if b'?' in path:
path, qs = path.split(b'?', 1)
try:
self.qs = {k.decode('utf-8'):tuple(x.decode('utf-8') for x in v) for k, v in parse_qs(qs, keep_blank_values=True).iteritems()}
except Exception:
self.simple_response(httplib.BAD_REQUEST, "Bad Request", "Malformed Request-Line",
'Unparseable query string')
return False
try:
path = '%2F'.join(unquote(x).decode('utf-8') for x in quoted_slash.split(path))
except ValueError as e:
self.simple_response(httplib.BAD_REQUEST, "Bad Request", as_unicode(e))
return False
self.path = tuple(x.replace('%2F', '/') for x in path.split('/'))
self.response_protocol = protocol_map[min((1, 1), rp)]
return True
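A standalone sketch (made-up URI, not part of the commit) of the decomposition read_request_line() performs on the path and query string: %2F survives as a literal slash inside a single path component, real slashes split components, and repeated query keys collect into tuples:

from urllib import unquote
from urlparse import parse_qs
import re

quoted_slash = re.compile(br'%2[fF]')
uri = b'/books%2Fall/New%20York?fmt=json&tag=fiction&tag=scifi'
path, qs = uri.split(b'?', 1)
qs = {k.decode('utf-8'): tuple(x.decode('utf-8') for x in v)
      for k, v in parse_qs(qs, keep_blank_values=True).iteritems()}
# qs == {'fmt': ('json',), 'tag': ('fiction', 'scifi')}
path = '%2F'.join(unquote(x).decode('utf-8') for x in quoted_slash.split(path))
path = tuple(x.replace('%2F', '/') for x in path.split('/'))
# path == ('', 'books/all', 'New York')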
def read_request_headers(self):
# then all the http headers
try:
read_headers(partial(self.conn.socket_file.readline, maxsize=self.max_header_line_size), self.inheaders)
content_length = int(self.inheaders.get('Content-Length', 0))
except ValueError as e:
self.simple_response(httplib.BAD_REQUEST, "Bad Request", as_unicode(e))
return False
if content_length > self.server_loop.max_request_body_size:
self.simple_response(
httplib.REQUEST_ENTITY_TOO_LARGE, "Request Entity Too Large",
"The entity sent with the request exceeds the maximum "
"allowed bytes (%d)." % self.server_loop.max_request_body_size)
return False
# Persistent connection support
if self.response_protocol is HTTP11:
# Both server and client are HTTP/1.1
if self.inheaders.get("Connection", "") == "close":
self.close_connection = True
else:
# Either the server or client (or both) are HTTP/1.0
if self.inheaders.get("Connection", "") != "Keep-Alive":
self.close_connection = True
# Transfer-Encoding support
te = ()
if self.response_protocol is HTTP11:
rte = self.inheaders.get("Transfer-Encoding")
if rte:
te = [x.strip().lower() for x in rte.split(",") if x.strip()]
self.chunked_read = False
if te:
for enc in te:
if enc == "chunked":
self.chunked_read = True
else:
# Note that, even if we see "chunked", we must reject
# if there is an extension we don't recognize.
self.simple_response(httplib.NOT_IMPLEMENTED, "Not Implemented", "Unknown transfer encoding: %s" % enc)
self.close_connection = True
return False
if self.inheaders.get("Expect", '').lower() == "100-continue":
# Don't use simple_response here, because it emits headers
# we don't want.
msg = HTTP11 + " 100 Continue\r\n\r\n"
self.flushed_write(msg.encode('ascii'))
return True
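For illustration only (plain string comparison instead of the module's protocol constants), the keep-alive rule applied above boils down to:

def connection_should_close(response_protocol, connection_header):
    # HTTP/1.1 connections persist unless the client explicitly asks to close;
    # HTTP/1.0 connections close unless the client explicitly asks for Keep-Alive.
    if response_protocol == 'HTTP/1.1':
        return connection_header == 'close'
    return connection_header != 'Keep-Alive'

# connection_should_close('HTTP/1.1', '')            -> False
# connection_should_close('HTTP/1.0', 'Keep-Alive')  -> False
# connection_should_close('HTTP/1.0', '')            -> True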
def simple_response(self, status_code, status_text, msg=""):
abort = status_code in (httplib.REQUEST_ENTITY_TOO_LARGE, httplib.REQUEST_URI_TOO_LONG)
if abort:


@@ -546,6 +546,12 @@ class ServerLoop(object):
# socket activation
allow_socket_preallocation=True,
# Max. size of a single header line
max_header_line_size=8192, # 8 KB
# Max. size of a request
max_request_body_size=500 * 1024 * 1024,
# no_delay turns on TCP_NODELAY which decreases latency at the cost of
# worse overall performance when sending multiple small packets. It
# prevents the TCP stack from aggregating multiple small TCP packets.
@@ -568,6 +574,8 @@ class ServerLoop(object):
self.no_delay = no_delay
self.request_queue_size = request_queue_size
self.timeout = timeout
self.max_header_line_size = max_header_line_size
self.max_request_body_size = max_request_body_size
self.shutdown_timeout = shutdown_timeout
ba = bind_address
if not isinstance(ba, basestring):
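The two new limits are ordinary keyword arguments of ServerLoop, so callers can override them at construction time. A hypothetical example (the handler argument and any other required parameters are assumptions, not taken from this diff):

loop = ServerLoop(
    http_handler,                             # assumed: whatever handler the caller already passes
    max_header_line_size=16 * 1024,           # allow 16 KB request/header lines
    max_request_body_size=100 * 1024 * 1024,  # cap request bodies at 100 MB
)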