diff --git a/CHANGES.rst b/CHANGES.rst index 7deea85304..3306e8e7dd 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -10,6 +10,15 @@ https://github.com/zopefoundation/Zope/blob/4.x/CHANGES.rst 5.8.1 (unreleased) ------------------ +- Replace ``cgi.FieldStorage`` by ``multipart`` avoiding + the ``cgi`` module deprecated by Python 3.11. + + Mark binary converters with a true ``binary`` attribute. + + Fix encoding handling and ``:bytes`` converter. + + See `#1094 `_. + - Clean out and refactor dependency configuration files. - Update to newest compatible versions of dependencies. diff --git a/setup.py b/setup.py index 40e9f746f3..8b15382c37 100644 --- a/setup.py +++ b/setup.py @@ -122,6 +122,7 @@ def _read_file(filename): 'zope.testing', 'zope.traversing', 'zope.viewlet', + 'multipart', ], include_package_data=True, zip_safe=False, diff --git a/src/ZPublisher/Converters.py b/src/ZPublisher/Converters.py index 6ce74559df..df8845a3f3 100644 --- a/src/ZPublisher/Converters.py +++ b/src/ZPublisher/Converters.py @@ -11,6 +11,17 @@ # ############################################################################## +"""Converters + +Used by `ZPublisher.HTTPRequest` and `OFS.PropertyManager`. + +Binary converters (i.e. converters which use `bytes` for/in their result) +are marked with a true `binary` attribute`. +This allows the publisher to perform the conversion to `bytes` +based on its more precise encoding knowledge. +""" + + import html import json import re @@ -42,6 +53,9 @@ def field2bytes(v): return bytes(v) +field2bytes.binary = True + + def field2text(value, nl=re.compile('\r\n|\n\r').search): value = field2string(value) match_object = nl(value) diff --git a/src/ZPublisher/HTTPRequest.py b/src/ZPublisher/HTTPRequest.py index d767941d2d..1c13ca51d3 100644 --- a/src/ZPublisher/HTTPRequest.py +++ b/src/ZPublisher/HTTPRequest.py @@ -20,13 +20,18 @@ import random import re import time -from cgi import FieldStorage from copy import deepcopy +from types import SimpleNamespace +from urllib.parse import parse_qsl from urllib.parse import unquote from urllib.parse import urlparse from AccessControl.tainted import should_be_tainted from AccessControl.tainted import taint_string +from multipart import Headers +from multipart import MultipartParser +from multipart import parse_options_header +from zExceptions import BadRequest from zope.component import queryUtility from zope.i18n.interfaces import IUserPreferredLanguages from zope.i18n.locales import LoadLocaleError @@ -47,6 +52,13 @@ from .cookie import getCookieValuePolicy +# DOS attack protection -- limiting the amount of memory for forms +# probably should become configurable +FORM_MEMORY_LIMIT = 2 ** 20 # memory limit for forms +FORM_DISK_LIMIT = 2 ** 30 # disk limit for forms +FORM_MEMFILE_LIMIT = 4000 # limit for `BytesIO` -> temporary file switch + + # This may get overwritten during configuration default_encoding = 'utf-8' @@ -494,17 +506,14 @@ def processInputs( environ['QUERY_STRING'] = '' meth = None - fs_kw = {} - fs_kw['encoding'] = self.charset - fs = ZopeFieldStorage( - fp=fp, environ=environ, keep_blank_values=1, **fs_kw) + fs = ZopeFieldStorage(fp, environ) # Keep a reference to the FieldStorage. Otherwise it's # __del__ method is called too early and closing FieldStorage.file. self._hold(fs) - if not hasattr(fs, 'list') or fs.list is None: + if not hasattr(fs, 'list'): if 'HTTP_SOAPACTION' in environ: # Stash XML request for interpretation by a SOAP-aware view other['SOAPXML'] = fs.value @@ -525,24 +534,55 @@ def processInputs( tainteddefaults = {} converter = None - for item in fslist: - + for item in fslist: # form data + # Note: + # we want to support 2 use cases + # 1. the form data has been created by the browser + # 2. the form data is free standing + # A browser internally works with character data, + # which it encodes for transmission to the server -- + # usually with `self.charset`. Therefore, we + # usually expect the form data to represent data + # in this charset. + # We make this same assumption also for free standing + # form data, i.e. we expect the form creator to know + # the server's charset. However, sometimes data cannot + # be represented in this charset (e.g. arbitrary binary + # data). To cover this case, we decode data + # with the `surrogateescape` error handler (see PEP 383). + # It allows to retrieve the original byte sequence. + # With an encoding modifier, the form creator + # can specify the correct encoding used by a form field value. + # Note: we always expect the form field name + # to be representable with `self.charset`. As those + # names are expected to be `ASCII`, this should be no + # big restriction. + # Note: the use of `surrogateescape` can lead to delayed + # problems when surrogates reach the application because + # they cannot be encoded with a standard error handler. + # We might want to prevent this. + character_encoding = '' # currently used encoding isFileUpload = 0 key = item.name if key is None: continue + key = item.name.encode("latin-1").decode(self.charset) if hasattr(item, 'file') and \ hasattr(item, 'filename') and \ hasattr(item, 'headers'): - if item.file and item.filename is not None: - item = FileUpload(item) - isFileUpload = 1 - else: - item = item.value + item = FileUpload(item, self.charset) + isFileUpload = 1 + else: + character_encoding = self.charset + item = item.value.decode( + character_encoding, "surrogateescape") + # from here on, `item` contains the field value + # either as `FileUpload` or `str` with + # `character_encoding` as encoding. + # `key` the field name (`str`) flags = 0 - character_encoding = '' # Variables for potentially unsafe values. tainted = None converter_type = None @@ -599,7 +639,15 @@ def processInputs( if not item: flags = flags | EMPTY elif has_codec(type_name): + # recode: + assert not isFileUpload, "cannot recode files" + item = item.encode( + character_encoding, "surrogateescape") character_encoding = type_name + # we do not use `surrogateescape` as + # we immediately want to determine + # an incompatible encoding modifier + item = item.decode(character_encoding) delim = key.rfind(':') if delim < 0: @@ -644,20 +692,11 @@ def processInputs( # defer conversion if flags & CONVERTED: try: - if character_encoding: - # We have a string with a specified character - # encoding. This gets passed to the converter - # either as unicode, if it can handle it, or - # crunched back down to utf-8 if it can not. - if isinstance(item, bytes): - item = str(item, character_encoding) - if hasattr(converter, 'convert_unicode'): - item = converter.convert_unicode(item) - else: - item = converter( - item.encode(default_encoding)) - else: - item = converter(item) + if character_encoding and \ + getattr(converter, "binary", False): + item = item.encode(character_encoding, + "surrogateescape") + item = converter(item) # Flag potentially unsafe values if converter_type in ('string', 'required', 'text', @@ -1632,15 +1671,117 @@ def sane_environment(env): return dict -class ZopeFieldStorage(FieldStorage): - """This subclass exists to work around a Python bug - (see https://bugs.python.org/issue27777) to make sure - we can read binary data from a request body. +class ValueDescriptor: + """(non data) descriptor to compute `value` from `file`.""" + def __get__(self, inst, owner=None): + if inst is None: + return self + file = inst.file + try: + fpos = file.tell() + except Exception: + fpos = None + try: + return file.read() + finally: + if fpos is not None: + file.seek(fpos) + + +class ValueAccessor: + value = ValueDescriptor() + + +class FormField(SimpleNamespace, ValueAccessor): + """represent a single form field. + + Typical attributes: + name + the field name + value + the field value (`bytes`) + + File fields additionally have the attributes: + file + a binary file containing the file content + filename + the file's name as reported by the client + headers + a case insensitive dict with header information; + usually `content-type` and `content-disposition`. + + Unless otherwise noted, `latin-1` decoded bytes + are used to represent textual data. """ - def read_binary(self): - self._binary_file = True - return FieldStorage.read_binary(self) + +class ZopeFieldStorage(ValueAccessor): + def __init__(self, fp, environ): + self.file = fp + method = environ.get("REQUEST_METHOD", "GET").upper() + qs = environ.get("QUERY_STRING", "") + hl = [] + content_type = environ.get("CONTENT_TYPE", + "application/x-www-form-urlencoded") + content_type = content_type + hl.append(("content-type", content_type)) + content_disposition = environ.get("CONTENT_DISPOSITION") + if content_disposition is not None: + hl.append(("content-disposition", content_disposition)) + self.headers = Headers(hl) + parts = () + if method == "POST": + try: + fpos = fp.tell() + except Exception: + fpos = None + if content_type.startswith("multipart/form-data"): + ct, options = parse_options_header(content_type) + parts = MultipartParser( + fp, options["boundary"], + mem_limit=FORM_MEMORY_LIMIT, + disk_limit=FORM_DISK_LIMIT, + memfile_limit=FORM_MEMFILE_LIMIT, + charset="latin-1").parts() + elif content_type == "application/x-www-form-urlencoded": + if qs: + qs += "&" + qs += fp.read(FORM_MEMORY_LIMIT).decode("latin-1") + if fp.read(1): + raise BadRequest("form data processing " + "requires too much memory") + else: + # `processInputs` currently expects either + # form values or a response body, not both. + # reset `qs` to fulfill this expectation. + qs = "" + if fpos is not None: + fp.seek(fpos) + elif method not in ("GET", "HEAD"): + # `processInputs` currently expects either + # form values or a response body, not both. + # reset `qs` to fulfill this expectation. + qs = "" + fl = [] + add_field = fl.append + for name, val in parse_qsl( + qs, # noqa: E121 + keep_blank_values=True, encoding="latin-1"): + add_field(FormField( + name=name, value=val.encode("latin-1"))) + for part in parts: + if part.filename: + # a file + field = FormField( + name=part.name, + file=part.file, + filename=part.filename, + headers=part.headers) + else: + field = FormField(name=part.name, value=part.raw) + add_field(field) + if fl: + self.list = fl # Original version: zope.publisher.browser.FileUpload @@ -1660,11 +1801,12 @@ class FileUpload: # that protected code can use DTML to work with FileUploads. __allow_access_to_unprotected_subobjects__ = 1 - def __init__(self, aFieldStorage): + def __init__(self, aFieldStorage, charset): self.file = aFieldStorage.file self.headers = aFieldStorage.headers - self.filename = aFieldStorage.filename - self.name = aFieldStorage.name + self.filename = aFieldStorage.filename\ + .encode("latin-1").decode(charset) + self.name = aFieldStorage.name.encode("latin-1").decode(charset) # Add an assertion to the rfc822.Message object that implements # self.headers so that managed code can access them. diff --git a/src/ZPublisher/tests/testHTTPRequest.py b/src/ZPublisher/tests/testHTTPRequest.py index acd397f1fd..377eb751e1 100644 --- a/src/ZPublisher/tests/testHTTPRequest.py +++ b/src/ZPublisher/tests/testHTTPRequest.py @@ -16,6 +16,7 @@ import warnings from contextlib import contextmanager from io import BytesIO +from unittest.mock import patch from AccessControl.tainted import should_be_tainted from zExceptions import NotFound @@ -26,6 +27,7 @@ from zope.publisher.browser import BrowserLanguages from zope.publisher.interfaces.http import IHTTPRequest from zope.testing.cleanup import cleanUp +from ZPublisher.HTTPRequest import BadRequest from ZPublisher.HTTPRequest import search_type from ZPublisher.interfaces import IXmlrpcChecker from ZPublisher.tests.testBaseRequest import TestRequestViewsBase @@ -106,8 +108,11 @@ def _getTargetClass(self): from ZPublisher.HTTPRequest import HTTPRequest return HTTPRequest - def _makePostEnviron(self, body=b''): + def _makePostEnviron(self, body=b'', multipart=True): environ = TEST_POST_ENVIRON.copy() + environ["CONTENT_TYPE"] = \ + multipart and 'multipart/form-data; boundary=12345' \ + or 'application/x-www-form-urlencoded' environ['CONTENT_LENGTH'] = str(len(body)) return environ @@ -1299,6 +1304,25 @@ def test_url_scheme(self): req = self._makeOne(environ=env) self.assertEqual(req['SERVER_URL'], 'https://myhost') + def test_form_urlencoded(self): + body = b"a=1" + env = self._makePostEnviron(body, False) + req = self._makeOne(stdin=BytesIO(body), environ=env) + req.processInputs() + self.assertEqual(req.form["a"], "1") + req = self._makeOne(stdin=BytesIO(body), environ=env) + with patch("ZPublisher.HTTPRequest.FORM_MEMORY_LIMIT", 1): + with self.assertRaises(BadRequest): + req.processInputs() + + def test_bytes_converter(self): + val = "äöü".encode("latin-1") + body = b"a:bytes:latin-1=" + val + env = self._makePostEnviron(body, False) + req = self._makeOne(stdin=BytesIO(body), environ=env) + req.processInputs() + self.assertEqual(req.form["a"], val) + class TestHTTPRequestZope3Views(TestRequestViewsBase): @@ -1356,7 +1380,6 @@ def test_special(self): TEST_POST_ENVIRON = { - 'CONTENT_TYPE': 'multipart/form-data; boundary=12345', 'CONTENT_LENGTH': None, 'REQUEST_METHOD': 'POST', 'SERVER_NAME': 'localhost', @@ -1378,4 +1401,7 @@ def test_special(self): Content-Disposition: form-data; name="largefile"; filename="largefile" Content-Type: application/octet-stream -test ''' + (b'test' * 1000) + b'\n\n' +test %s + +--12345-- +''' % (b'test' * 1000) diff --git a/src/ZPublisher/tests/testPostTraversal.py b/src/ZPublisher/tests/testPostTraversal.py index a7c43af9e7..b6c5c806f7 100644 --- a/src/ZPublisher/tests/testPostTraversal.py +++ b/src/ZPublisher/tests/testPostTraversal.py @@ -6,7 +6,11 @@ from ZPublisher.HTTPResponse import HTTPResponse -Zope2.startup_wsgi() +class WSGILayer: + @staticmethod + def setUp(): + Zope2.startup_wsgi() + pt_simple_was_run = 0 @@ -56,6 +60,7 @@ def __before_publishing_traverse__(self, object, REQUEST): class TestBaseRequestPT(TestCase): + layer = WSGILayer def setUp(self): self.root = DummyObjectBasic() diff --git a/versions-prod.cfg b/versions-prod.cfg index ae603b6c7c..e368c066bd 100644 --- a/versions-prod.cfg +++ b/versions-prod.cfg @@ -81,6 +81,7 @@ zope.testbrowser = 5.6.1 zope.testing = 5.0.1 zope.traversing = 4.4.1 zope.viewlet = 4.3 +multipart = 0.2.4 [versions:python37] # PasteDeploy 3.x works on Python 3.7 but pulls tons of dependencies