Skip to content

Commit

Permalink
replace cgi.FieldStorage by multipart (#1094)
Browse files Browse the repository at this point in the history
* replace `cgi.FieldStorage` by `multipart`

* add CHANGES.rst log entry [skip ci]

* partially support non seekable input file -- this will not work when the request body is accessed twice; in those cases, the second access will give an empty result

* interpret a missing `CONTENT_TYPE` as `application/x-www-form-urlencoded` (required by some tests)

* better explain "binary converters" [skip ci]

* Update src/ZPublisher/Converters.py

Co-authored-by: Michael Howitz <[email protected]>
  • Loading branch information
d-maurer and Michael Howitz authored Jan 19, 2023
1 parent 3b3c67a commit 5b324f6
Show file tree
Hide file tree
Showing 7 changed files with 240 additions and 42 deletions.
9 changes: 9 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,15 @@ https://github.com/zopefoundation/Zope/blob/4.x/CHANGES.rst
5.8.1 (unreleased)
------------------

- Replace ``cgi.FieldStorage`` by ``multipart`` avoiding
the ``cgi`` module deprecated by Python 3.11.

Mark binary converters with a true ``binary`` attribute.

Fix encoding handling and ``:bytes`` converter.

See `#1094 <https://github.com/zopefoundation/Zope/pull/1094>`_.

- Clean out and refactor dependency configuration files.

- Update to newest compatible versions of dependencies.
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ def _read_file(filename):
'zope.testing',
'zope.traversing',
'zope.viewlet',
'multipart',
],
include_package_data=True,
zip_safe=False,
Expand Down
14 changes: 14 additions & 0 deletions src/ZPublisher/Converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,17 @@
#
##############################################################################

"""Converters
Used by `ZPublisher.HTTPRequest` and `OFS.PropertyManager`.
Binary converters (i.e. converters which use `bytes` for/in their result)
are marked with a true `binary` attribute`.
This allows the publisher to perform the conversion to `bytes`
based on its more precise encoding knowledge.
"""


import html
import json
import re
Expand Down Expand Up @@ -42,6 +53,9 @@ def field2bytes(v):
return bytes(v)


field2bytes.binary = True


def field2text(value, nl=re.compile('\r\n|\n\r').search):
value = field2string(value)
match_object = nl(value)
Expand Down
218 changes: 180 additions & 38 deletions src/ZPublisher/HTTPRequest.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,18 @@
import random
import re
import time
from cgi import FieldStorage
from copy import deepcopy
from types import SimpleNamespace
from urllib.parse import parse_qsl
from urllib.parse import unquote
from urllib.parse import urlparse

from AccessControl.tainted import should_be_tainted
from AccessControl.tainted import taint_string
from multipart import Headers
from multipart import MultipartParser
from multipart import parse_options_header
from zExceptions import BadRequest
from zope.component import queryUtility
from zope.i18n.interfaces import IUserPreferredLanguages
from zope.i18n.locales import LoadLocaleError
Expand All @@ -47,6 +52,13 @@
from .cookie import getCookieValuePolicy


# DOS attack protection -- limiting the amount of memory for forms
# probably should become configurable
FORM_MEMORY_LIMIT = 2 ** 20 # memory limit for forms
FORM_DISK_LIMIT = 2 ** 30 # disk limit for forms
FORM_MEMFILE_LIMIT = 4000 # limit for `BytesIO` -> temporary file switch


# This may get overwritten during configuration
default_encoding = 'utf-8'

Expand Down Expand Up @@ -494,17 +506,14 @@ def processInputs(
environ['QUERY_STRING'] = ''

meth = None
fs_kw = {}
fs_kw['encoding'] = self.charset

fs = ZopeFieldStorage(
fp=fp, environ=environ, keep_blank_values=1, **fs_kw)
fs = ZopeFieldStorage(fp, environ)

# Keep a reference to the FieldStorage. Otherwise it's
# __del__ method is called too early and closing FieldStorage.file.
self._hold(fs)

if not hasattr(fs, 'list') or fs.list is None:
if not hasattr(fs, 'list'):
if 'HTTP_SOAPACTION' in environ:
# Stash XML request for interpretation by a SOAP-aware view
other['SOAPXML'] = fs.value
Expand All @@ -525,24 +534,55 @@ def processInputs(
tainteddefaults = {}
converter = None

for item in fslist:

for item in fslist: # form data
# Note:
# we want to support 2 use cases
# 1. the form data has been created by the browser
# 2. the form data is free standing
# A browser internally works with character data,
# which it encodes for transmission to the server --
# usually with `self.charset`. Therefore, we
# usually expect the form data to represent data
# in this charset.
# We make this same assumption also for free standing
# form data, i.e. we expect the form creator to know
# the server's charset. However, sometimes data cannot
# be represented in this charset (e.g. arbitrary binary
# data). To cover this case, we decode data
# with the `surrogateescape` error handler (see PEP 383).
# It allows to retrieve the original byte sequence.
# With an encoding modifier, the form creator
# can specify the correct encoding used by a form field value.
# Note: we always expect the form field name
# to be representable with `self.charset`. As those
# names are expected to be `ASCII`, this should be no
# big restriction.
# Note: the use of `surrogateescape` can lead to delayed
# problems when surrogates reach the application because
# they cannot be encoded with a standard error handler.
# We might want to prevent this.
character_encoding = '' # currently used encoding
isFileUpload = 0
key = item.name
if key is None:
continue
key = item.name.encode("latin-1").decode(self.charset)

if hasattr(item, 'file') and \
hasattr(item, 'filename') and \
hasattr(item, 'headers'):
if item.file and item.filename is not None:
item = FileUpload(item)
isFileUpload = 1
else:
item = item.value
item = FileUpload(item, self.charset)
isFileUpload = 1
else:
character_encoding = self.charset
item = item.value.decode(
character_encoding, "surrogateescape")
# from here on, `item` contains the field value
# either as `FileUpload` or `str` with
# `character_encoding` as encoding.
# `key` the field name (`str`)

flags = 0
character_encoding = ''
# Variables for potentially unsafe values.
tainted = None
converter_type = None
Expand Down Expand Up @@ -599,7 +639,15 @@ def processInputs(
if not item:
flags = flags | EMPTY
elif has_codec(type_name):
# recode:
assert not isFileUpload, "cannot recode files"
item = item.encode(
character_encoding, "surrogateescape")
character_encoding = type_name
# we do not use `surrogateescape` as
# we immediately want to determine
# an incompatible encoding modifier
item = item.decode(character_encoding)

delim = key.rfind(':')
if delim < 0:
Expand Down Expand Up @@ -644,20 +692,11 @@ def processInputs(
# defer conversion
if flags & CONVERTED:
try:
if character_encoding:
# We have a string with a specified character
# encoding. This gets passed to the converter
# either as unicode, if it can handle it, or
# crunched back down to utf-8 if it can not.
if isinstance(item, bytes):
item = str(item, character_encoding)
if hasattr(converter, 'convert_unicode'):
item = converter.convert_unicode(item)
else:
item = converter(
item.encode(default_encoding))
else:
item = converter(item)
if character_encoding and \
getattr(converter, "binary", False):
item = item.encode(character_encoding,
"surrogateescape")
item = converter(item)

# Flag potentially unsafe values
if converter_type in ('string', 'required', 'text',
Expand Down Expand Up @@ -1632,15 +1671,117 @@ def sane_environment(env):
return dict


class ZopeFieldStorage(FieldStorage):
"""This subclass exists to work around a Python bug
(see https://bugs.python.org/issue27777) to make sure
we can read binary data from a request body.
class ValueDescriptor:
"""(non data) descriptor to compute `value` from `file`."""
def __get__(self, inst, owner=None):
if inst is None:
return self
file = inst.file
try:
fpos = file.tell()
except Exception:
fpos = None
try:
return file.read()
finally:
if fpos is not None:
file.seek(fpos)


class ValueAccessor:
value = ValueDescriptor()


class FormField(SimpleNamespace, ValueAccessor):
"""represent a single form field.
Typical attributes:
name
the field name
value
the field value (`bytes`)
File fields additionally have the attributes:
file
a binary file containing the file content
filename
the file's name as reported by the client
headers
a case insensitive dict with header information;
usually `content-type` and `content-disposition`.
Unless otherwise noted, `latin-1` decoded bytes
are used to represent textual data.
"""

def read_binary(self):
self._binary_file = True
return FieldStorage.read_binary(self)

class ZopeFieldStorage(ValueAccessor):
def __init__(self, fp, environ):
self.file = fp
method = environ.get("REQUEST_METHOD", "GET").upper()
qs = environ.get("QUERY_STRING", "")
hl = []
content_type = environ.get("CONTENT_TYPE",
"application/x-www-form-urlencoded")
content_type = content_type
hl.append(("content-type", content_type))
content_disposition = environ.get("CONTENT_DISPOSITION")
if content_disposition is not None:
hl.append(("content-disposition", content_disposition))
self.headers = Headers(hl)
parts = ()
if method == "POST":
try:
fpos = fp.tell()
except Exception:
fpos = None
if content_type.startswith("multipart/form-data"):
ct, options = parse_options_header(content_type)
parts = MultipartParser(
fp, options["boundary"],
mem_limit=FORM_MEMORY_LIMIT,
disk_limit=FORM_DISK_LIMIT,
memfile_limit=FORM_MEMFILE_LIMIT,
charset="latin-1").parts()
elif content_type == "application/x-www-form-urlencoded":
if qs:
qs += "&"
qs += fp.read(FORM_MEMORY_LIMIT).decode("latin-1")
if fp.read(1):
raise BadRequest("form data processing "
"requires too much memory")
else:
# `processInputs` currently expects either
# form values or a response body, not both.
# reset `qs` to fulfill this expectation.
qs = ""
if fpos is not None:
fp.seek(fpos)
elif method not in ("GET", "HEAD"):
# `processInputs` currently expects either
# form values or a response body, not both.
# reset `qs` to fulfill this expectation.
qs = ""
fl = []
add_field = fl.append
for name, val in parse_qsl(
qs, # noqa: E121
keep_blank_values=True, encoding="latin-1"):
add_field(FormField(
name=name, value=val.encode("latin-1")))
for part in parts:
if part.filename:
# a file
field = FormField(
name=part.name,
file=part.file,
filename=part.filename,
headers=part.headers)
else:
field = FormField(name=part.name, value=part.raw)
add_field(field)
if fl:
self.list = fl


# Original version: zope.publisher.browser.FileUpload
Expand All @@ -1660,11 +1801,12 @@ class FileUpload:
# that protected code can use DTML to work with FileUploads.
__allow_access_to_unprotected_subobjects__ = 1

def __init__(self, aFieldStorage):
def __init__(self, aFieldStorage, charset):
self.file = aFieldStorage.file
self.headers = aFieldStorage.headers
self.filename = aFieldStorage.filename
self.name = aFieldStorage.name
self.filename = aFieldStorage.filename\
.encode("latin-1").decode(charset)
self.name = aFieldStorage.name.encode("latin-1").decode(charset)

# Add an assertion to the rfc822.Message object that implements
# self.headers so that managed code can access them.
Expand Down
Loading

0 comments on commit 5b324f6

Please sign in to comment.