bpo-29979: Rewrite cgi.parse_multipart to make it consistent with FieldStorage (#991)

2025-12-04 00:30:19 +00:00 · 2017-05-08 14:08:34 +02:00 · 2017-05-08 14:08:34 +02:00 · cc3fa204d3
commit cc3fa204d3
parent f34c685020
5 changed files with 38 additions and 102 deletions
--- a/Lib/cgi.py
+++ b/Lib/cgi.py
@ -198,105 +198,28 @@ def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
         DeprecationWarning, 2)
    return urllib.parse.parse_qsl(qs, keep_blank_values, strict_parsing)

-def parse_multipart(fp, pdict):
+def parse_multipart(fp, pdict, encoding="utf-8"):
    """Parse multipart input.

    Arguments:
    fp   : input file
    pdict: dictionary containing other parameters of content-type header
+    encoding: request encoding

    Returns a dictionary just like parse_qs(): keys are the field names, each
-    value is a list of values for that field.  This is easy to use but not
-    much good if you are expecting megabytes to be uploaded -- in that case,
-    use the FieldStorage class instead which is much more flexible.  Note
-    that content-type is the raw, unparsed contents of the content-type
-    header.
-
-    XXX This does not parse nested multipart parts -- use FieldStorage for
-    that.
-
-    XXX This should really be subsumed by FieldStorage altogether -- no
-    point in having two implementations of the same parsing algorithm.
-    Also, FieldStorage protects itself better against certain DoS attacks
-    by limiting the size of the data read in one chunk.  The API here
-    does not support that kind of protection.  This also affects parse()
-    since it can call parse_multipart().
-
+    value is a list of values for that field. For non-file fields, the value
+    is a list of strings.
    """
-    import http.client
-
-    boundary = b""
-    if 'boundary' in pdict:
-        boundary = pdict['boundary']
-    if not valid_boundary(boundary):
-        raise ValueError('Invalid boundary in multipart form: %r'
-                            % (boundary,))
-
-    nextpart = b"--" + boundary
-    lastpart = b"--" + boundary + b"--"
-    partdict = {}
-    terminator = b""
-
-    while terminator != lastpart:
-        bytes = -1
-        data = None
-        if terminator:
-            # At start of next part.  Read headers first.
-            headers = http.client.parse_headers(fp)
-            clength = headers.get('content-length')
-            if clength:
-                try:
-                    bytes = int(clength)
-                except ValueError:
-                    pass
-            if bytes > 0:
-                if maxlen and bytes > maxlen:
-                    raise ValueError('Maximum content length exceeded')
-                data = fp.read(bytes)
-            else:
-                data = b""
-        # Read lines until end of part.
-        lines = []
-        while 1:
-            line = fp.readline()
-            if not line:
-                terminator = lastpart # End outer loop
-                break
-            if line.startswith(b"--"):
-                terminator = line.rstrip()
-                if terminator in (nextpart, lastpart):
-                    break
-            lines.append(line)
-        # Done with part.
-        if data is None:
-            continue
-        if bytes < 0:
-            if lines:
-                # Strip final line terminator
-                line = lines[-1]
-                if line[-2:] == b"\r\n":
-                    line = line[:-2]
-                elif line[-1:] == b"\n":
-                    line = line[:-1]
-                lines[-1] = line
-                data = b"".join(lines)
-        line = headers['content-disposition']
-        if not line:
-            continue
-        key, params = parse_header(line)
-        if key != 'form-data':
-            continue
-        if 'name' in params:
-            name = params['name']
-        else:
-            continue
-        if name in partdict:
-            partdict[name].append(data)
-        else:
-            partdict[name] = [data]
-
-    return partdict
-
+    # RFC 2026, Section 5.1 : The "multipart" boundary delimiters are always
+    # represented as 7bit US-ASCII.
+    boundary = pdict['boundary'].decode('ascii')
+    ctype = "multipart/form-data; boundary={}".format(boundary)
+    headers = Message()
+    headers.set_type(ctype)
+    headers['Content-Length'] = pdict['CONTENT-LENGTH']
+    fs = FieldStorage(fp, headers=headers, encoding=encoding,
+        environ={'REQUEST_METHOD': 'POST'})
+    return {k: fs.getlist(k) for k in fs}

 def _parseparam(s):
    while s[:1] == ';':