123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796 |
- """
- Multi-part parsing for file uploads.
-
- Exposes one class, ``MultiPartParser``, which feeds chunks of uploaded data to
- file upload handlers for processing.
- """
- import base64
- import binascii
- import collections
- import html
- from urllib.parse import unquote
-
- from django.conf import settings
- from django.core.exceptions import (
- RequestDataTooBig,
- SuspiciousMultipartForm,
- TooManyFieldsSent,
- TooManyFilesSent,
- )
- from django.core.files.uploadhandler import SkipFile, StopFutureHandlers, StopUpload
- from django.utils.datastructures import MultiValueDict
- from django.utils.encoding import force_str
- from django.utils.regex_helper import _lazy_re_compile
-
- __all__ = ("MultiPartParser", "MultiPartParserError", "InputStreamExhausted")
-
-
- class MultiPartParserError(Exception):
- pass
-
-
- class InputStreamExhausted(Exception):
- """
- No more reads are allowed from this device.
- """
-
- pass
-
-
- RAW = "raw"
- FILE = "file"
- FIELD = "field"
- FIELD_TYPES = frozenset([FIELD, RAW])
-
-
- class MultiPartParser:
- """
- A rfc2388 multipart/form-data parser.
-
- ``MultiValueDict.parse()`` reads the input stream in ``chunk_size`` chunks
- and returns a tuple of ``(MultiValueDict(POST), MultiValueDict(FILES))``.
- """
-
- boundary_re = _lazy_re_compile(rb"[ -~]{0,200}[!-~]")
-
- def __init__(self, META, input_data, upload_handlers, encoding=None):
- """
- Initialize the MultiPartParser object.
-
- :META:
- The standard ``META`` dictionary in Django request objects.
- :input_data:
- The raw post data, as a file-like object.
- :upload_handlers:
- A list of UploadHandler instances that perform operations on the
- uploaded data.
- :encoding:
- The encoding with which to treat the incoming data.
- """
- # Content-Type should contain multipart and the boundary information.
- content_type = META.get("CONTENT_TYPE", "")
- if not content_type.startswith("multipart/"):
- raise MultiPartParserError("Invalid Content-Type: %s" % content_type)
-
- # Parse the header to get the boundary to split the parts.
- try:
- ctypes, opts = parse_header(content_type.encode("ascii"))
- except UnicodeEncodeError:
- raise MultiPartParserError(
- "Invalid non-ASCII Content-Type in multipart: %s"
- % force_str(content_type)
- )
- boundary = opts.get("boundary")
- if not boundary or not self.boundary_re.fullmatch(boundary):
- raise MultiPartParserError(
- "Invalid boundary in multipart: %s" % force_str(boundary)
- )
-
- # Content-Length should contain the length of the body we are about
- # to receive.
- try:
- content_length = int(META.get("CONTENT_LENGTH", 0))
- except (ValueError, TypeError):
- content_length = 0
-
- if content_length < 0:
- # This means we shouldn't continue...raise an error.
- raise MultiPartParserError("Invalid content length: %r" % content_length)
-
- if isinstance(boundary, str):
- boundary = boundary.encode("ascii")
- self._boundary = boundary
- self._input_data = input_data
-
- # For compatibility with low-level network APIs (with 32-bit integers),
- # the chunk size should be < 2^31, but still divisible by 4.
- possible_sizes = [x.chunk_size for x in upload_handlers if x.chunk_size]
- self._chunk_size = min([2**31 - 4] + possible_sizes)
-
- self._meta = META
- self._encoding = encoding or settings.DEFAULT_CHARSET
- self._content_length = content_length
- self._upload_handlers = upload_handlers
-
- def parse(self):
- # Call the actual parse routine and close all open files in case of
- # errors. This is needed because if exceptions are thrown the
- # MultiPartParser will not be garbage collected immediately and
- # resources would be kept alive. This is only needed for errors because
- # the Request object closes all uploaded files at the end of the
- # request.
- try:
- return self._parse()
- except Exception:
- if hasattr(self, "_files"):
- for _, files in self._files.lists():
- for fileobj in files:
- fileobj.close()
- raise
-
- def _parse(self):
- """
- Parse the POST data and break it into a FILES MultiValueDict and a POST
- MultiValueDict.
-
- Return a tuple containing the POST and FILES dictionary, respectively.
- """
- from django.http import QueryDict
-
- encoding = self._encoding
- handlers = self._upload_handlers
-
- # HTTP spec says that Content-Length >= 0 is valid
- # handling content-length == 0 before continuing
- if self._content_length == 0:
- return QueryDict(encoding=self._encoding), MultiValueDict()
-
- # See if any of the handlers take care of the parsing.
- # This allows overriding everything if need be.
- for handler in handlers:
- result = handler.handle_raw_input(
- self._input_data,
- self._meta,
- self._content_length,
- self._boundary,
- encoding,
- )
- # Check to see if it was handled
- if result is not None:
- return result[0], result[1]
-
- # Create the data structures to be used later.
- self._post = QueryDict(mutable=True)
- self._files = MultiValueDict()
-
- # Instantiate the parser and stream:
- stream = LazyStream(ChunkIter(self._input_data, self._chunk_size))
-
- # Whether or not to signal a file-completion at the beginning of the loop.
- old_field_name = None
- counters = [0] * len(handlers)
-
- # Number of bytes that have been read.
- num_bytes_read = 0
- # To count the number of keys in the request.
- num_post_keys = 0
- # To count the number of files in the request.
- num_files = 0
- # To limit the amount of data read from the request.
- read_size = None
- # Whether a file upload is finished.
- uploaded_file = True
-
- try:
- for item_type, meta_data, field_stream in Parser(stream, self._boundary):
- if old_field_name:
- # We run this at the beginning of the next loop
- # since we cannot be sure a file is complete until
- # we hit the next boundary/part of the multipart content.
- self.handle_file_complete(old_field_name, counters)
- old_field_name = None
- uploaded_file = True
-
- if (
- item_type in FIELD_TYPES
- and settings.DATA_UPLOAD_MAX_NUMBER_FIELDS is not None
- ):
- # Avoid storing more than DATA_UPLOAD_MAX_NUMBER_FIELDS.
- num_post_keys += 1
- # 2 accounts for empty raw fields before and after the
- # last boundary.
- if settings.DATA_UPLOAD_MAX_NUMBER_FIELDS + 2 < num_post_keys:
- raise TooManyFieldsSent(
- "The number of GET/POST parameters exceeded "
- "settings.DATA_UPLOAD_MAX_NUMBER_FIELDS."
- )
-
- try:
- disposition = meta_data["content-disposition"][1]
- field_name = disposition["name"].strip()
- except (KeyError, IndexError, AttributeError):
- continue
-
- transfer_encoding = meta_data.get("content-transfer-encoding")
- if transfer_encoding is not None:
- transfer_encoding = transfer_encoding[0].strip()
- field_name = force_str(field_name, encoding, errors="replace")
-
- if item_type == FIELD:
- # Avoid reading more than DATA_UPLOAD_MAX_MEMORY_SIZE.
- if settings.DATA_UPLOAD_MAX_MEMORY_SIZE is not None:
- read_size = (
- settings.DATA_UPLOAD_MAX_MEMORY_SIZE - num_bytes_read
- )
-
- # This is a post field, we can just set it in the post
- if transfer_encoding == "base64":
- raw_data = field_stream.read(size=read_size)
- num_bytes_read += len(raw_data)
- try:
- data = base64.b64decode(raw_data)
- except binascii.Error:
- data = raw_data
- else:
- data = field_stream.read(size=read_size)
- num_bytes_read += len(data)
-
- # Add two here to make the check consistent with the
- # x-www-form-urlencoded check that includes '&='.
- num_bytes_read += len(field_name) + 2
- if (
- settings.DATA_UPLOAD_MAX_MEMORY_SIZE is not None
- and num_bytes_read > settings.DATA_UPLOAD_MAX_MEMORY_SIZE
- ):
- raise RequestDataTooBig(
- "Request body exceeded "
- "settings.DATA_UPLOAD_MAX_MEMORY_SIZE."
- )
-
- self._post.appendlist(
- field_name, force_str(data, encoding, errors="replace")
- )
- elif item_type == FILE:
- # Avoid storing more than DATA_UPLOAD_MAX_NUMBER_FILES.
- num_files += 1
- if (
- settings.DATA_UPLOAD_MAX_NUMBER_FILES is not None
- and num_files > settings.DATA_UPLOAD_MAX_NUMBER_FILES
- ):
- raise TooManyFilesSent(
- "The number of files exceeded "
- "settings.DATA_UPLOAD_MAX_NUMBER_FILES."
- )
- # This is a file, use the handler...
- file_name = disposition.get("filename")
- if file_name:
- file_name = force_str(file_name, encoding, errors="replace")
- file_name = self.sanitize_file_name(file_name)
- if not file_name:
- continue
-
- content_type, content_type_extra = meta_data.get(
- "content-type", ("", {})
- )
- content_type = content_type.strip()
- charset = content_type_extra.get("charset")
-
- try:
- content_length = int(meta_data.get("content-length")[0])
- except (IndexError, TypeError, ValueError):
- content_length = None
-
- counters = [0] * len(handlers)
- uploaded_file = False
- try:
- for handler in handlers:
- try:
- handler.new_file(
- field_name,
- file_name,
- content_type,
- content_length,
- charset,
- content_type_extra,
- )
- except StopFutureHandlers:
- break
-
- for chunk in field_stream:
- if transfer_encoding == "base64":
- # We only special-case base64 transfer encoding
- # We should always decode base64 chunks by
- # multiple of 4, ignoring whitespace.
-
- stripped_chunk = b"".join(chunk.split())
-
- remaining = len(stripped_chunk) % 4
- while remaining != 0:
- over_chunk = field_stream.read(4 - remaining)
- if not over_chunk:
- break
- stripped_chunk += b"".join(over_chunk.split())
- remaining = len(stripped_chunk) % 4
-
- try:
- chunk = base64.b64decode(stripped_chunk)
- except Exception as exc:
- # Since this is only a chunk, any error is
- # an unfixable error.
- raise MultiPartParserError(
- "Could not decode base64 data."
- ) from exc
-
- for i, handler in enumerate(handlers):
- chunk_length = len(chunk)
- chunk = handler.receive_data_chunk(chunk, counters[i])
- counters[i] += chunk_length
- if chunk is None:
- # Don't continue if the chunk received by
- # the handler is None.
- break
-
- except SkipFile:
- self._close_files()
- # Just use up the rest of this file...
- exhaust(field_stream)
- else:
- # Handle file upload completions on next iteration.
- old_field_name = field_name
- else:
- # If this is neither a FIELD nor a FILE, exhaust the field
- # stream. Note: There could be an error here at some point,
- # but there will be at least two RAW types (before and
- # after the other boundaries). This branch is usually not
- # reached at all, because a missing content-disposition
- # header will skip the whole boundary.
- exhaust(field_stream)
- except StopUpload as e:
- self._close_files()
- if not e.connection_reset:
- exhaust(self._input_data)
- else:
- if not uploaded_file:
- for handler in handlers:
- handler.upload_interrupted()
- # Make sure that the request data is all fed
- exhaust(self._input_data)
-
- # Signal that the upload has completed.
- # any() shortcircuits if a handler's upload_complete() returns a value.
- any(handler.upload_complete() for handler in handlers)
- self._post._mutable = False
- return self._post, self._files
-
- def handle_file_complete(self, old_field_name, counters):
- """
- Handle all the signaling that takes place when a file is complete.
- """
- for i, handler in enumerate(self._upload_handlers):
- file_obj = handler.file_complete(counters[i])
- if file_obj:
- # If it returns a file object, then set the files dict.
- self._files.appendlist(
- force_str(old_field_name, self._encoding, errors="replace"),
- file_obj,
- )
- break
-
- def sanitize_file_name(self, file_name):
- """
- Sanitize the filename of an upload.
-
- Remove all possible path separators, even though that might remove more
- than actually required by the target system. Filenames that could
- potentially cause problems (current/parent dir) are also discarded.
-
- It should be noted that this function could still return a "filepath"
- like "C:some_file.txt" which is handled later on by the storage layer.
- So while this function does sanitize filenames to some extent, the
- resulting filename should still be considered as untrusted user input.
- """
- file_name = html.unescape(file_name)
- file_name = file_name.rsplit("/")[-1]
- file_name = file_name.rsplit("\\")[-1]
- # Remove non-printable characters.
- file_name = "".join([char for char in file_name if char.isprintable()])
-
- if file_name in {"", ".", ".."}:
- return None
- return file_name
-
- IE_sanitize = sanitize_file_name
-
- def _close_files(self):
- # Free up all file handles.
- # FIXME: this currently assumes that upload handlers store the file as 'file'
- # We should document that...
- # (Maybe add handler.free_file to complement new_file)
- for handler in self._upload_handlers:
- if hasattr(handler, "file"):
- handler.file.close()
-
-
- class LazyStream:
- """
- The LazyStream wrapper allows one to get and "unget" bytes from a stream.
-
- Given a producer object (an iterator that yields bytestrings), the
- LazyStream object will support iteration, reading, and keeping a "look-back"
- variable in case you need to "unget" some bytes.
- """
-
- def __init__(self, producer, length=None):
- """
- Every LazyStream must have a producer when instantiated.
-
- A producer is an iterable that returns a string each time it
- is called.
- """
- self._producer = producer
- self._empty = False
- self._leftover = b""
- self.length = length
- self.position = 0
- self._remaining = length
- self._unget_history = []
-
- def tell(self):
- return self.position
-
- def read(self, size=None):
- def parts():
- remaining = self._remaining if size is None else size
- # do the whole thing in one shot if no limit was provided.
- if remaining is None:
- yield b"".join(self)
- return
-
- # otherwise do some bookkeeping to return exactly enough
- # of the stream and stashing any extra content we get from
- # the producer
- while remaining != 0:
- assert remaining > 0, "remaining bytes to read should never go negative"
-
- try:
- chunk = next(self)
- except StopIteration:
- return
- else:
- emitting = chunk[:remaining]
- self.unget(chunk[remaining:])
- remaining -= len(emitting)
- yield emitting
-
- return b"".join(parts())
-
- def __next__(self):
- """
- Used when the exact number of bytes to read is unimportant.
-
- Return whatever chunk is conveniently returned from the iterator.
- Useful to avoid unnecessary bookkeeping if performance is an issue.
- """
- if self._leftover:
- output = self._leftover
- self._leftover = b""
- else:
- output = next(self._producer)
- self._unget_history = []
- self.position += len(output)
- return output
-
- def close(self):
- """
- Used to invalidate/disable this lazy stream.
-
- Replace the producer with an empty list. Any leftover bytes that have
- already been read will still be reported upon read() and/or next().
- """
- self._producer = []
-
- def __iter__(self):
- return self
-
- def unget(self, bytes):
- """
- Place bytes back onto the front of the lazy stream.
-
- Future calls to read() will return those bytes first. The
- stream position and thus tell() will be rewound.
- """
- if not bytes:
- return
- self._update_unget_history(len(bytes))
- self.position -= len(bytes)
- self._leftover = bytes + self._leftover
-
- def _update_unget_history(self, num_bytes):
- """
- Update the unget history as a sanity check to see if we've pushed
- back the same number of bytes in one chunk. If we keep ungetting the
- same number of bytes many times (here, 50), we're mostly likely in an
- infinite loop of some sort. This is usually caused by a
- maliciously-malformed MIME request.
- """
- self._unget_history = [num_bytes] + self._unget_history[:49]
- number_equal = len(
- [
- current_number
- for current_number in self._unget_history
- if current_number == num_bytes
- ]
- )
-
- if number_equal > 40:
- raise SuspiciousMultipartForm(
- "The multipart parser got stuck, which shouldn't happen with"
- " normal uploaded files. Check for malicious upload activity;"
- " if there is none, report this to the Django developers."
- )
-
-
- class ChunkIter:
- """
- An iterable that will yield chunks of data. Given a file-like object as the
- constructor, yield chunks of read operations from that object.
- """
-
- def __init__(self, flo, chunk_size=64 * 1024):
- self.flo = flo
- self.chunk_size = chunk_size
-
- def __next__(self):
- try:
- data = self.flo.read(self.chunk_size)
- except InputStreamExhausted:
- raise StopIteration()
- if data:
- return data
- else:
- raise StopIteration()
-
- def __iter__(self):
- return self
-
-
- class InterBoundaryIter:
- """
- A Producer that will iterate over boundaries.
- """
-
- def __init__(self, stream, boundary):
- self._stream = stream
- self._boundary = boundary
-
- def __iter__(self):
- return self
-
- def __next__(self):
- try:
- return LazyStream(BoundaryIter(self._stream, self._boundary))
- except InputStreamExhausted:
- raise StopIteration()
-
-
- class BoundaryIter:
- """
- A Producer that is sensitive to boundaries.
-
- Will happily yield bytes until a boundary is found. Will yield the bytes
- before the boundary, throw away the boundary bytes themselves, and push the
- post-boundary bytes back on the stream.
-
- The future calls to next() after locating the boundary will raise a
- StopIteration exception.
- """
-
- def __init__(self, stream, boundary):
- self._stream = stream
- self._boundary = boundary
- self._done = False
- # rollback an additional six bytes because the format is like
- # this: CRLF<boundary>[--CRLF]
- self._rollback = len(boundary) + 6
-
- # Try to use mx fast string search if available. Otherwise
- # use Python find. Wrap the latter for consistency.
- unused_char = self._stream.read(1)
- if not unused_char:
- raise InputStreamExhausted()
- self._stream.unget(unused_char)
-
- def __iter__(self):
- return self
-
- def __next__(self):
- if self._done:
- raise StopIteration()
-
- stream = self._stream
- rollback = self._rollback
-
- bytes_read = 0
- chunks = []
- for bytes in stream:
- bytes_read += len(bytes)
- chunks.append(bytes)
- if bytes_read > rollback:
- break
- if not bytes:
- break
- else:
- self._done = True
-
- if not chunks:
- raise StopIteration()
-
- chunk = b"".join(chunks)
- boundary = self._find_boundary(chunk)
-
- if boundary:
- end, next = boundary
- stream.unget(chunk[next:])
- self._done = True
- return chunk[:end]
- else:
- # make sure we don't treat a partial boundary (and
- # its separators) as data
- if not chunk[:-rollback]: # and len(chunk) >= (len(self._boundary) + 6):
- # There's nothing left, we should just return and mark as done.
- self._done = True
- return chunk
- else:
- stream.unget(chunk[-rollback:])
- return chunk[:-rollback]
-
- def _find_boundary(self, data):
- """
- Find a multipart boundary in data.
-
- Should no boundary exist in the data, return None. Otherwise, return
- a tuple containing the indices of the following:
- * the end of current encapsulation
- * the start of the next encapsulation
- """
- index = data.find(self._boundary)
- if index < 0:
- return None
- else:
- end = index
- next = index + len(self._boundary)
- # backup over CRLF
- last = max(0, end - 1)
- if data[last : last + 1] == b"\n":
- end -= 1
- last = max(0, end - 1)
- if data[last : last + 1] == b"\r":
- end -= 1
- return end, next
-
-
- def exhaust(stream_or_iterable):
- """Exhaust an iterator or stream."""
- try:
- iterator = iter(stream_or_iterable)
- except TypeError:
- iterator = ChunkIter(stream_or_iterable, 16384)
- collections.deque(iterator, maxlen=0) # consume iterator quickly.
-
-
- def parse_boundary_stream(stream, max_header_size):
- """
- Parse one and exactly one stream that encapsulates a boundary.
- """
- # Stream at beginning of header, look for end of header
- # and parse it if found. The header must fit within one
- # chunk.
- chunk = stream.read(max_header_size)
-
- # 'find' returns the top of these four bytes, so we'll
- # need to munch them later to prevent them from polluting
- # the payload.
- header_end = chunk.find(b"\r\n\r\n")
-
- def _parse_header(line):
- main_value_pair, params = parse_header(line)
- try:
- name, value = main_value_pair.split(":", 1)
- except ValueError:
- raise ValueError("Invalid header: %r" % line)
- return name, (value, params)
-
- if header_end == -1:
- # we find no header, so we just mark this fact and pass on
- # the stream verbatim
- stream.unget(chunk)
- return (RAW, {}, stream)
-
- header = chunk[:header_end]
-
- # here we place any excess chunk back onto the stream, as
- # well as throwing away the CRLFCRLF bytes from above.
- stream.unget(chunk[header_end + 4 :])
-
- TYPE = RAW
- outdict = {}
-
- # Eliminate blank lines
- for line in header.split(b"\r\n"):
- # This terminology ("main value" and "dictionary of
- # parameters") is from the Python docs.
- try:
- name, (value, params) = _parse_header(line)
- except ValueError:
- continue
-
- if name == "content-disposition":
- TYPE = FIELD
- if params.get("filename"):
- TYPE = FILE
-
- outdict[name] = value, params
-
- if TYPE == RAW:
- stream.unget(chunk)
-
- return (TYPE, outdict, stream)
-
-
- class Parser:
- def __init__(self, stream, boundary):
- self._stream = stream
- self._separator = b"--" + boundary
-
- def __iter__(self):
- boundarystream = InterBoundaryIter(self._stream, self._separator)
- for sub_stream in boundarystream:
- # Iterate over each part
- yield parse_boundary_stream(sub_stream, 1024)
-
-
- def parse_header(line):
- """
- Parse the header into a key-value.
-
- Input (line): bytes, output: str for key/name, bytes for values which
- will be decoded later.
- """
- plist = _parse_header_params(b";" + line)
- key = plist.pop(0).lower().decode("ascii")
- pdict = {}
- for p in plist:
- i = p.find(b"=")
- if i >= 0:
- has_encoding = False
- name = p[:i].strip().lower().decode("ascii")
- if name.endswith("*"):
- # Lang/encoding embedded in the value (like "filename*=UTF-8''file.ext")
- # https://tools.ietf.org/html/rfc2231#section-4
- name = name[:-1]
- if p.count(b"'") == 2:
- has_encoding = True
- value = p[i + 1 :].strip()
- if len(value) >= 2 and value[:1] == value[-1:] == b'"':
- value = value[1:-1]
- value = value.replace(b"\\\\", b"\\").replace(b'\\"', b'"')
- if has_encoding:
- encoding, lang, value = value.split(b"'")
- value = unquote(value.decode(), encoding=encoding.decode())
- pdict[name] = value
- return key, pdict
-
-
- def _parse_header_params(s):
- plist = []
- while s[:1] == b";":
- s = s[1:]
- end = s.find(b";")
- while end > 0 and (s.count(b'"', 0, end) - s.count(b'\\"', 0, end)) % 2:
- end = s.find(b";", end + 1)
- if end < 0:
- end = len(s)
- f = s[:end]
- plist.append(f.strip())
- s = s[end:]
- return plist
|