123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319 |
- # -*- test-case-name: twisted.python.test.test_zipstream -*-
- # Copyright (c) Twisted Matrix Laboratories.
- # See LICENSE for details.
-
- """
- An incremental approach to unzipping files. This allows you to unzip a little
- bit of a file at a time, which means you can report progress as a file unzips.
- """
-
- import os.path
- import struct
- import zipfile
- import zlib
-
-
- class ChunkingZipFile(zipfile.ZipFile):
- """
- A L{zipfile.ZipFile} object which, with L{readfile}, also gives you access
- to a file-like object for each entry.
- """
-
- def readfile(self, name):
- """
- Return file-like object for name.
- """
- if self.mode not in ("r", "a"):
- raise RuntimeError('read() requires mode "r" or "a"')
- if not self.fp:
- raise RuntimeError("Attempt to read ZIP archive that was already closed")
- zinfo = self.getinfo(name)
-
- self.fp.seek(zinfo.header_offset, 0)
-
- fheader = self.fp.read(zipfile.sizeFileHeader)
- if fheader[0:4] != zipfile.stringFileHeader:
- raise zipfile.BadZipfile("Bad magic number for file header")
-
- fheader = struct.unpack(zipfile.structFileHeader, fheader)
- fname = self.fp.read(fheader[zipfile._FH_FILENAME_LENGTH])
-
- if fheader[zipfile._FH_EXTRA_FIELD_LENGTH]:
- self.fp.read(fheader[zipfile._FH_EXTRA_FIELD_LENGTH])
-
- if zinfo.flag_bits & 0x800:
- # UTF-8 filename
- fname_str = fname.decode("utf-8")
- else:
- fname_str = fname.decode("cp437")
-
- if fname_str != zinfo.orig_filename:
- raise zipfile.BadZipfile(
- 'File name in directory "%s" and header "%s" differ.'
- % (zinfo.orig_filename, fname_str)
- )
-
- if zinfo.compress_type == zipfile.ZIP_STORED:
- return ZipFileEntry(self, zinfo.compress_size)
- elif zinfo.compress_type == zipfile.ZIP_DEFLATED:
- return DeflatedZipFileEntry(self, zinfo.compress_size)
- else:
- raise zipfile.BadZipfile(
- "Unsupported compression method %d for file %s"
- % (zinfo.compress_type, name)
- )
-
-
- class _FileEntry:
- """
- Abstract superclass of both compressed and uncompressed variants of
- file-like objects within a zip archive.
-
- @ivar chunkingZipFile: a chunking zip file.
- @type chunkingZipFile: L{ChunkingZipFile}
-
- @ivar length: The number of bytes within the zip file that represent this
- file. (This is the size on disk, not the number of decompressed bytes
- which will result from reading it.)
-
- @ivar fp: the underlying file object (that contains pkzip data). Do not
- touch this, please. It will quite likely move or go away.
-
- @ivar closed: File-like 'closed' attribute; True before this file has been
- closed, False after.
- @type closed: L{bool}
-
- @ivar finished: An older, broken synonym for 'closed'. Do not touch this,
- please.
- @type finished: L{int}
- """
-
- def __init__(self, chunkingZipFile, length):
- """
- Create a L{_FileEntry} from a L{ChunkingZipFile}.
- """
- self.chunkingZipFile = chunkingZipFile
- self.fp = self.chunkingZipFile.fp
- self.length = length
- self.finished = 0
- self.closed = False
-
- def isatty(self):
- """
- Returns false because zip files should not be ttys
- """
- return False
-
- def close(self):
- """
- Close self (file-like object)
- """
- self.closed = True
- self.finished = 1
- del self.fp
-
- def readline(self):
- """
- Read a line.
- """
- line = b""
- for byte in iter(lambda: self.read(1), b""):
- line += byte
- if byte == b"\n":
- break
- return line
-
- def __next__(self):
- """
- Implement next as file does (like readline, except raises StopIteration
- at EOF)
- """
- nextline = self.readline()
- if nextline:
- return nextline
- raise StopIteration()
-
- # Iterators on Python 2 use next(), not __next__()
- next = __next__
-
- def readlines(self):
- """
- Returns a list of all the lines
- """
- return list(self)
-
- def xreadlines(self):
- """
- Returns an iterator (so self)
- """
- return self
-
- def __iter__(self):
- """
- Returns an iterator (so self)
- """
- return self
-
- def __enter__(self):
- return self
-
- def __exit__(self, exc_type, exc_value, traceback):
- self.close()
-
-
- class ZipFileEntry(_FileEntry):
- """
- File-like object used to read an uncompressed entry in a ZipFile
- """
-
- def __init__(self, chunkingZipFile, length):
- _FileEntry.__init__(self, chunkingZipFile, length)
- self.readBytes = 0
-
- def tell(self):
- return self.readBytes
-
- def read(self, n=None):
- if n is None:
- n = self.length - self.readBytes
- if n == 0 or self.finished:
- return b""
- data = self.chunkingZipFile.fp.read(min(n, self.length - self.readBytes))
- self.readBytes += len(data)
- if self.readBytes == self.length or len(data) < n:
- self.finished = 1
- return data
-
-
- class DeflatedZipFileEntry(_FileEntry):
- """
- File-like object used to read a deflated entry in a ZipFile
- """
-
- def __init__(self, chunkingZipFile, length):
- _FileEntry.__init__(self, chunkingZipFile, length)
- self.returnedBytes = 0
- self.readBytes = 0
- self.decomp = zlib.decompressobj(-15)
- self.buffer = b""
-
- def tell(self):
- return self.returnedBytes
-
- def read(self, n=None):
- if self.finished:
- return b""
- if n is None:
- result = [
- self.buffer,
- ]
- result.append(
- self.decomp.decompress(
- self.chunkingZipFile.fp.read(self.length - self.readBytes)
- )
- )
- result.append(self.decomp.decompress(b"Z"))
- result.append(self.decomp.flush())
- self.buffer = b""
- self.finished = 1
- result = b"".join(result)
- self.returnedBytes += len(result)
- return result
- else:
- while len(self.buffer) < n:
- data = self.chunkingZipFile.fp.read(
- min(n, 1024, self.length - self.readBytes)
- )
- self.readBytes += len(data)
- if not data:
- result = (
- self.buffer + self.decomp.decompress(b"Z") + self.decomp.flush()
- )
- self.finished = 1
- self.buffer = b""
- self.returnedBytes += len(result)
- return result
- else:
- self.buffer += self.decomp.decompress(data)
- result = self.buffer[:n]
- self.buffer = self.buffer[n:]
- self.returnedBytes += len(result)
- return result
-
-
- DIR_BIT = 16
-
-
- def countZipFileChunks(filename, chunksize):
- """
- Predict the number of chunks that will be extracted from the entire
- zipfile, given chunksize blocks.
- """
- totalchunks = 0
- zf = ChunkingZipFile(filename)
- for info in zf.infolist():
- totalchunks += countFileChunks(info, chunksize)
- return totalchunks
-
-
- def countFileChunks(zipinfo, chunksize):
- """
- Count the number of chunks that will result from the given C{ZipInfo}.
-
- @param zipinfo: a C{zipfile.ZipInfo} instance describing an entry in a zip
- archive to be counted.
-
- @return: the number of chunks present in the zip file. (Even an empty file
- counts as one chunk.)
- @rtype: L{int}
- """
- count, extra = divmod(zipinfo.file_size, chunksize)
- if extra > 0:
- count += 1
- return count or 1
-
-
- def unzipIterChunky(filename, directory=".", overwrite=0, chunksize=4096):
- """
- Return a generator for the zipfile. This implementation will yield after
- every chunksize uncompressed bytes, or at the end of a file, whichever
- comes first.
-
- The value it yields is the number of chunks left to unzip.
- """
- czf = ChunkingZipFile(filename, "r")
- if not os.path.exists(directory):
- os.makedirs(directory)
- remaining = countZipFileChunks(filename, chunksize)
- names = czf.namelist()
- infos = czf.infolist()
-
- for entry, info in zip(names, infos):
- isdir = info.external_attr & DIR_BIT
- f = os.path.join(directory, entry)
- if isdir:
- # overwrite flag only applies to files
- if not os.path.exists(f):
- os.makedirs(f)
- remaining -= 1
- yield remaining
- else:
- # create the directory the file will be in first,
- # since we can't guarantee it exists
- fdir = os.path.split(f)[0]
- if not os.path.exists(fdir):
- os.makedirs(fdir)
- if overwrite or not os.path.exists(f):
- fp = czf.readfile(entry)
- if info.file_size == 0:
- remaining -= 1
- yield remaining
- with open(f, "wb") as outfile:
- while fp.tell() < info.file_size:
- hunk = fp.read(chunksize)
- outfile.write(hunk)
- remaining -= 1
- yield remaining
- else:
- remaining -= countFileChunks(info, chunksize)
- yield remaining
|