# Copyright (c) 2015, Google Inc. # # Permission to use, copy, modify, and/or distribute this software for any # purpose with or without fee is hereby granted, provided that the above # copyright notice and this permission notice appear in all copies. # # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY # SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION # OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. """Extracts archives.""" import hashlib import optparse import os import os.path import tarfile import shutil import sys import zipfile def CheckedJoin(output, path): """ CheckedJoin returns os.path.join(output, path). It does sanity checks to ensure the resulting path is under output, but shouldn't be used on untrusted input. """ path = os.path.normpath(path) if os.path.isabs(path) or path.startswith('.'): raise ValueError(path) return os.path.join(output, path) class FileEntry(object): def __init__(self, path, mode, fileobj): self.path = path self.mode = mode self.fileobj = fileobj class SymlinkEntry(object): def __init__(self, path, mode, target): self.path = path self.mode = mode self.target = target def IterateZip(path): """ IterateZip opens the zip file at path and returns a generator of entry objects for each file in it. """ with zipfile.ZipFile(path, 'r') as zip_file: for info in zip_file.infolist(): if info.filename.endswith('/'): continue yield FileEntry(info.filename, None, zip_file.open(info)) def IterateTar(path, compression): """ IterateTar opens the tar.gz or tar.bz2 file at path and returns a generator of entry objects for each file in it. """ with tarfile.open(path, 'r:' + compression) as tar_file: for info in tar_file: if info.isdir(): pass elif info.issym(): yield SymlinkEntry(info.name, None, info.linkname) elif info.isfile(): yield FileEntry(info.name, info.mode, tar_file.extractfile(info)) else: raise ValueError('Unknown entry type "%s"' % (info.name, )) def main(args): parser = optparse.OptionParser(usage='Usage: %prog ARCHIVE OUTPUT') parser.add_option('--no-prefix', dest='no_prefix', action='store_true', help='Do not remove a prefix from paths in the archive.') options, args = parser.parse_args(args) if len(args) != 2: parser.print_help() return 1 archive, output = args if not os.path.exists(archive): # Skip archives that weren't downloaded. return 0 with open(archive) as f: sha256 = hashlib.sha256() while True: chunk = f.read(1024 * 1024) if not chunk: break sha256.update(chunk) digest = sha256.hexdigest() stamp_path = os.path.join(output, ".dawn_archive_digest") if os.path.exists(stamp_path): with open(stamp_path) as f: if f.read().strip() == digest: print "Already up-to-date." return 0 if archive.endswith('.zip'): entries = IterateZip(archive) elif archive.endswith('.tar.gz'): entries = IterateTar(archive, 'gz') elif archive.endswith('.tar.bz2'): entries = IterateTar(archive, 'bz2') else: raise ValueError(archive) try: if os.path.exists(output): print "Removing %s" % (output, ) shutil.rmtree(output) print "Extracting %s to %s" % (archive, output) prefix = None num_extracted = 0 for entry in entries: # Even on Windows, zip files must always use forward slashes. if '\\' in entry.path or entry.path.startswith('/'): raise ValueError(entry.path) if not options.no_prefix: new_prefix, rest = entry.path.split('/', 1) # Ensure the archive is consistent. if prefix is None: prefix = new_prefix if prefix != new_prefix: raise ValueError((prefix, new_prefix)) else: rest = entry.path # Extract the file into the output directory. fixed_path = CheckedJoin(output, rest) if not os.path.isdir(os.path.dirname(fixed_path)): os.makedirs(os.path.dirname(fixed_path)) if isinstance(entry, FileEntry): with open(fixed_path, 'wb') as out: shutil.copyfileobj(entry.fileobj, out) elif isinstance(entry, SymlinkEntry): os.symlink(entry.target, fixed_path) else: raise TypeError('unknown entry type') # Fix up permissions if needbe. # TODO(davidben): To be extra tidy, this should only track the execute bit # as in git. if entry.mode is not None: os.chmod(fixed_path, entry.mode) # Print every 100 files, so bots do not time out on large archives. num_extracted += 1 if num_extracted % 100 == 0: print "Extracted %d files..." % (num_extracted, ) finally: entries.close() with open(stamp_path, 'w') as f: f.write(digest) print "Done. Extracted %d files." % (num_extracted, ) return 0 if __name__ == '__main__': sys.exit(main(sys.argv[1:]))