#!/usr/bin/env python3.7
#
# Copyright (C) 2017-2018 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
#
# SPDX-License-Identifier: GPL-3.0-or-later
"""Tool for extracting crx file from a tar archive."""

import os
import sys
import glob
import getopt
import tarfile
import datetime
import dateutil
import dateutil.parser

from ExtensionCrawler.archive import last_crx, get_local_archive_dir
from ExtensionCrawler.config import const_basedir


def helpmsg():
    """Print help message."""
    print("crx-extract [OPTION] extid")
    print(" -h print this help text")
    print(" -s silent (no log messages)")
    print(" -e use etag instead of date in output")
    print(" -w avoid ':' in filenames (useful on Windows)")
    print(" -d= date")
    print(" -o= output directory")
    print(" -a= archive directory")


def get_tarinfo(members, name, winfs=False, etag=None):
    """Select tarinfo object with a specified path/name.

    Iterates over ``members`` and yields every entry whose ``name`` attribute
    equals ``name``.  The yielded objects are modified in place so that they
    are extracted under a different path:

    * if ``winfs`` is true, every ':' in the name is replaced by '-';
    * if ``etag`` is not None, the directory directly containing the file
      is replaced by ``etag``.
    """
    for tarinfo in members:
        if tarinfo.name == name:
            if winfs:
                tarinfo.name = name.replace(":", "-")
            if etag is not None:
                # Drop the innermost directory component and put the etag
                # in its place, keeping the file name itself.
                (path, crx) = os.path.split(tarinfo.name)
                (path, _) = os.path.split(path)
                tarinfo.name = os.path.join(path, etag, crx)
            yield tarinfo


def _extract_member(tar, mode, name, output, winfs, etag):
    """Extract member ``name`` of ``tar`` into ``output``.

    Returns True if at least one matching member was found (and therefore
    extracted), False otherwise.
    """
    with tarfile.open(tar, mode) as archive:
        # TarFile.extractall() returns None, so the selection is materialised
        # first to be able to tell whether anything matched.
        selected = list(get_tarinfo(archive, name, winfs, etag))
        archive.extractall(path=output, members=selected)
    return bool(selected)


def main(argv):
    """Main function of the extension crawler.

    Parses the command line options in ``argv`` (without the program name),
    determines the CRX to extract via ``last_crx`` and extracts it from the
    extension's tar archive, falling back to the compressed
    ``.NNN.tar.xz`` archives (highest number first) until it is found.
    """
    basedir = const_basedir()
    verbose = True
    date = None
    useetag = False
    output = ""
    winfs = False
    try:
        # "etag" and "winfs" are declared so that the --etag/--winfs
        # spellings handled below are actually accepted by getopt.
        opts, args = getopt.getopt(
            argv, "hsed:a:o:w",
            ["date=", "archive=", "output=", "etag", "winfs"])
    except getopt.GetoptError:
        helpmsg()
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            helpmsg()
            sys.exit()
        elif opt in ("-a", "--archive"):
            basedir = arg
        elif opt in ("-d", "--date"):
            date = arg
        elif opt in ("-o", "--output"):
            output = arg
        elif opt in ("-w", "--winfs"):
            winfs = True
        elif opt in ("-e", "--etag"):
            useetag = True
        elif opt == '-s':
            verbose = False

    if args:
        extid = args[0]
    else:
        helpmsg()
        sys.exit()

    datadir = os.path.join(basedir, "data")
    if date is not None:
        dateobj = dateutil.parser.parse(date)
        # Dates given without a time zone are interpreted as UTC.
        if dateobj.tzinfo is None or dateobj.tzinfo.utcoffset(dateobj) is None:
            dateobj = dateobj.replace(tzinfo=datetime.timezone.utc)
        last, etag = last_crx(datadir, extid, dateobj)
    else:
        last, etag = last_crx(datadir, extid)

    if not useetag:
        etag = None
    basetar = os.path.join(datadir, get_local_archive_dir(extid), extid)
    tar = basetar + ".tar"

    if last != "":
        if os.path.exists(tar):
            # The plain tar is tried first, then the compressed archives
            # starting with the highest sequence number.
            candidates = [(tar, 'r')]
            archivetars = sorted(glob.glob(basetar + ".[0-9][0-9][0-9].tar.xz"))
            candidates.extend((path, 'r:xz') for path in reversed(archivetars))
            for path, mode in candidates:
                if verbose:
                    print("Extracting " + os.path.join(output, last) +
                          " from " + path)
                if _extract_member(path, mode, last, output, winfs, etag):
                    # Found and extracted: no need to scan older archives.
                    break
        elif verbose:
            print("Cannot find archive " + tar)
    elif verbose:
        if os.path.exists(tar):
            print("CRX not in archive " + tar)
        else:
            print("CRX does not exist: cannot find archive " + tar)


if __name__ == "__main__":
    main(sys.argv[1:])