#!/usr/bin/env python3.6
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
"""extfind: print the paths of extension archives, one per line.

The archives are looked up below ``<archive>/data/<first three id chars>/``
as ``<extension id>.tar``.  The set of extensions is either every archive
matching a glob on the extension id, or the ids listed in a file.  The
resulting list can be cut into N chunks, of which chunk n is printed.
"""

import getopt
import glob
import os
import sys
import logging
import re

from ExtensionCrawler import config

# An extension id is exactly 32 characters from the range a-p.
_EXTID_RE = re.compile(r"[a-p]{32}")


def print_help():
    """Print the command-line usage summary to stdout."""
    print("""extfind [OPTION]""")
    print(""" -h print this help text""")
    print(""" -a archive directory""")
    print(""" -g glob on the extension id, don't use with -e """)
    print(""" -e file with extension ids, don't use with -g""")
    print(""" -n process chunk n where n in [1,N]""")
    # The description of -N was empty; it is the N referred to by -n.
    print(""" -N total number of chunks N""")


def split(l, n, N):
    """Return chunk ``n`` (1-based) of ``l`` cut into ``N`` consecutive chunks.

    Every chunk has ``len(l) // N + 1`` elements, except that trailing chunks
    may be shorter or empty once the list is exhausted.

    Raises:
        ValueError: if ``n`` is not in ``[1, N]``.
    """
    if n < 1 or n > N:
        raise ValueError("n must be between 1 and N")
    chunksize = len(l) // N + 1
    # Slicing beyond the list contents returns the empty list
    return l[chunksize * (n - 1):chunksize * n]


def iter_extension_paths_from_file(archive, n, N, extidlistfile):
    """Return chunk ``n`` of ``N`` of the archive paths for the listed ids.

    ``extidlistfile`` is read as text with one extension id per line.  An id
    is kept only if it is 32 characters from a-p and its ``.tar`` archive
    exists below ``archive``; anything else is reported with a warning.
    Blank lines are skipped.  Despite the name, a list is returned.
    """
    paths = []
    with open(extidlistfile, 'r') as f:
        for line in f:
            # Lines read from a file keep their trailing newline; without
            # stripping it no id would ever pass the fullmatch below and the
            # newline would end up inside the path.
            extid = line.strip()
            if not extid:
                continue
            path = os.path.join(archive, "data", extid[:3], extid + ".tar")
            if _EXTID_RE.fullmatch(extid) and os.path.exists(path):
                paths.append(path)
            else:
                logging.warning(
                    "WARNING: {} is not a valid extension path!".format(path))
    return split(paths, n, N)


def iter_extension_paths(archive, n, N, extidglob="[a-p]"*32):
    """Return chunk ``n`` of ``N`` of the archive paths matching ``extidglob``.

    ``extidglob`` is a glob pattern applied to the extension id part of the
    archive file name; the default matches every 32-character a-p id.
    Despite the name, a list is returned.
    """
    pattern = os.path.join(archive, "data", "[a-p]" * 3, extidglob + ".tar")
    # glob returns matches in arbitrary order.  Sorting gives every invocation
    # the same ordering, so runs with different n partition the same list.
    paths = sorted(glob.glob(pattern))
    return split(paths, n, N)


def _parse_task_number(arg):
    """Convert a -n/-N argument to int; print help and exit(2) if invalid."""
    try:
        return int(arg)
    except ValueError:
        print_help()
        sys.exit(2)


def main(argv):
    """Parse ``argv`` (without the program name) and print the selected paths.

    Exits with status 2 after printing the help text on invalid options, on a
    non-integer -n/-N value, or when both -g and -e are given.
    """
    archive = config.const_basedir()
    extidglob = None
    extidlistfile = None
    taskid = 1
    maxtaskid = 1

    try:
        opts, args = getopt.getopt(argv, "ha:g:e:n:N:", [
            "archive=", "glob=", "extidlistfile=", "taskid=", "maxtaskid=",
            "help"
        ])
    except getopt.GetoptError:
        print_help()
        sys.exit(2)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print_help()
            sys.exit()
        elif opt in ("-a", "--archive"):
            archive = arg
        elif opt in ("-g", "--glob"):
            extidglob = arg
        elif opt in ("-e", "--extidlistfile"):
            extidlistfile = arg
        elif opt in ("-n", "--taskid"):
            taskid = _parse_task_number(arg)
        elif opt in ("-N", "--maxtaskid"):
            maxtaskid = _parse_task_number(arg)

    if extidglob is not None and extidlistfile is not None:
        # -g and -e are mutually exclusive.
        print_help()
        sys.exit(2)

    if extidlistfile is not None:
        paths = iter_extension_paths_from_file(archive, taskid, maxtaskid,
                                               extidlistfile)
    elif extidglob is not None:
        paths = iter_extension_paths(archive, taskid, maxtaskid, extidglob)
    else:
        paths = iter_extension_paths(archive, taskid, maxtaskid)

    for path in paths:
        print(path)


if __name__ == "__main__":
    main(sys.argv[1:])