diff --git a/create-db b/create-db index 6aedae5..017ede2 100755 --- a/create-db +++ b/create-db @@ -34,19 +34,21 @@ from ExtensionCrawler.util import log_info, log_warning, log_error, log_exceptio def help(): - print("create-db [OPTION]") - print(" -h print this help text") - print(" -a archive directory") - print(" -p three-letter-prefix") - print(" -e file with extension ids") - print(" --from-date only process information gathered after this date") - print(" (compared lexographically)") - print(" -t number of parallel threads") - print(" -n process chunk n where n in [1,N]") - print(" -N ") + print("""create-db [OPTION]""") + print(""" -h print this help text""") + print(""" -a archive directory""") + print(""" -p three-letter-prefix""") + print(""" -e file with extension ids""") + print(""" --from-date only process information gathered after""" + """ this date (compared lexographically)""") + print(""" --until-date only process information gathered before""" + """ this date (compared lexographically)""") + print(""" -t number of parallel threads""") + print(""" -n process chunk n where n in [1,N]""") + print(""" -N """) -def process_id(from_date, path): +def process_id(from_date, until_date, path): start = time.time() with tempfile.TemporaryDirectory() as tmpdir: with tarfile.open(path) as t: @@ -57,7 +59,8 @@ def process_id(from_date, path): iddir = os.path.join(tmpdir, extid) for date in sorted(os.listdir(iddir)): - if from_date is not None and date < from_date: + if (from_date is not None and date < from_date) or \ + (until_date is not None and date > until_date): log_info("* Skipping {}".format(date), 2, extid) continue try: @@ -97,13 +100,14 @@ def parse_args(argv): taskid = 1 maxtaskid = 1 from_date = None + until_date = None paths = [] try: opts, args = getopt.getopt(argv, "ha:p:e:t:n:N:", [ "archive=", "prefix=", "extidlistfile=", "threads=", "taskid=", - "maxtaskid=", "from-date=", "help" + "maxtaskid=", "from-date=", "until-date=", "help" ]) except getopt.GetoptError: help() @@ -126,6 +130,8 @@ def parse_args(argv): maxtaskid = int(arg) elif opt in ("--from-date"): from_date = arg + elif opt in ("--until-date"): + until_date = arg if paths == []: paths = list(find(archive, "*")) @@ -136,16 +142,16 @@ def parse_args(argv): else: paths = paths[(taskid - 1) * chunksize:taskid * chunksize] - return paths, parallel, from_date + return paths, parallel, from_date, until_date def main(argv): logging.basicConfig(level=logging.INFO, format=const_log_format()) - paths, parallel, from_date = parse_args(argv) + paths, parallel, from_date, until_date = parse_args(argv) with Pool(processes=parallel) as p: - p.map(partial(process_id, from_date), paths) + p.map(partial(process_id, from_date, until_date), paths) if __name__ == "__main__":