#!/usr/bin/env python3
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see https://www.gnu.org/licenses/.
#
"""Rebuild per-extension SQLite databases from extension archive tarballs."""

import getopt
import os
import sys
import glob
import tarfile
import tempfile
import traceback
from functools import partial
from multiprocessing import Pool

# NOTE(review): logmsg, archive_file and update_sqlite_incremental are not
# defined in this file; they are expected to come from these star imports.
from ExtensionCrawler.archive import *
from ExtensionCrawler.config import *


def help():
    """Print the command-line usage summary to stdout."""
    print("create-db [OPTION] DBBASEDIR")
    print("    DBBASEDIR                   directory for generated db files")
    print("    -h                          print this help text")
    print("    -a DIR, --archive=DIR       archive directory")
    print("    -p PREFIX, --prefix=PREFIX  three-letter-prefix")
    print("    -t N, --threads=N           number of parallel threads")


def process_id(archivedir, dbbasedir, verbose, ext_id):
    """Recreate the SQLite database of a single extension from its archive.

    Any existing ``<dbbasedir>/<ext_id>.sqlite`` is deleted first. The tar
    file located via ``archive_file(archivedir, ext_id)`` is unpacked into a
    temporary directory, and ``update_sqlite_incremental`` is called once for
    every entry of the extracted ``<ext_id>`` directory, in sorted order.

    Args:
        archivedir: Directory handed to ``archive_file`` to locate the tar.
        dbbasedir: Directory in which the ``.sqlite`` file is written.
        verbose: Forwarded to ``logmsg`` (presumably controls whether
            messages are recorded -- confirm against ``logmsg``).
        ext_id: Identifier of the extension to process.

    Returns:
        The log text accumulated through ``logmsg``.

    Raises:
        Whatever ``tarfile.open`` / ``extractall`` / ``os.listdir`` raise;
        only failures of the per-entry update are caught and logged.
    """
    txt = ""
    txt = logmsg(verbose, txt, "Processing {} ...\n".format(ext_id))

    tarpath = archive_file(archivedir, ext_id)
    dbpath = os.path.join(dbbasedir, ext_id + ".sqlite")
    # Start from scratch so the incremental updates rebuild the whole db.
    if os.path.exists(dbpath):
        os.remove(dbpath)

    with tempfile.TemporaryDirectory() as tmpdir:
        # NOTE(review): extractall() trusts member paths inside the tar;
        # confirm the archives are produced by a trusted source.
        with tarfile.open(tarpath) as t:
            t.extractall(tmpdir)

        iddir = os.path.join(tmpdir, ext_id)
        for date in sorted(os.listdir(iddir)):
            try:
                update_txt = update_sqlite_incremental(
                    dbpath, iddir, ext_id, date, True, "")
                txt = logmsg(verbose, txt, update_txt)
            except Exception:
                # Best effort: record the failure and continue with the
                # next entry instead of aborting the whole extension.
                txt = logmsg(
                    verbose, txt,
                    "Exception when handling {} on {}:\n".format(
                        ext_id, date))
                txt = logmsg(verbose, txt, traceback.format_exc())

    txt = logmsg(verbose, txt, "\n")
    return txt


def main(argv):
    """Parse command-line options and rebuild the databases in parallel.

    Args:
        argv: Command-line arguments without the program name.

    Exits with status 2 (after printing the usage text) on unknown options,
    an invalid thread count, or a missing DBBASEDIR argument.
    """
    basedir = "archive"
    prefix = ""
    parallel = 8

    try:
        opts, args = getopt.getopt(argv, "ha:p:t:",
                                   ["archive=", "prefix=", "threads="])
    except getopt.GetoptError:
        help()
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            help()
            sys.exit()
        elif opt in ("-a", "--archive"):
            basedir = arg
        elif opt in ("-p", "--prefix"):
            prefix = arg
        elif opt in ("-t", "--threads"):
            # Treat a non-numeric or non-positive count as a usage error
            # rather than crashing here or later inside Pool().
            try:
                parallel = int(arg)
            except ValueError:
                help()
                sys.exit(2)
            if parallel < 1:
                help()
                sys.exit(2)

    if len(args) < 1:
        help()
        sys.exit(2)

    dbbasedir = args[0]
    archivedir = os.path.join(basedir, "data")

    # The glob may also match plain files; os.listdir() needs directories.
    threeletterdirs = [
        path for path in glob.glob(os.path.join(archivedir, prefix + "*"))
        if os.path.isdir(path)
    ]

    worker = partial(process_id, archivedir, dbbasedir, True)
    # One pool is shared by all prefix directories instead of being
    # re-created for each of them.
    with Pool(parallel) as p:
        for threeletterdir in threeletterdirs:
            # Entries are cut to their first 32 characters (presumably the
            # extension id -- confirm) and de-duplicated; sorting makes the
            # processing order deterministic.
            ext_ids = sorted({d[:32] for d in os.listdir(threeletterdir)})
            for txt in p.imap(worker, ext_ids):
                sys.stdout.write(txt)
                sys.stdout.flush()


if __name__ == "__main__":
    main(sys.argv[1:])