From 91e6014c6ce4117b49c66703d7f65a65e95820a8 Mon Sep 17 00:00:00 2001 From: "Achim D. Brucker" Date: Sun, 12 Nov 2017 14:07:25 +0000 Subject: [PATCH] Moved to single-threaded mode. --- ExtensionCrawler/cdnjs_git.py | 31 +++++++++++-------------------- cdnjs-git-miner | 12 +++--------- scripts/update_cdnjs.sh | 2 +- 3 files changed, 15 insertions(+), 30 deletions(-) diff --git a/ExtensionCrawler/cdnjs_git.py b/ExtensionCrawler/cdnjs_git.py index e939c2b..2b9902d 100644 --- a/ExtensionCrawler/cdnjs_git.py +++ b/ExtensionCrawler/cdnjs_git.py @@ -293,16 +293,11 @@ def chunks(l, n): def update_database(create_csv, release_dic, cdnjs_git_path, - files, - poolsize=16): + files): """Update database for all files in files.""" logging.info("Updating data base") for chunk in chunks(list(files), 200): update_database_for_file_chunked(create_csv, release_dic, cdnjs_git_path, chunk) -# with Pool(poolsize) as pool: -# pool.map( -# partial(update_database_for_file_chunked, create_csv, release_dic, -# cdnjs_git_path), chunks(list(files), 200)) def get_release_triple(git_path, libver): @@ -314,32 +309,29 @@ def get_release_triple(git_path, libver): return (lib, ver, date) -def build_release_date_dic(git_path, libvers, poolsize=16): +def build_release_date_dic(git_path, libvers): """"Build dictionary of release date with the tuple (library, version) as key.""" logging.info("Building release dictionary") libverdates = [] for libver in libvers: libverdates.append(get_release_triple(git_path, libver)) release_date_dic = {} -# with Pool(poolsize) as pool: -# libverdates = pool.map(partial(get_release_triple, git_path), libvers) - release_date_dic = {} for (lib, ver, date) in libverdates: release_date_dic[(lib, ver)] = date return release_date_dic -def pull_and_update_db(cdnjs_git_path, create_csv, poolsize=16): +def pull_and_update_db(cdnjs_git_path, create_csv): """Pull repo and update database.""" logging.info("Pulling and updating data base") files, libvers = pull_get_updated_lib_files(cdnjs_git_path) - release_dic = build_release_date_dic(cdnjs_git_path, libvers, poolsize) + release_dic = build_release_date_dic(cdnjs_git_path, libvers) del libvers gc.collect() - update_database(create_csv, release_dic, cdnjs_git_path, files, poolsize) + update_database(create_csv, release_dic, cdnjs_git_path, files) -def update_db_from_listfile(cdnjs_git_path, listfile, create_csv, poolsize=16): +def update_db_from_listfile(cdnjs_git_path, listfile, create_csv): """Update database (without pull) for files in listfile)""" paths = [] with open(listfile) as listfileobj: @@ -352,15 +344,14 @@ def update_db_from_listfile(cdnjs_git_path, listfile, create_csv, poolsize=16): files = files + path_files logging.info("In total, found " + str(len(files)) + " files in " + str(len(libvers)) + " liberies/versions.") - release_dic = build_release_date_dic(cdnjs_git_path, libvers, poolsize) - update_database(create_csv, release_dic, cdnjs_git_path, files, poolsize) + release_dic = build_release_date_dic(cdnjs_git_path, libvers) + update_database(create_csv, release_dic, cdnjs_git_path, files) def update_db_all_libs(cdnjs_git_path, create_csv, taskid=1, - maxtaskid=1, - poolsize=16): + maxtaskid=1): """Update database entries for all libs in git repo.""" files, libvers = get_all_lib_files(cdnjs_git_path) @@ -379,7 +370,7 @@ def update_db_all_libs(cdnjs_git_path, logging.info("This task has " + str(len(files)) + " files from " + str(len(libvers)) + " library version(s).") - release_dic = build_release_date_dic(cdnjs_git_path, libvers, poolsize) + release_dic = build_release_date_dic(cdnjs_git_path, libvers) del libvers gc.collect() - update_database(create_csv, release_dic, cdnjs_git_path, files, poolsize) + update_database(create_csv, release_dic, cdnjs_git_path, files) diff --git a/cdnjs-git-miner b/cdnjs-git-miner index 66bfd86..ea139e2 100755 --- a/cdnjs-git-miner +++ b/cdnjs-git-miner @@ -34,7 +34,6 @@ def helpmsg(): " -i initialize/update database with all libraries in the repository" ) print(" -u update: pull repository and update database") - print(" -p n update n files in parallel") print( " -l read list of libraries to update from file (recusively)" ) @@ -54,7 +53,6 @@ def main(argv): verbose = False initialize = False update = False - parallel_updates = 5 taskid = 1 listfile = None maxtaskid = 1 @@ -83,8 +81,6 @@ def main(argv): update = True elif opt == '-c': csv = True - elif opt == "-p": - parallel_updates = int(arg) elif opt in ("-n", "--taskid"): taskid = int(arg) elif opt in ("-N", "--maxtaskid"): @@ -105,17 +101,15 @@ def main(argv): if initialize: logging.info("Starting update of all db libs") - update_db_all_libs(cdnjs_git_path, csv, taskid, maxtaskid, - parallel_updates) + update_db_all_libs(cdnjs_git_path, csv, taskid, maxtaskid) logging.info("Finished update of all db libs") if update: logging.info("Starting update of new db libs") - pull_and_update_db(cdnjs_git_path, csv, parallel_updates) + pull_and_update_db(cdnjs_git_path, csv) logging.info("Finished update of new db libs") if not listfile is None: logging.info("Starting update from list file") - update_db_from_listfile(cdnjs_git_path, listfile, csv, - parallel_updates) + update_db_from_listfile(cdnjs_git_path, listfile, csv) logging.info("Finished update from list file") logging.info("Successfully updated cdnjs table") diff --git a/scripts/update_cdnjs.sh b/scripts/update_cdnjs.sh index bb4c61a..5542777 100755 --- a/scripts/update_cdnjs.sh +++ b/scripts/update_cdnjs.sh @@ -17,5 +17,5 @@ $SING_EXEC git pull >> $LOG # $SING_EXEC pip3 install --system -e ../ExtensionCrawler # Update cdnjs git repository and update cdnjs data base table -$SING_EXEC ./cdnjs-git-miner -v -p 1 -u -a /opt/archive >> $LOG +$SING_EXEC ./cdnjs-git-miner -v -u -a /opt/archive >> $LOG