Moved to single-threaded mode.

This commit is contained in:
Achim D. Brucker 2017-11-12 14:07:25 +00:00
parent 4cb49f2281
commit 91e6014c6c
3 changed files with 15 additions and 30 deletions

View File

@ -293,16 +293,11 @@ def chunks(l, n):
def update_database(create_csv, def update_database(create_csv,
release_dic, release_dic,
cdnjs_git_path, cdnjs_git_path,
files, files):
poolsize=16):
"""Update database for all files in files.""" """Update database for all files in files."""
logging.info("Updating data base") logging.info("Updating data base")
for chunk in chunks(list(files), 200): for chunk in chunks(list(files), 200):
update_database_for_file_chunked(create_csv, release_dic, cdnjs_git_path, chunk) update_database_for_file_chunked(create_csv, release_dic, cdnjs_git_path, chunk)
# with Pool(poolsize) as pool:
# pool.map(
# partial(update_database_for_file_chunked, create_csv, release_dic,
# cdnjs_git_path), chunks(list(files), 200))
def get_release_triple(git_path, libver): def get_release_triple(git_path, libver):
@ -314,32 +309,29 @@ def get_release_triple(git_path, libver):
return (lib, ver, date) return (lib, ver, date)
def build_release_date_dic(git_path, libvers, poolsize=16): def build_release_date_dic(git_path, libvers):
""""Build dictionary of release date with the tuple (library, version) as key.""" """"Build dictionary of release date with the tuple (library, version) as key."""
logging.info("Building release dictionary") logging.info("Building release dictionary")
libverdates = [] libverdates = []
for libver in libvers: for libver in libvers:
libverdates.append(get_release_triple(git_path, libver)) libverdates.append(get_release_triple(git_path, libver))
release_date_dic = {} release_date_dic = {}
# with Pool(poolsize) as pool:
# libverdates = pool.map(partial(get_release_triple, git_path), libvers)
release_date_dic = {}
for (lib, ver, date) in libverdates: for (lib, ver, date) in libverdates:
release_date_dic[(lib, ver)] = date release_date_dic[(lib, ver)] = date
return release_date_dic return release_date_dic
def pull_and_update_db(cdnjs_git_path, create_csv, poolsize=16): def pull_and_update_db(cdnjs_git_path, create_csv):
"""Pull repo and update database.""" """Pull repo and update database."""
logging.info("Pulling and updating data base") logging.info("Pulling and updating data base")
files, libvers = pull_get_updated_lib_files(cdnjs_git_path) files, libvers = pull_get_updated_lib_files(cdnjs_git_path)
release_dic = build_release_date_dic(cdnjs_git_path, libvers, poolsize) release_dic = build_release_date_dic(cdnjs_git_path, libvers)
del libvers del libvers
gc.collect() gc.collect()
update_database(create_csv, release_dic, cdnjs_git_path, files, poolsize) update_database(create_csv, release_dic, cdnjs_git_path, files)
def update_db_from_listfile(cdnjs_git_path, listfile, create_csv, poolsize=16): def update_db_from_listfile(cdnjs_git_path, listfile, create_csv):
"""Update database (without pull) for files in listfile)""" """Update database (without pull) for files in listfile)"""
paths = [] paths = []
with open(listfile) as listfileobj: with open(listfile) as listfileobj:
@ -352,15 +344,14 @@ def update_db_from_listfile(cdnjs_git_path, listfile, create_csv, poolsize=16):
files = files + path_files files = files + path_files
logging.info("In total, found " + str(len(files)) + " files in " + logging.info("In total, found " + str(len(files)) + " files in " +
str(len(libvers)) + " liberies/versions.") str(len(libvers)) + " liberies/versions.")
release_dic = build_release_date_dic(cdnjs_git_path, libvers, poolsize) release_dic = build_release_date_dic(cdnjs_git_path, libvers)
update_database(create_csv, release_dic, cdnjs_git_path, files, poolsize) update_database(create_csv, release_dic, cdnjs_git_path, files)
def update_db_all_libs(cdnjs_git_path, def update_db_all_libs(cdnjs_git_path,
create_csv, create_csv,
taskid=1, taskid=1,
maxtaskid=1, maxtaskid=1):
poolsize=16):
"""Update database entries for all libs in git repo.""" """Update database entries for all libs in git repo."""
files, libvers = get_all_lib_files(cdnjs_git_path) files, libvers = get_all_lib_files(cdnjs_git_path)
@ -379,7 +370,7 @@ def update_db_all_libs(cdnjs_git_path,
logging.info("This task has " + str(len(files)) + " files from " + logging.info("This task has " + str(len(files)) + " files from " +
str(len(libvers)) + " library version(s).") str(len(libvers)) + " library version(s).")
release_dic = build_release_date_dic(cdnjs_git_path, libvers, poolsize) release_dic = build_release_date_dic(cdnjs_git_path, libvers)
del libvers del libvers
gc.collect() gc.collect()
update_database(create_csv, release_dic, cdnjs_git_path, files, poolsize) update_database(create_csv, release_dic, cdnjs_git_path, files)

View File

@ -34,7 +34,6 @@ def helpmsg():
" -i initialize/update database with all libraries in the repository" " -i initialize/update database with all libraries in the repository"
) )
print(" -u update: pull repository and update database") print(" -u update: pull repository and update database")
print(" -p n update n files in parallel")
print( print(
" -l <PATHFILE> read list of libraries to update from file (recusively)" " -l <PATHFILE> read list of libraries to update from file (recusively)"
) )
@ -54,7 +53,6 @@ def main(argv):
verbose = False verbose = False
initialize = False initialize = False
update = False update = False
parallel_updates = 5
taskid = 1 taskid = 1
listfile = None listfile = None
maxtaskid = 1 maxtaskid = 1
@ -83,8 +81,6 @@ def main(argv):
update = True update = True
elif opt == '-c': elif opt == '-c':
csv = True csv = True
elif opt == "-p":
parallel_updates = int(arg)
elif opt in ("-n", "--taskid"): elif opt in ("-n", "--taskid"):
taskid = int(arg) taskid = int(arg)
elif opt in ("-N", "--maxtaskid"): elif opt in ("-N", "--maxtaskid"):
@ -105,17 +101,15 @@ def main(argv):
if initialize: if initialize:
logging.info("Starting update of all db libs") logging.info("Starting update of all db libs")
update_db_all_libs(cdnjs_git_path, csv, taskid, maxtaskid, update_db_all_libs(cdnjs_git_path, csv, taskid, maxtaskid)
parallel_updates)
logging.info("Finished update of all db libs") logging.info("Finished update of all db libs")
if update: if update:
logging.info("Starting update of new db libs") logging.info("Starting update of new db libs")
pull_and_update_db(cdnjs_git_path, csv, parallel_updates) pull_and_update_db(cdnjs_git_path, csv)
logging.info("Finished update of new db libs") logging.info("Finished update of new db libs")
if not listfile is None: if not listfile is None:
logging.info("Starting update from list file") logging.info("Starting update from list file")
update_db_from_listfile(cdnjs_git_path, listfile, csv, update_db_from_listfile(cdnjs_git_path, listfile, csv)
parallel_updates)
logging.info("Finished update from list file") logging.info("Finished update from list file")
logging.info("Successfully updated cdnjs table") logging.info("Successfully updated cdnjs table")

View File

@ -17,5 +17,5 @@ $SING_EXEC git pull >> $LOG
# $SING_EXEC pip3 install --system -e ../ExtensionCrawler # $SING_EXEC pip3 install --system -e ../ExtensionCrawler
# Update cdnjs git repository and update cdnjs data base table # Update cdnjs git repository and update cdnjs data base table
$SING_EXEC ./cdnjs-git-miner -v -p 1 -u -a /opt/archive >> $LOG $SING_EXEC ./cdnjs-git-miner -v -u -a /opt/archive >> $LOG