Moved to single-threaded mode.
This commit is contained in:
parent
4cb49f2281
commit
91e6014c6c
|
@ -293,16 +293,11 @@ def chunks(l, n):
|
||||||
def update_database(create_csv,
|
def update_database(create_csv,
|
||||||
release_dic,
|
release_dic,
|
||||||
cdnjs_git_path,
|
cdnjs_git_path,
|
||||||
files,
|
files):
|
||||||
poolsize=16):
|
|
||||||
"""Update database for all files in files."""
|
"""Update database for all files in files."""
|
||||||
logging.info("Updating data base")
|
logging.info("Updating data base")
|
||||||
for chunk in chunks(list(files), 200):
|
for chunk in chunks(list(files), 200):
|
||||||
update_database_for_file_chunked(create_csv, release_dic, cdnjs_git_path, chunk)
|
update_database_for_file_chunked(create_csv, release_dic, cdnjs_git_path, chunk)
|
||||||
# with Pool(poolsize) as pool:
|
|
||||||
# pool.map(
|
|
||||||
# partial(update_database_for_file_chunked, create_csv, release_dic,
|
|
||||||
# cdnjs_git_path), chunks(list(files), 200))
|
|
||||||
|
|
||||||
|
|
||||||
def get_release_triple(git_path, libver):
|
def get_release_triple(git_path, libver):
|
||||||
|
@ -314,32 +309,29 @@ def get_release_triple(git_path, libver):
|
||||||
return (lib, ver, date)
|
return (lib, ver, date)
|
||||||
|
|
||||||
|
|
||||||
def build_release_date_dic(git_path, libvers, poolsize=16):
|
def build_release_date_dic(git_path, libvers):
|
||||||
""""Build dictionary of release date with the tuple (library, version) as key."""
|
""""Build dictionary of release date with the tuple (library, version) as key."""
|
||||||
logging.info("Building release dictionary")
|
logging.info("Building release dictionary")
|
||||||
libverdates = []
|
libverdates = []
|
||||||
for libver in libvers:
|
for libver in libvers:
|
||||||
libverdates.append(get_release_triple(git_path, libver))
|
libverdates.append(get_release_triple(git_path, libver))
|
||||||
release_date_dic = {}
|
release_date_dic = {}
|
||||||
# with Pool(poolsize) as pool:
|
|
||||||
# libverdates = pool.map(partial(get_release_triple, git_path), libvers)
|
|
||||||
release_date_dic = {}
|
|
||||||
for (lib, ver, date) in libverdates:
|
for (lib, ver, date) in libverdates:
|
||||||
release_date_dic[(lib, ver)] = date
|
release_date_dic[(lib, ver)] = date
|
||||||
return release_date_dic
|
return release_date_dic
|
||||||
|
|
||||||
|
|
||||||
def pull_and_update_db(cdnjs_git_path, create_csv, poolsize=16):
|
def pull_and_update_db(cdnjs_git_path, create_csv):
|
||||||
"""Pull repo and update database."""
|
"""Pull repo and update database."""
|
||||||
logging.info("Pulling and updating data base")
|
logging.info("Pulling and updating data base")
|
||||||
files, libvers = pull_get_updated_lib_files(cdnjs_git_path)
|
files, libvers = pull_get_updated_lib_files(cdnjs_git_path)
|
||||||
release_dic = build_release_date_dic(cdnjs_git_path, libvers, poolsize)
|
release_dic = build_release_date_dic(cdnjs_git_path, libvers)
|
||||||
del libvers
|
del libvers
|
||||||
gc.collect()
|
gc.collect()
|
||||||
update_database(create_csv, release_dic, cdnjs_git_path, files, poolsize)
|
update_database(create_csv, release_dic, cdnjs_git_path, files)
|
||||||
|
|
||||||
|
|
||||||
def update_db_from_listfile(cdnjs_git_path, listfile, create_csv, poolsize=16):
|
def update_db_from_listfile(cdnjs_git_path, listfile, create_csv):
|
||||||
"""Update database (without pull) for files in listfile)"""
|
"""Update database (without pull) for files in listfile)"""
|
||||||
paths = []
|
paths = []
|
||||||
with open(listfile) as listfileobj:
|
with open(listfile) as listfileobj:
|
||||||
|
@ -352,15 +344,14 @@ def update_db_from_listfile(cdnjs_git_path, listfile, create_csv, poolsize=16):
|
||||||
files = files + path_files
|
files = files + path_files
|
||||||
logging.info("In total, found " + str(len(files)) + " files in " +
|
logging.info("In total, found " + str(len(files)) + " files in " +
|
||||||
str(len(libvers)) + " liberies/versions.")
|
str(len(libvers)) + " liberies/versions.")
|
||||||
release_dic = build_release_date_dic(cdnjs_git_path, libvers, poolsize)
|
release_dic = build_release_date_dic(cdnjs_git_path, libvers)
|
||||||
update_database(create_csv, release_dic, cdnjs_git_path, files, poolsize)
|
update_database(create_csv, release_dic, cdnjs_git_path, files)
|
||||||
|
|
||||||
|
|
||||||
def update_db_all_libs(cdnjs_git_path,
|
def update_db_all_libs(cdnjs_git_path,
|
||||||
create_csv,
|
create_csv,
|
||||||
taskid=1,
|
taskid=1,
|
||||||
maxtaskid=1,
|
maxtaskid=1):
|
||||||
poolsize=16):
|
|
||||||
"""Update database entries for all libs in git repo."""
|
"""Update database entries for all libs in git repo."""
|
||||||
files, libvers = get_all_lib_files(cdnjs_git_path)
|
files, libvers = get_all_lib_files(cdnjs_git_path)
|
||||||
|
|
||||||
|
@ -379,7 +370,7 @@ def update_db_all_libs(cdnjs_git_path,
|
||||||
logging.info("This task has " + str(len(files)) + " files from " +
|
logging.info("This task has " + str(len(files)) + " files from " +
|
||||||
str(len(libvers)) + " library version(s).")
|
str(len(libvers)) + " library version(s).")
|
||||||
|
|
||||||
release_dic = build_release_date_dic(cdnjs_git_path, libvers, poolsize)
|
release_dic = build_release_date_dic(cdnjs_git_path, libvers)
|
||||||
del libvers
|
del libvers
|
||||||
gc.collect()
|
gc.collect()
|
||||||
update_database(create_csv, release_dic, cdnjs_git_path, files, poolsize)
|
update_database(create_csv, release_dic, cdnjs_git_path, files)
|
||||||
|
|
|
@ -34,7 +34,6 @@ def helpmsg():
|
||||||
" -i initialize/update database with all libraries in the repository"
|
" -i initialize/update database with all libraries in the repository"
|
||||||
)
|
)
|
||||||
print(" -u update: pull repository and update database")
|
print(" -u update: pull repository and update database")
|
||||||
print(" -p n update n files in parallel")
|
|
||||||
print(
|
print(
|
||||||
" -l <PATHFILE> read list of libraries to update from file (recusively)"
|
" -l <PATHFILE> read list of libraries to update from file (recusively)"
|
||||||
)
|
)
|
||||||
|
@ -54,7 +53,6 @@ def main(argv):
|
||||||
verbose = False
|
verbose = False
|
||||||
initialize = False
|
initialize = False
|
||||||
update = False
|
update = False
|
||||||
parallel_updates = 5
|
|
||||||
taskid = 1
|
taskid = 1
|
||||||
listfile = None
|
listfile = None
|
||||||
maxtaskid = 1
|
maxtaskid = 1
|
||||||
|
@ -83,8 +81,6 @@ def main(argv):
|
||||||
update = True
|
update = True
|
||||||
elif opt == '-c':
|
elif opt == '-c':
|
||||||
csv = True
|
csv = True
|
||||||
elif opt == "-p":
|
|
||||||
parallel_updates = int(arg)
|
|
||||||
elif opt in ("-n", "--taskid"):
|
elif opt in ("-n", "--taskid"):
|
||||||
taskid = int(arg)
|
taskid = int(arg)
|
||||||
elif opt in ("-N", "--maxtaskid"):
|
elif opt in ("-N", "--maxtaskid"):
|
||||||
|
@ -105,17 +101,15 @@ def main(argv):
|
||||||
|
|
||||||
if initialize:
|
if initialize:
|
||||||
logging.info("Starting update of all db libs")
|
logging.info("Starting update of all db libs")
|
||||||
update_db_all_libs(cdnjs_git_path, csv, taskid, maxtaskid,
|
update_db_all_libs(cdnjs_git_path, csv, taskid, maxtaskid)
|
||||||
parallel_updates)
|
|
||||||
logging.info("Finished update of all db libs")
|
logging.info("Finished update of all db libs")
|
||||||
if update:
|
if update:
|
||||||
logging.info("Starting update of new db libs")
|
logging.info("Starting update of new db libs")
|
||||||
pull_and_update_db(cdnjs_git_path, csv, parallel_updates)
|
pull_and_update_db(cdnjs_git_path, csv)
|
||||||
logging.info("Finished update of new db libs")
|
logging.info("Finished update of new db libs")
|
||||||
if not listfile is None:
|
if not listfile is None:
|
||||||
logging.info("Starting update from list file")
|
logging.info("Starting update from list file")
|
||||||
update_db_from_listfile(cdnjs_git_path, listfile, csv,
|
update_db_from_listfile(cdnjs_git_path, listfile, csv)
|
||||||
parallel_updates)
|
|
||||||
logging.info("Finished update from list file")
|
logging.info("Finished update from list file")
|
||||||
|
|
||||||
logging.info("Successfully updated cdnjs table")
|
logging.info("Successfully updated cdnjs table")
|
||||||
|
|
|
@ -17,5 +17,5 @@ $SING_EXEC git pull >> $LOG
|
||||||
# $SING_EXEC pip3 install --system -e ../ExtensionCrawler
|
# $SING_EXEC pip3 install --system -e ../ExtensionCrawler
|
||||||
|
|
||||||
# Update cdnjs git repository and update cdnjs data base table
|
# Update cdnjs git repository and update cdnjs data base table
|
||||||
$SING_EXEC ./cdnjs-git-miner -v -p 1 -u -a /opt/archive >> $LOG
|
$SING_EXEC ./cdnjs-git-miner -v -u -a /opt/archive >> $LOG
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue