Added csv output for debugging.

This commit is contained in:
Achim D. Brucker 2017-09-16 13:21:49 +01:00
parent de6dde5269
commit c274b96f66
2 changed files with 54 additions and 22 deletions

View File

@ -29,6 +29,8 @@ import zlib
from functools import partial, reduce
from io import StringIO
from multiprocessing import Pool
import csv
import sys
import cchardet as chardet
import dateutil.parser
@ -296,23 +298,46 @@ def get_all_lib_files(cdnjs_git_path):
return files, list(libvers)
def update_database_for_file(release_dic, cdnjs_git_path, filename):
def update_database_for_file(create_csv, release_dic, cdnjs_git_path,
filename):
"""Update database for all file."""
if os.path.isfile(filename):
logging.info("Updating database for file " + filename)
file_info = get_file_libinfo(release_dic, cdnjs_git_path, filename)
if not file_info is None:
## TODO
logging.info("Updating database ...")
if create_csv:
print(file_info['path'])
print(cdnjs_git_path)
file_info['path'] = re.sub(r'^.*\/ajax\/', 'ajax/',
file_info['path'])
for key in [
'md5', 'sha1', 'sha256', 'normalized_md5',
'normalized_sha1', 'normalized_sha256',
'dec_normalized_md5', 'dec_normalized_sha1',
'dec_normalized_sha256', 'dec_md5', 'dec_sha1',
'dec_sha256'
]:
if not file_info[key] is None:
file_info[key] = (file_info[key]).hex()
csv_writer = csv.DictWriter(sys.stdout, file_info.keys())
csv_writer.writeheader()
csv_writer.writerow(file_info)
else:
logging.info("Updating database (TODO) ...")
else:
logging.info("Skipping update for deleted file " + filename)
def update_database(create_csv,
                    release_dic,
                    cdnjs_git_path,
                    files,
                    poolsize=16):
    """Process every entry of ``files`` with a pool of worker processes.

    Each file is handed to ``update_database_for_file`` together with
    ``create_csv``, ``release_dic`` and ``cdnjs_git_path``; ``poolsize``
    is the number of worker processes.
    """
    # Bind the arguments shared by all files once; the pool then only has
    # to supply the file name.
    process_file = partial(update_database_for_file, create_csv, release_dic,
                           cdnjs_git_path)
    with Pool(poolsize) as workers:
        workers.map(process_file, files)
def get_release_triple(git_path, libver):
@ -335,16 +360,21 @@ def build_release_date_dic(git_path, libvers, poolsize=16):
return release_date_dic
def pull_and_update_db(cdnjs_git_path, create_csv, poolsize=16):
    """Pull the repository and process the files reported as updated.

    ``create_csv`` and ``poolsize`` are forwarded to ``update_database``;
    ``poolsize`` is also used when building the release date dictionary.
    """
    updated_files, libvers = pull_get_updated_lib_files(cdnjs_git_path)
    release_dic = build_release_date_dic(cdnjs_git_path, libvers, poolsize)

    # libvers is not used past this point; release it and force a
    # collection before starting the update.
    del libvers
    gc.collect()

    update_database(create_csv, release_dic, cdnjs_git_path, updated_files,
                    poolsize)
def update_db_all_libs(cdnjs_git_path, taskid=1, maxtaskid=1, poolsize=16):
def update_db_all_libs(cdnjs_git_path,
create_csv,
taskid=1,
maxtaskid=1,
poolsize=16):
"""Update database entries for all libs in git repo."""
files, libvers = get_all_lib_files(cdnjs_git_path)
@ -366,4 +396,4 @@ def update_db_all_libs(cdnjs_git_path, taskid=1, maxtaskid=1, poolsize=16):
release_dic = build_release_date_dic(cdnjs_git_path, libvers, poolsize)
del libvers
gc.collect()
update_database(release_dic, cdnjs_git_path, files, poolsize)
update_database(create_csv, release_dic, cdnjs_git_path, files, poolsize)

View File

@ -32,15 +32,14 @@ assert sys.version_info >= (3, 4) and sys.version_info < (3, 6)
def helpmsg():
    """Print the command-line usage summary to stdout.

    Lists every short option accepted by the option parser, plus the long
    forms ``--taskid`` and ``--maxtaskid``.
    """
    usage_lines = (
        "cdnjs-git-miner [OPTION]",
        " -i initialize/update database with all libraries in the repository",
        " -u update: pull repository and update database",
        " -p n update n files in parallel",
        " -n, --taskid <TASKID> process chunk n where n in [1,N]",
        " -N, --maxtaskid <MAXTASKID> maximum task id N (upper bound for -n)",
        " -v verbose",
        " -c print csv format to stdout instead of writing to database",
        " -h print this help text",
    )
    for line in usage_lines:
        print(line)
def main(argv):
@ -52,9 +51,10 @@ def main(argv):
parallel_updates = 5
taskid = 1
maxtaskid = 1
csv = False
try:
opts, args = getopt.getopt(argv, "hviup:n:N:",
opts, args = getopt.getopt(argv, "hvicup:n:N:",
["taskid=", "maxtaskid="])
except getopt.GetoptError:
helpmsg()
@ -69,6 +69,8 @@ def main(argv):
initialize = True
elif opt == '-u':
update = True
elif opt == '-c':
csv = True
elif opt == "-p":
parallel_updates = int(arg)
elif opt in ("-n", "--taskid"):
@ -90,9 +92,9 @@ def main(argv):
cdnjs_git_path = os.path.join(os.path.join(basedir, "filedb"), "cdnjs-git")
if initialize:
update_db_all_libs(cdnjs_git_path, taskid, maxtaskid, parallel_updates)
update_db_all_libs(cdnjs_git_path, csv, taskid, maxtaskid, parallel_updates)
if update:
pull_and_update_db(cdnjs_git_path, parallel_updates)
pull_and_update_db(cdnjs_git_path, csv, parallel_updates)
if __name__ == "__main__":