From 6632cd0ded1569dc35473f1ffa6bee6e8f7d4691 Mon Sep 17 00:00:00 2001
From: Michael Herzberg
Date: Tue, 10 Oct 2017 15:35:02 +0100
Subject: [PATCH] Added database update for cdnjs.

---
 ExtensionCrawler/cdnjs_git.py        | 56 +++++++++++++++++++++++++---
 ExtensionCrawler/file_identifiers.py | 21 ++++++++++-
 2 files changed, 70 insertions(+), 7 deletions(-)

diff --git a/ExtensionCrawler/cdnjs_git.py b/ExtensionCrawler/cdnjs_git.py
index c45772e..104ee59 100644
--- a/ExtensionCrawler/cdnjs_git.py
+++ b/ExtensionCrawler/cdnjs_git.py
@@ -32,6 +32,8 @@ import dateutil.parser
 import git
 
 from ExtensionCrawler.file_identifiers import get_file_identifiers
+from ExtensionCrawler.dbbackend.mysql_backend import MysqlBackend
+import ExtensionCrawler.config as config
 
 
 def get_add_date(git_path, filename):
@@ -74,7 +76,7 @@ def pull_list_changed_files(git_path):
 
 
 def hackish_pull_list_changed_files(git_path):
-    """Pull new updates from remote origin (hack, using git binary -
+    """Pull new updates from remote origin (hack, using git binary -
        faster but not as safe as GitPython)."""
     git_repo = git.Repo(git_path)
     logging.info(" HEAD: " + str(git_repo.head.commit))
@@ -182,8 +184,8 @@ def get_all_lib_files(cdnjs_git_path, localpath=None):
     return files, list(libvers)
 
 
-def update_database_for_file(create_csv, release_dic, cdnjs_git_path,
-                             filename):
+def update_database_for_file(create_csv, release_dic, cdnjs_git_path, filename,
+                             con):
     """Update database for all file."""
     if os.path.isfile(filename):
         logging.info("Updating database for file " + filename)
@@ -207,11 +209,53 @@ def update_database_for_file(create_csv, release_dic, cdnjs_git_path,
             csv_writer.writeheader()
             csv_writer.writerow(file_info)
         else:
-            logging.info("Updating database (TODO) ...")
+            logging.info("Updating database ...")
+            for prefix, typ in [("", "AS_IS"), ("normalized_",
+                                                "NORMALIZED"),
+                                ("dec_", "DECOMPRESSED"),
+                                ("dec_normalized_",
+                                 "DECOMPRESSED_NORMALIZED")]:
+                if file_info[prefix + "md5"] is not None:
+                    con.insert(
+                        "cdnjs",
+                        md5=file_info[prefix + "md5"],
+                        sha1=file_info[prefix + "sha1"],
+                        sha256=file_info[prefix + "sha256"],
+                        size=file_info[prefix + "size"],
+                        loc=file_info[prefix + "loc"],
+                        description=file_info[prefix + "description"],
+                        encoding=file_info[prefix + "encoding"],
+                        mimetype=file_info["mimetype"][0] if "mimetype" in file_info else None,
+                        mimetype_detail=file_info["mimetype"][1] if "mimetype" in file_info else None,
+                        path=file_info["path"],
+                        filename=file_info["filename"],
+                        add_date=file_info["add_date"],
+                        library=file_info["library"],
+                        version=file_info["version"],
+                        typ=typ)
     else:
         logging.info("Skipping update for deleted file " + filename)
 
 
+def update_database_for_file_chunked(create_csv, release_dic, cdnjs_git_path,
+                                     filenames):
+    with MysqlBackend(
+            None,
+            read_default_file=config.const_mysql_config_file(),
+            charset='utf8mb4',
+            compress=True) as con:
+        for filename in filenames:
+            update_database_for_file(create_csv, release_dic, cdnjs_git_path,
+                                     filename, con)
+
+
+def chunks(l, n):
+    """Yield successive n-sized chunks from l."""
+    for i in range(0, len(l), n):
+        yield l[i:i + n]
+
+
 def update_database(create_csv,
                     release_dic,
                     cdnjs_git_path,
@@ -220,8 +264,8 @@ def update_database(create_csv,
     """Update database for all files in files."""
     with Pool(poolsize) as pool:
         pool.map(
-            partial(update_database_for_file, create_csv, release_dic,
-                    cdnjs_git_path), files)
+            partial(update_database_for_file_chunked, create_csv, release_dic,
+                    cdnjs_git_path), chunks(list(files), 200))
 
 
 def get_release_triple(git_path, libver):
diff --git a/ExtensionCrawler/file_identifiers.py b/ExtensionCrawler/file_identifiers.py
index 85d39dd..708dda5 100644
--- a/ExtensionCrawler/file_identifiers.py
+++ b/ExtensionCrawler/file_identifiers.py
@@ -46,7 +46,7 @@ def normalize_jsdata(str_data):
 def get_data_identifiers(data):
     """Get basic data identifiers (size, hashes, normalized hashes, etc.)."""
     magic_desc = ""
-    try:
+    try:
         magic_desc = magic.from_buffer(data)
     except magic.MagicException as exp:
         rgx = re.compile(r' name use count.*$')
@@ -72,11 +72,27 @@ def get_data_identifiers(data):
             normalized_data = None
 
     if normalized_data is None:
+        data_identifier['normalized_encoding'] = None
+        data_identifier['normalized_description'] = None
+        data_identifier['normalized_size'] = None
         data_identifier['normalized_loc'] = None
         data_identifier['normalized_md5'] = None
         data_identifier['normalized_sha1'] = None
         data_identifier['normalized_sha256'] = None
     else:
+        normalized_magic_desc = ""
+        try:
+            normalized_magic_desc = magic.from_buffer(normalized_data)
+        except magic.MagicException as exp:
+            rgx = re.compile(r' name use count.*$')
+            msg = str(exp.message)
+            if re.search(rgx, msg):
+                normalized_magic_desc = re.sub(rgx, '', msg)
+            else:
+                raise exp
+        data_identifier['normalized_encoding'] = chardet.detect(normalized_data)['encoding']
+        data_identifier['normalized_description'] = normalized_magic_desc
+        data_identifier['normalized_size'] = len(normalized_data)
         data_identifier['normalized_loc'] = normalized_loc
         data_identifier['normalized_md5'] = hashlib.md5(
            normalized_data).digest()
@@ -128,6 +144,9 @@ def get_file_identifiers(path, data=None):
         'loc': data_identifier['loc'],
         'description': data_identifier['description'],
         'encoding': data_identifier['encoding'],
+        'normalized_encoding': data_identifier['normalized_encoding'],
+        'normalized_description': data_identifier['normalized_description'],
+        'normalized_size': data_identifier['normalized_size'],
         'normalized_loc': data_identifier['normalized_loc'],
         'normalized_md5': data_identifier['normalized_md5'],
         'normalized_sha1': data_identifier['normalized_sha1'],
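
Note on the cdnjs_git.py change: update_database() no longer dispatches one worker
invocation per file; it splits the file list into chunks of 200 and opens one
MysqlBackend connection per chunk, which every file in that chunk reuses. A minimal,
self-contained sketch of that dispatch pattern follows. The per-chunk "connection" is
only a stand-in list, and process_chunk is a hypothetical name standing in for
update_database_for_file_chunked; this is not the project's actual database code.

    from functools import partial
    from multiprocessing import Pool


    def chunks(l, n):
        """Yield successive n-sized chunks from l."""
        for i in range(0, len(l), n):
            yield l[i:i + n]


    def process_chunk(label, filenames):
        # Hypothetical stand-in for update_database_for_file_chunked: one
        # "connection" (here just a list collecting rows) is created per chunk
        # and reused for every file in that chunk.
        rows = []
        for filename in filenames:
            rows.append((label, filename))
        return len(rows)


    if __name__ == "__main__":
        files = ["lib/file%d.js" % i for i in range(1000)]
        with Pool(4) as pool:
            counts = pool.map(partial(process_chunk, "cdnjs"), chunks(files, 200))
        print(counts)  # five chunks of 200 files each -> [200, 200, 200, 200, 200]

Opening the connection per chunk rather than per file keeps the number of MySQL
connections bounded by the pool size while still letting the pool balance work.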
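Note on the file_identifiers.py change: the normalized variant of each file now gets
its own encoding, libmagic description, and size recorded next to the existing
normalized hashes. The sketch below shows how such fields can be derived; the
whitespace-collapsing normalization is only an assumption standing in for the module's
normalize_jsdata(), and it uses the chardet and python-magic packages the module
already imports.

    import hashlib

    import chardet  # third-party, already used by file_identifiers.py
    import magic    # python-magic, already used by file_identifiers.py


    def normalized_identifiers(data):
        """Compute normalized_* fields for a byte string."""
        # Stand-in normalization: collapse all whitespace. The real
        # normalize_jsdata() may do more than this.
        normalized = b"".join(data.split())
        return {
            'normalized_encoding': chardet.detect(normalized)['encoding'],
            'normalized_description': magic.from_buffer(normalized),
            'normalized_size': len(normalized),
            'normalized_md5': hashlib.md5(normalized).digest(),
            'normalized_sha1': hashlib.sha1(normalized).digest(),
            'normalized_sha256': hashlib.sha256(normalized).digest(),
        }


    if __name__ == "__main__":
        print(normalized_identifiers(b"var x = 1;\nvar y  =  2;\n"))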