Added database update for cdnjs.

Michael Herzberg 2017-10-10 15:35:02 +01:00
parent d1f8de19c1
commit 6632cd0ded
2 changed files with 70 additions and 7 deletions

View File

@@ -32,6 +32,8 @@ import dateutil.parser
import git
from ExtensionCrawler.file_identifiers import get_file_identifiers
from ExtensionCrawler.dbbackend.mysql_backend import MysqlBackend
import ExtensionCrawler.config as config

def get_add_date(git_path, filename):
@@ -74,7 +76,7 @@ def pull_list_changed_files(git_path):
def hackish_pull_list_changed_files(git_path):
"""Pull new updates from remote origin (hack, using git binary -
"""Pull new updates from remote origin (hack, using git binary -
    faster but not as safe as GitPython)."""
    git_repo = git.Repo(git_path)
    logging.info(" HEAD: " + str(git_repo.head.commit))
@@ -182,8 +184,8 @@ def get_all_lib_files(cdnjs_git_path, localpath=None):
    return files, list(libvers)

-def update_database_for_file(create_csv, release_dic, cdnjs_git_path,
-                             filename):
+def update_database_for_file(create_csv, release_dic, cdnjs_git_path, filename,
+                             con):
"""Update database for all file."""
if os.path.isfile(filename):
logging.info("Updating database for file " + filename)
@@ -207,11 +209,53 @@ def update_database_for_file(create_csv, release_dic, cdnjs_git_path,
            csv_writer.writeheader()
            csv_writer.writerow(file_info)
        else:
-            logging.info("Updating database (TODO) ...")
+            logging.info("Updating database ...")
            for prefix, typ in [("", "AS_IS"),
                                ("normalized_", "NORMALIZED"),
                                ("dec_", "DECOMPRESSED"),
                                ("dec_normalized_", "DECOMPRESSED_NORMALIZED")]:
                if file_info[prefix + "md5"] is not None:
                    con.insert(
                        "cdnjs",
                        md5=file_info[prefix + "md5"],
                        sha1=file_info[prefix + "sha1"],
                        sha256=file_info[prefix + "sha256"],
                        size=file_info[prefix + "size"],
                        loc=file_info[prefix + "loc"],
                        description=file_info[prefix + "description"],
                        encoding=file_info[prefix + "encoding"],
                        mimetype=file_info["mimetype"][0]
                        if "mimetype" in file_info else None,
                        mimetype_detail=file_info["mimetype"][1]
                        if "mimetype" in file_info else None,
                        path=file_info["path"],
                        filename=file_info["filename"],
                        add_date=file_info["add_date"],
                        library=file_info["library"],
                        version=file_info["version"],
                        typ=typ)
    else:
        logging.info("Skipping update for deleted file " + filename)

def update_database_for_file_chunked(create_csv, release_dic, cdnjs_git_path,
                                     filenames):
    """Update the database for a chunk of files over one shared connection."""
    with MysqlBackend(
            None,
            read_default_file=config.const_mysql_config_file(),
            charset='utf8mb4',
            compress=True) as con:
        for filename in filenames:
            update_database_for_file(create_csv, release_dic, cdnjs_git_path,
                                     filename, con)

def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]
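
A quick sanity check of the new chunks helper (hypothetical values):

    >>> list(chunks([1, 2, 3, 4, 5], 2))
    [[1, 2], [3, 4], [5]]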

def update_database(create_csv,
                    release_dic,
                    cdnjs_git_path,
@@ -220,8 +264,8 @@ def update_database(create_csv,
"""Update database for all files in files."""
with Pool(poolsize) as pool:
pool.map(
partial(update_database_for_file, create_csv, release_dic,
cdnjs_git_path), files)
partial(update_database_for_file_chunked, create_csv, release_dic,
cdnjs_git_path), chunks(list(files), 200))
def get_release_triple(git_path, libver):
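
The effect of this last hunk: instead of one pool task (and one database
connection) per file, each worker now handles a 200-file chunk over a single
connection. A minimal, self-contained sketch of the same Pool/partial/chunks
pattern, with a toy worker and toy data standing in for the crawler's real
code:

    from functools import partial
    from multiprocessing import Pool

    def chunks(l, n):
        """Yield successive n-sized chunks from l."""
        for i in range(0, len(l), n):
            yield l[i:i + n]

    def process_chunk(tag, filenames):
        # Stand-in for update_database_for_file_chunked: one shared
        # resource (here just a string tag) serves the whole chunk.
        return [tag + name for name in filenames]

    if __name__ == "__main__":
        files = ["a.js", "b.js", "c.js", "d.js", "e.js"]
        with Pool(2) as pool:
            result = pool.map(partial(process_chunk, "done:"),
                              chunks(files, 2))
        print(result)  # [['done:a.js', 'done:b.js'], ...]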

View File

@@ -46,7 +46,7 @@ def normalize_jsdata(str_data):
def get_data_identifiers(data):
"""Get basic data identifiers (size, hashes, normalized hashes, etc.)."""
magic_desc = ""
    try:
        magic_desc = magic.from_buffer(data)
    except magic.MagicException as exp:
        rgx = re.compile(r' name use count.*$')
@@ -72,11 +72,27 @@ def get_data_identifiers(data):
        normalized_data = None

    if normalized_data is None:
        data_identifier['normalized_encoding'] = None
        data_identifier['normalized_description'] = None
        data_identifier['normalized_size'] = None
        data_identifier['normalized_loc'] = None
        data_identifier['normalized_md5'] = None
        data_identifier['normalized_sha1'] = None
        data_identifier['normalized_sha256'] = None
    else:
        normalized_magic_desc = ""
        try:
            normalized_magic_desc = magic.from_buffer(normalized_data)
        except magic.MagicException as exp:
            rgx = re.compile(r' name use count.*$')
            msg = str(exp.message)
            if re.search(rgx, msg):
                normalized_magic_desc = re.sub(rgx, '', msg)
            else:
                raise exp
        data_identifier['normalized_encoding'] = chardet.detect(
            normalized_data)['encoding']
        data_identifier['normalized_description'] = normalized_magic_desc
        data_identifier['normalized_size'] = len(normalized_data)
        data_identifier['normalized_loc'] = normalized_loc
        data_identifier['normalized_md5'] = hashlib.md5(
            normalized_data).digest()
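
For intuition: the normalized_* fields mirror the plain identifiers but are
computed over a normalized form of the data (via normalize_jsdata), so
differently formatted copies of the same source can hash identically. A toy
illustration with a simplified normalizer (not the crawler's real one):

    import hashlib
    import re

    def toy_normalize(data):
        # Collapse all whitespace; the real normalization is more involved.
        return re.sub(rb'\s+', b'', data)

    a = b"function f(x) { return x + 1; }"
    b = b"function f(x){return x+1;}"
    assert (hashlib.md5(toy_normalize(a)).digest()
            == hashlib.md5(toy_normalize(b)).digest())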
@@ -128,6 +144,9 @@ def get_file_identifiers(path, data=None):
        'loc': data_identifier['loc'],
        'description': data_identifier['description'],
        'encoding': data_identifier['encoding'],
        'normalized_encoding': data_identifier['normalized_encoding'],
        'normalized_description': data_identifier['normalized_description'],
        'normalized_size': data_identifier['normalized_size'],
        'normalized_loc': data_identifier['normalized_loc'],
        'normalized_md5': data_identifier['normalized_md5'],
        'normalized_sha1': data_identifier['normalized_sha1'],