Added database update for cdnjs.
parent d1f8de19c1
commit 6632cd0ded
ExtensionCrawler/cdnjs_git.py

@@ -32,6 +32,8 @@ import dateutil.parser
 import git
 from ExtensionCrawler.file_identifiers import get_file_identifiers
+from ExtensionCrawler.dbbackend.mysql_backend import MysqlBackend
+import ExtensionCrawler.config as config


 def get_add_date(git_path, filename):
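The two new imports bring in the project's MySQL layer: MysqlBackend, used below as a context manager, and the config module that locates the MySQL option file. The backend's internals are not part of this diff; as a rough sketch, the connection options passed through it correspond to a direct MySQLdb call like the following (the ~/.my.cnf path is an assumption; in the commit, config.const_mysql_config_file() supplies it):

import MySQLdb

# Read host/user/password from a MySQL option file, use full UTF-8,
# and compress client/server traffic -- the same keyword arguments the
# commit hands to MysqlBackend.
con = MySQLdb.connect(
    read_default_file="~/.my.cnf",  # assumed location of the option file
    charset="utf8mb4",
    compress=True)
cur = con.cursor()
cur.execute("SELECT VERSION()")
print(cur.fetchone())
cur.close()
con.close()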
@@ -74,7 +76,7 @@ def pull_list_changed_files(git_path):


 def hackish_pull_list_changed_files(git_path):
-    """Pull new updates from remote origin (hack, using git binary -
+    """Pull new updates from remote origin (hack, using git binary -
        faster but not as safe as GitPython)."""
     git_repo = git.Repo(git_path)
     logging.info(" HEAD: " + str(git_repo.head.commit))
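Only the docstring of hackish_pull_list_changed_files is touched here; per that docstring, the function shells out to the git binary for speed instead of going through GitPython. A minimal sketch of that general technique, not the commit's exact body (the function name, repo-path handling, and the HEAD@{1}..HEAD range are assumptions):

import subprocess

def pull_and_list_changed_files(git_path):
    """Pull via the git binary and list the files the pull changed."""
    subprocess.run(["git", "pull"], cwd=git_path, check=True)
    # HEAD@{1} is where HEAD pointed before the pull (reflog entry).
    out = subprocess.run(
        ["git", "diff", "--name-only", "HEAD@{1}", "HEAD"],
        cwd=git_path, check=True, capture_output=True, text=True)
    return out.stdout.splitlines()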
@@ -182,8 +184,8 @@ def get_all_lib_files(cdnjs_git_path, localpath=None):
     return files, list(libvers)


-def update_database_for_file(create_csv, release_dic, cdnjs_git_path,
-                             filename):
+def update_database_for_file(create_csv, release_dic, cdnjs_git_path, filename,
+                             con):
     """Update database for a single file."""
     if os.path.isfile(filename):
         logging.info("Updating database for file " + filename)
@@ -207,11 +209,53 @@ def update_database_for_file(create_csv, release_dic, cdnjs_git_path,
             csv_writer.writeheader()
             csv_writer.writerow(file_info)
         else:
-            logging.info("Updating database (TODO) ...")
+            logging.info("Updating database ...")
+            for prefix, typ in [("", "AS_IS"), ("normalized_",
+                                                "NORMALIZED"),
+                                ("dec_", "DECOMPRESSED"),
+                                ("dec_normalized_",
+                                 "DECOMPRESSED_NORMALIZED")]:
+                if file_info[prefix + "md5"] is not None:
+                    con.insert(
+                        "cdnjs",
+                        md5=file_info[prefix + "md5"],
+                        sha1=file_info[prefix + "sha1"],
+                        sha256=file_info[prefix + "sha256"],
+                        size=file_info[prefix + "size"],
+                        loc=file_info[prefix + "loc"],
+                        description=file_info[prefix + "description"],
+                        encoding=file_info[prefix + "encoding"],
+                        mimetype=file_info["mimetype"][0] if "mimetype" in file_info else None,
+                        mimetype_detail=file_info["mimetype"][1] if "mimetype" in file_info else None,
+                        path=file_info["path"],
+                        filename=file_info["filename"],
+                        add_date=file_info["add_date"],
+                        library=file_info["library"],
+                        version=file_info["version"],
+                        typ=typ)
+
     else:
         logging.info("Skipping update for deleted file " + filename)


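The replaced TODO is the heart of the commit: each file now yields up to four rows in the cdnjs table, one per representation (as-is, normalized, decompressed, decompressed+normalized), and a variant is skipped when its hash is absent. A self-contained sketch of the same fan-out, using sqlite3 in place of the project's MysqlBackend and a table cut down to three columns:

import sqlite3

con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE cdnjs (md5 BLOB, size INTEGER, typ TEXT)")

# A trimmed file_info as produced by get_file_identifiers; only the raw
# and normalized variants exist here, so the dec_* rows are skipped.
file_info = {
    "md5": b"\x01" * 16, "size": 120,
    "normalized_md5": b"\x02" * 16, "normalized_size": 100,
    "dec_md5": None, "dec_size": None,
    "dec_normalized_md5": None, "dec_normalized_size": None,
}

for prefix, typ in [("", "AS_IS"), ("normalized_", "NORMALIZED"),
                    ("dec_", "DECOMPRESSED"),
                    ("dec_normalized_", "DECOMPRESSED_NORMALIZED")]:
    if file_info[prefix + "md5"] is not None:
        con.execute("INSERT INTO cdnjs VALUES (?, ?, ?)",
                    (file_info[prefix + "md5"], file_info[prefix + "size"], typ))

print(con.execute("SELECT typ, size FROM cdnjs").fetchall())
# -> [('AS_IS', 120), ('NORMALIZED', 100)]

The same hunk continues below with the new per-chunk driver, which opens one MysqlBackend connection and reuses it for every file in the chunk.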
+def update_database_for_file_chunked(create_csv, release_dic, cdnjs_git_path,
+                                     filenames):
+    with MysqlBackend(
+            None,
+            read_default_file=config.const_mysql_config_file(),
+            charset='utf8mb4',
+            compress=True) as con:
+        for filename in filenames:
+            update_database_for_file(create_csv, release_dic, cdnjs_git_path, filename,
+                                     con)
+
+
+def chunks(l, n):
+    """Yield successive n-sized chunks from l."""
+    for i in range(0, len(l), n):
+        yield l[i:i + n]
+
+
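chunks is plain slicing, so it needs a sequence that supports len(), which is why update_database wraps files in list() in the next hunk. For example:

>>> list(chunks(list(range(7)), 3))
[[0, 1, 2], [3, 4, 5], [6]]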
 def update_database(create_csv,
                     release_dic,
                     cdnjs_git_path,
@@ -220,8 +264,8 @@ def update_database(create_csv,
     """Update database for all files in files."""
     with Pool(poolsize) as pool:
         pool.map(
-            partial(update_database_for_file, create_csv, release_dic,
-                    cdnjs_git_path), files)
+            partial(update_database_for_file_chunked, create_csv, release_dic,
+                    cdnjs_git_path), chunks(list(files), 200))


 def get_release_triple(git_path, libver):
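Previously every file was its own pool task with its own database setup cost; now each task is a 200-file chunk, so the connection cost is amortized 200-fold. A minimal self-contained sketch of the partial + Pool.map + chunking pattern (process_chunk stands in for update_database_for_file_chunked; the chunk size of 3 is just for display):

from functools import partial
from multiprocessing import Pool

def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]

def process_chunk(factor, chunk):
    # Per-chunk setup (e.g. opening a DB connection) would go here,
    # amortized over all items in the chunk.
    return [factor * x for x in chunk]

if __name__ == "__main__":
    with Pool(4) as pool:
        result = pool.map(partial(process_chunk, 10),
                          chunks(list(range(10)), 3))
    print(result)  # [[0, 10, 20], [30, 40, 50], [60, 70, 80], [90]]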
ExtensionCrawler/file_identifiers.py
@@ -46,7 +46,7 @@ def normalize_jsdata(str_data):
 def get_data_identifiers(data):
     """Get basic data identifiers (size, hashes, normalized hashes, etc.)."""
     magic_desc = ""
-    try:
+    try:
         magic_desc = magic.from_buffer(data)
     except magic.MagicException as exp:
         rgx = re.compile(r' name use count.*$')
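The except branch here works around libmagic builds that raise MagicException even though the message still contains a usable description followed by a volatile ' name use count ...' suffix; the regex keeps the description and drops the suffix. The same recovery logic, factored into a standalone helper (assuming the python-magic package, whose MagicException exposes the raw message):

import re
import magic

def safe_magic_desc(data):
    """Return a libmagic description, salvaging it from error messages."""
    rgx = re.compile(r' name use count.*$')
    try:
        return magic.from_buffer(data)
    except magic.MagicException as exp:
        msg = str(exp.message)
        if re.search(rgx, msg):
            # Message is "<description> name use count ..."; keep the description.
            return re.sub(rgx, '', msg)
        raise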
@@ -72,11 +72,27 @@ def get_data_identifiers(data):
         normalized_data = None

+    if normalized_data is None:
+        data_identifier['normalized_encoding'] = None
+        data_identifier['normalized_description'] = None
+        data_identifier['normalized_size'] = None
+        data_identifier['normalized_loc'] = None
+        data_identifier['normalized_md5'] = None
+        data_identifier['normalized_sha1'] = None
+        data_identifier['normalized_sha256'] = None
+    else:
+        normalized_magic_desc = ""
         try:
             normalized_magic_desc = magic.from_buffer(normalized_data)
         except magic.MagicException as exp:
             rgx = re.compile(r' name use count.*$')
             msg = str(exp.message)
             if re.search(rgx, msg):
                 magic_desc = re.sub(rgx, '', msg)
             else:
                 raise exp
+        data_identifier['normalized_encoding'] = chardet.detect(normalized_data)['encoding']
+        data_identifier['normalized_description'] = normalized_magic_desc
+        data_identifier['normalized_size'] = len(normalized_data)
+        data_identifier['normalized_loc'] = normalized_loc
+        data_identifier['normalized_md5'] = hashlib.md5(
+            normalized_data).digest()
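With this hunk, the normalized variant gets the same identifier set as the raw bytes (chardet encoding guess, libmagic description, size, line count, and cryptographic digests), or all None when normalization failed. A self-contained sketch of that computation (the whitespace-collapsing normalize below is a crude stand-in for the module's normalize_jsdata, and the libmagic description is omitted):

import hashlib
import chardet

def normalize(data):
    # Crude stand-in: collapse runs of whitespace. The real module uses
    # normalize_jsdata for JavaScript-aware normalization.
    return b" ".join(data.split())

def normalized_identifiers(data):
    norm = normalize(data)
    return {
        'normalized_encoding': chardet.detect(norm)['encoding'],
        'normalized_size': len(norm),
        'normalized_loc': len(norm.splitlines()),
        'normalized_md5': hashlib.md5(norm).digest(),
        'normalized_sha1': hashlib.sha1(norm).digest(),
        'normalized_sha256': hashlib.sha256(norm).digest(),
    }

print(normalized_identifiers(b"var x = 1;\nvar y = 2;\n"))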
@@ -128,6 +144,9 @@ def get_file_identifiers(path, data=None):
         'loc': data_identifier['loc'],
         'description': data_identifier['description'],
         'encoding': data_identifier['encoding'],
+        'normalized_encoding': data_identifier['normalized_encoding'],
+        'normalized_description': data_identifier['normalized_description'],
+        'normalized_size': data_identifier['normalized_size'],
         'normalized_loc': data_identifier['normalized_loc'],
         'normalized_md5': data_identifier['normalized_md5'],
         'normalized_sha1': data_identifier['normalized_sha1'],