Added database update for cdnjs.

Michael Herzberg 2017-10-10 15:35:02 +01:00
parent d1f8de19c1
commit 6632cd0ded
2 changed files with 70 additions and 7 deletions

View File

@@ -32,6 +32,8 @@ import dateutil.parser
import git
from ExtensionCrawler.file_identifiers import get_file_identifiers
from ExtensionCrawler.dbbackend.mysql_backend import MysqlBackend
import ExtensionCrawler.config as config

def get_add_date(git_path, filename):
@@ -74,7 +76,7 @@ def pull_list_changed_files(git_path):
def hackish_pull_list_changed_files(git_path):
"""Pull new updates from remote origin (hack, using git binary -
"""Pull new updates from remote origin (hack, using git binary -
    faster but not as safe as GitPython)."""
    git_repo = git.Repo(git_path)
    logging.info(" HEAD: " + str(git_repo.head.commit))
@@ -182,8 +184,8 @@ def get_all_lib_files(cdnjs_git_path, localpath=None):
    return files, list(libvers)

-def update_database_for_file(create_csv, release_dic, cdnjs_git_path,
-                             filename):
+def update_database_for_file(create_csv, release_dic, cdnjs_git_path, filename,
+                             con):
"""Update database for all file."""
if os.path.isfile(filename):
logging.info("Updating database for file " + filename)
@@ -207,11 +209,53 @@ def update_database_for_file(create_csv, release_dic, cdnjs_git_path,
            csv_writer.writeheader()
            csv_writer.writerow(file_info)
        else:
-            logging.info("Updating database (TODO) ...")
+            logging.info("Updating database ...")
            for prefix, typ in [("", "AS_IS"),
                                ("normalized_", "NORMALIZED"),
                                ("dec_", "DECOMPRESSED"),
                                ("dec_normalized_", "DECOMPRESSED_NORMALIZED")]:
                if file_info[prefix + "md5"] is not None:
                    con.insert(
                        "cdnjs",
                        md5=file_info[prefix + "md5"],
                        sha1=file_info[prefix + "sha1"],
                        sha256=file_info[prefix + "sha256"],
                        size=file_info[prefix + "size"],
                        loc=file_info[prefix + "loc"],
                        description=file_info[prefix + "description"],
                        encoding=file_info[prefix + "encoding"],
                        mimetype=file_info["mimetype"][0]
                        if "mimetype" in file_info else None,
                        mimetype_detail=file_info["mimetype"][1]
                        if "mimetype" in file_info else None,
                        path=file_info["path"],
                        filename=file_info["filename"],
                        add_date=file_info["add_date"],
                        library=file_info["library"],
                        version=file_info["version"],
                        typ=typ)
    else:
        logging.info("Skipping update for deleted file " + filename)

def update_database_for_file_chunked(create_csv, release_dic, cdnjs_git_path,
                                     filenames):
    """Update the database for a chunk of files over one shared connection."""
    with MysqlBackend(
            None,
            read_default_file=config.const_mysql_config_file(),
            charset='utf8mb4',
            compress=True) as con:
        for filename in filenames:
            update_database_for_file(create_csv, release_dic, cdnjs_git_path,
                                     filename, con)

def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]
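
A quick sanity check of the new chunks helper (hypothetical values):

    >>> list(chunks([1, 2, 3, 4, 5], 2))
    [[1, 2], [3, 4], [5]]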

def update_database(create_csv,
                    release_dic,
                    cdnjs_git_path,
@@ -220,8 +264,8 @@ def update_database(create_csv,
"""Update database for all files in files."""
with Pool(poolsize) as pool:
pool.map(
partial(update_database_for_file, create_csv, release_dic,
cdnjs_git_path), files)
partial(update_database_for_file_chunked, create_csv, release_dic,
cdnjs_git_path), chunks(list(files), 200))
def get_release_triple(git_path, libver):
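
The effect of this last hunk: instead of one pool task (and one database
connection) per file, each worker now handles a 200-file chunk over a single
connection. A minimal, self-contained sketch of the same Pool/partial/chunks
pattern, with a toy worker and toy data standing in for the crawler's real
code:

    from functools import partial
    from multiprocessing import Pool

    def chunks(l, n):
        """Yield successive n-sized chunks from l."""
        for i in range(0, len(l), n):
            yield l[i:i + n]

    def process_chunk(tag, filenames):
        # Stand-in for update_database_for_file_chunked: one shared
        # resource (here just a string tag) serves the whole chunk.
        return [tag + name for name in filenames]

    if __name__ == "__main__":
        files = ["a.js", "b.js", "c.js", "d.js", "e.js"]
        with Pool(2) as pool:
            result = pool.map(partial(process_chunk, "done:"),
                              chunks(files, 2))
        print(result)  # [['done:a.js', 'done:b.js'], ...]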

View File

@@ -46,7 +46,7 @@ def normalize_jsdata(str_data):
def get_data_identifiers(data):
"""Get basic data identifiers (size, hashes, normalized hashes, etc.)."""
magic_desc = ""
    try:
        magic_desc = magic.from_buffer(data)
    except magic.MagicException as exp:
        rgx = re.compile(r' name use count.*$')
@@ -72,11 +72,27 @@ def get_data_identifiers(data):
        normalized_data = None

    if normalized_data is None:
        data_identifier['normalized_encoding'] = None
        data_identifier['normalized_description'] = None
        data_identifier['normalized_size'] = None
        data_identifier['normalized_loc'] = None
        data_identifier['normalized_md5'] = None
        data_identifier['normalized_sha1'] = None
        data_identifier['normalized_sha256'] = None
    else:
        normalized_magic_desc = ""
        try:
            normalized_magic_desc = magic.from_buffer(normalized_data)
        except magic.MagicException as exp:
            rgx = re.compile(r' name use count.*$')
            msg = str(exp.message)
            if re.search(rgx, msg):
                normalized_magic_desc = re.sub(rgx, '', msg)
            else:
                raise exp
        data_identifier['normalized_encoding'] = chardet.detect(
            normalized_data)['encoding']
        data_identifier['normalized_description'] = normalized_magic_desc
        data_identifier['normalized_size'] = len(normalized_data)
        data_identifier['normalized_loc'] = normalized_loc
        data_identifier['normalized_md5'] = hashlib.md5(
            normalized_data).digest()
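
For intuition: the normalized_* fields mirror the plain identifiers but are
computed over a normalized form of the data (via normalize_jsdata), so
differently formatted copies of the same source can hash identically. A toy
illustration with a simplified normalizer (not the crawler's real one):

    import hashlib
    import re

    def toy_normalize(data):
        # Collapse all whitespace; the real normalization is more involved.
        return re.sub(rb'\s+', b'', data)

    a = b"function f(x) { return x + 1; }"
    b = b"function f(x){return x+1;}"
    assert (hashlib.md5(toy_normalize(a)).digest()
            == hashlib.md5(toy_normalize(b)).digest())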
@@ -128,6 +144,9 @@ def get_file_identifiers(path, data=None):
        'loc': data_identifier['loc'],
        'description': data_identifier['description'],
        'encoding': data_identifier['encoding'],
        'normalized_encoding': data_identifier['normalized_encoding'],
        'normalized_description': data_identifier['normalized_description'],
        'normalized_size': data_identifier['normalized_size'],
        'normalized_loc': data_identifier['normalized_loc'],
        'normalized_md5': data_identifier['normalized_md5'],
        'normalized_sha1': data_identifier['normalized_sha1'],