From 6632cd0ded1569dc35473f1ffa6bee6e8f7d4691 Mon Sep 17 00:00:00 2001
From: Michael Herzberg
Date: Tue, 10 Oct 2017 15:35:02 +0100
Subject: [PATCH] Added database update for cdnjs.

---
 ExtensionCrawler/cdnjs_git.py        | 56 +++++++++++++++++++++++++---
 ExtensionCrawler/file_identifiers.py | 21 ++++++++++-
 2 files changed, 70 insertions(+), 7 deletions(-)

diff --git a/ExtensionCrawler/cdnjs_git.py b/ExtensionCrawler/cdnjs_git.py
index c45772e..104ee59 100644
--- a/ExtensionCrawler/cdnjs_git.py
+++ b/ExtensionCrawler/cdnjs_git.py
@@ -32,6 +32,8 @@ import dateutil.parser
 import git
 
 from ExtensionCrawler.file_identifiers import get_file_identifiers
+from ExtensionCrawler.dbbackend.mysql_backend import MysqlBackend
+import ExtensionCrawler.config as config
 
 
 def get_add_date(git_path, filename):
@@ -74,7 +76,7 @@ def pull_list_changed_files(git_path):
 
 
 def hackish_pull_list_changed_files(git_path):
-    """Pull new updates from remote origin (hack, using git binary -
+    """Pull new updates from remote origin (hack, using git binary -
        faster but not as safe as GitPython)."""
     git_repo = git.Repo(git_path)
     logging.info(" HEAD: " + str(git_repo.head.commit))
@@ -182,8 +184,8 @@ def get_all_lib_files(cdnjs_git_path, localpath=None):
     return files, list(libvers)
 
 
-def update_database_for_file(create_csv, release_dic, cdnjs_git_path,
-                             filename):
+def update_database_for_file(create_csv, release_dic, cdnjs_git_path, filename,
+                             con):
     """Update database for all file."""
     if os.path.isfile(filename):
         logging.info("Updating database for file " + filename)
@@ -207,11 +209,53 @@ def update_database_for_file(create_csv, release_dic, cdnjs_git_path,
             csv_writer.writeheader()
             csv_writer.writerow(file_info)
         else:
-            logging.info("Updating database (TODO) ...")
+            logging.info("Updating database ...")
+            for prefix, typ in [("", "AS_IS"), ("normalized_",
+                                                "NORMALIZED"),
+                                ("dec_", "DECOMPRESSED"),
+                                ("dec_normalized_",
+                                 "DECOMPRESSED_NORMALIZED")]:
+                if file_info[prefix + "md5"] is not None:
+                    con.insert(
+                        "cdnjs",
+                        md5=file_info[prefix + "md5"],
+                        sha1=file_info[prefix + "sha1"],
+                        sha256=file_info[prefix + "sha256"],
+                        size=file_info[prefix + "size"],
+                        loc=file_info[prefix + "loc"],
+                        description=file_info[prefix + "description"],
+                        encoding=file_info[prefix + "encoding"],
+                        mimetype=file_info["mimetype"][0] if "mimetype" in file_info else None,
+                        mimetype_detail=file_info["mimetype"][1] if "mimetype" in file_info else None,
+                        path=file_info["path"],
+                        filename=file_info["filename"],
+                        add_date=file_info["add_date"],
+                        library=file_info["library"],
+                        version=file_info["version"],
+                        typ=typ)
     else:
         logging.info("Skipping update for deleted file " + filename)
 
 
+def update_database_for_file_chunked(create_csv, release_dic, cdnjs_git_path,
+                                     filenames):
+    with MysqlBackend(
+            None,
+            read_default_file=config.const_mysql_config_file(),
+            charset='utf8mb4',
+            compress=True) as con:
+        for filename in filenames:
+            update_database_for_file(create_csv, release_dic, cdnjs_git_path,
+                                     filename, con)
+
+
+def chunks(l, n):
+    """Yield successive n-sized chunks from l."""
+    for i in range(0, len(l), n):
+        yield l[i:i + n]
+
+
 def update_database(create_csv,
                     release_dic,
                     cdnjs_git_path,
@@ -220,8 +264,8 @@ def update_database(create_csv,
     """Update database for all files in files."""
     with Pool(poolsize) as pool:
         pool.map(
-            partial(update_database_for_file, create_csv, release_dic,
-                    cdnjs_git_path), files)
+            partial(update_database_for_file_chunked, create_csv, release_dic,
+                    cdnjs_git_path), chunks(list(files), 200))
 
 
 def get_release_triple(git_path, libver):
diff --git a/ExtensionCrawler/file_identifiers.py b/ExtensionCrawler/file_identifiers.py
index 85d39dd..708dda5 100644
--- a/ExtensionCrawler/file_identifiers.py
+++ b/ExtensionCrawler/file_identifiers.py
@@ -46,7 +46,7 @@ def normalize_jsdata(str_data):
 def get_data_identifiers(data):
     """Get basic data identifiers (size, hashes, normalized hashes, etc.)."""
     magic_desc = ""
-    try:
+    try:
         magic_desc = magic.from_buffer(data)
     except magic.MagicException as exp:
         rgx = re.compile(r' name use count.*$')
@@ -72,11 +72,27 @@ def get_data_identifiers(data):
             normalized_data = None
 
     if normalized_data is None:
+        data_identifier['normalized_encoding'] = None
+        data_identifier['normalized_description'] = None
+        data_identifier['normalized_size'] = None
         data_identifier['normalized_loc'] = None
         data_identifier['normalized_md5'] = None
         data_identifier['normalized_sha1'] = None
         data_identifier['normalized_sha256'] = None
     else:
+        normalized_magic_desc = ""
+        try:
+            normalized_magic_desc = magic.from_buffer(normalized_data)
+        except magic.MagicException as exp:
+            rgx = re.compile(r' name use count.*$')
+            msg = str(exp.message)
+            if re.search(rgx, msg):
+                normalized_magic_desc = re.sub(rgx, '', msg)
+            else:
+                raise exp
+        data_identifier['normalized_encoding'] = chardet.detect(normalized_data)['encoding']
+        data_identifier['normalized_description'] = normalized_magic_desc
+        data_identifier['normalized_size'] = len(normalized_data)
         data_identifier['normalized_loc'] = normalized_loc
         data_identifier['normalized_md5'] = hashlib.md5(
            normalized_data).digest()
@@ -128,6 +144,9 @@ def get_file_identifiers(path, data=None):
         'loc': data_identifier['loc'],
         'description': data_identifier['description'],
         'encoding': data_identifier['encoding'],
+        'normalized_encoding': data_identifier['normalized_encoding'],
+        'normalized_description': data_identifier['normalized_description'],
+        'normalized_size': data_identifier['normalized_size'],
         'normalized_loc': data_identifier['normalized_loc'],
         'normalized_md5': data_identifier['normalized_md5'],
         'normalized_sha1': data_identifier['normalized_sha1'],
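
Note on the cdnjs_git.py change: update_database() no longer dispatches one worker
invocation per file; it splits the file list into chunks of 200 and opens one
MysqlBackend connection per chunk, which every file in that chunk reuses. A minimal,
self-contained sketch of that dispatch pattern follows. The per-chunk "connection" is
only a stand-in list, and process_chunk is a hypothetical name standing in for
update_database_for_file_chunked; this is not the project's actual database code.

    from functools import partial
    from multiprocessing import Pool


    def chunks(l, n):
        """Yield successive n-sized chunks from l."""
        for i in range(0, len(l), n):
            yield l[i:i + n]


    def process_chunk(label, filenames):
        # Hypothetical stand-in for update_database_for_file_chunked: one
        # "connection" (here just a list collecting rows) is created per chunk
        # and reused for every file in that chunk.
        rows = []
        for filename in filenames:
            rows.append((label, filename))
        return len(rows)


    if __name__ == "__main__":
        files = ["lib/file%d.js" % i for i in range(1000)]
        with Pool(4) as pool:
            counts = pool.map(partial(process_chunk, "cdnjs"), chunks(files, 200))
        print(counts)  # five chunks of 200 files each -> [200, 200, 200, 200, 200]

Opening the connection per chunk rather than per file keeps the number of MySQL
connections bounded by the pool size while still letting the pool balance work.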
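Note on the file_identifiers.py change: the normalized variant of each file now gets
its own encoding, libmagic description, and size recorded next to the existing
normalized hashes. The sketch below shows how such fields can be derived; the
whitespace-collapsing normalization is only an assumption standing in for the module's
normalize_jsdata(), and it uses the chardet and python-magic packages the module
already imports.

    import hashlib

    import chardet  # third-party, already used by file_identifiers.py
    import magic    # python-magic, already used by file_identifiers.py


    def normalized_identifiers(data):
        """Compute normalized_* fields for a byte string."""
        # Stand-in normalization: collapse all whitespace. The real
        # normalize_jsdata() may do more than this.
        normalized = b"".join(data.split())
        return {
            'normalized_encoding': chardet.detect(normalized)['encoding'],
            'normalized_description': magic.from_buffer(normalized),
            'normalized_size': len(normalized),
            'normalized_md5': hashlib.md5(normalized).digest(),
            'normalized_sha1': hashlib.sha1(normalized).digest(),
            'normalized_sha256': hashlib.sha256(normalized).digest(),
        }


    if __name__ == "__main__":
        print(normalized_identifiers(b"var x = 1;\nvar y  =  2;\n"))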